[llvm] 3176f25 - [IA][AArch64]: Construct (de)interleave4 out of (de)interleave2 (#89276)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 12 09:23:04 PDT 2024
Author: Hassnaa Hamdi
Date: 2024-08-12T17:23:00+01:00
New Revision: 3176f255c9dd43e8bacad0f9e56cf4f9f8816009
URL: https://github.com/llvm/llvm-project/commit/3176f255c9dd43e8bacad0f9e56cf4f9f8816009
DIFF: https://github.com/llvm/llvm-project/commit/3176f255c9dd43e8bacad0f9e56cf4f9f8816009.diff
LOG: [IA][AArch64]: Construct (de)interleave4 out of (de)interleave2 (#89276)
- [AArch64]: TargetLowering is updated to spot load/store (de)interleave4-like sequences using PatternMatch,
and emit the equivalent sve.ld4 and sve.st4 intrinsics.
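For illustration, a "(de)interleave4-like sequence" here means a tree of
vector.deinterleave2 calls: the loaded vector is deinterleaved once, and each
half is deinterleaved again. A minimal IR sketch, adapted from the added
sve-deinterleave4.ll test (value names are illustrative):

  %load = load <vscale x 16 x i32>, ptr %src, align 4
  %dst = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
  %evens = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %dst, 0
  %odds = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %dst, 1
  %half1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %evens)
  %half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %odds)
  ; with this patch, the whole tree is lowered to a single
  ; call { ... } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> %pred, ptr %src)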
Added:
llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
Modified:
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/include/llvm/IR/PatternMatch.h
llvm/lib/CodeGen/InterleavedAccessPass.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.h
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.h
llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 3a4e081831e315..deb1d04df3400c 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3156,8 +3156,11 @@ class TargetLoweringBase {
///
/// \p DI is the deinterleave intrinsic.
/// \p LI is the accompanying load instruction
- virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- LoadInst *LI) const {
+ /// \p DeadInsts is a reference to a vector that keeps track of dead
+ /// instructions during transformations.
+ virtual bool lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *DI, LoadInst *LI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const {
return false;
}
@@ -3167,8 +3170,11 @@ class TargetLoweringBase {
///
/// \p II is the interleave intrinsic.
/// \p SI is the accompanying store instruction
- virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI) const {
+ /// \p DeadInsts is a reference to a vector that keeps track of dead
+ /// instructions during transformations.
+ virtual bool lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, StoreInst *SI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const {
return false;
}
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 9c70a60bf50c77..8c6b7895470b8d 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2940,6 +2940,17 @@ inline VScaleVal_match m_VScale() {
return VScaleVal_match();
}
+template <typename Opnd0, typename Opnd1>
+inline typename m_Intrinsic_Ty<Opnd0, Opnd1>::Ty
+m_Interleave2(const Opnd0 &Op0, const Opnd1 &Op1) {
+ return m_Intrinsic<Intrinsic::vector_interleave2>(Op0, Op1);
+}
+
+template <typename Opnd>
+inline typename m_Intrinsic_Ty<Opnd>::Ty m_Deinterleave2(const Opnd &Op) {
+ return m_Intrinsic<Intrinsic::vector_deinterleave2>(Op);
+}
+
template <typename LHS, typename RHS, unsigned Opcode, bool Commutable = false>
struct LogicalOp_match {
LHS L;
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 8c9065aec7faa4..ef11713892a53d 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -489,7 +489,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
// Try and match this with target specific intrinsics.
- if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+ if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI, DeadInsts))
return false;
// We now have a target-specific load, so delete the old one.
@@ -510,13 +510,16 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
+ SmallVector<Instruction *, 4> InterleaveDeadInsts;
// Try and match this with target specific intrinsics.
- if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+ if (!TLI->lowerInterleaveIntrinsicToStore(II, SI, InterleaveDeadInsts))
return false;
// We now have a target-specific store, so delete the old one.
DeadInsts.push_back(SI);
DeadInsts.push_back(II);
+ DeadInsts.insert(DeadInsts.end(), InterleaveDeadInsts.begin(),
+ InterleaveDeadInsts.end());
return true;
}
@@ -537,7 +540,7 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
// with a factor of 2.
if (II->getIntrinsicID() == Intrinsic::vector_deinterleave2)
Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
- if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
+ else if (II->getIntrinsicID() == Intrinsic::vector_interleave2)
Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7777aa4b50a370..efc1703221d21e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17202,17 +17202,148 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
+bool getDeinterleave2Values(
+ Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
+ SmallVectorImpl<Instruction *> &DeInterleaveDeadInsts) {
+ if (!DI->hasNUses(2))
+ return false;
+ auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
+ auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
+ if (!Extr1 || !Extr2)
+ return false;
+
+ DeinterleavedValues.resize(2);
+ // Place the values into the vector in the order of extraction:
+ DeinterleavedValues[0x1 & (Extr1->getIndices()[0])] = Extr1;
+ DeinterleavedValues[0x1 & (Extr2->getIndices()[0])] = Extr2;
+ if (!DeinterleavedValues[0] || !DeinterleavedValues[1])
+ return false;
+
+ // Make sure that the extracted values match the deinterleave tree pattern
+ if (!match(DeinterleavedValues[0], m_ExtractValue<0>((m_Specific(DI)))) ||
+ !match(DeinterleavedValues[1], m_ExtractValue<1>((m_Specific(DI))))) {
+ LLVM_DEBUG(dbgs() << "matching deinterleave2 failed\n");
+ return false;
+ }
+ // DeinterleavedValues will be replaced by the output of ld2
+ DeInterleaveDeadInsts.insert(DeInterleaveDeadInsts.end(),
+ DeinterleavedValues.begin(),
+ DeinterleavedValues.end());
+ return true;
+}
+
+/*
+DeinterleaveIntrinsic tree:
+ [DI]
+ / \
+ [Extr<0>] [Extr<1>]
+ | |
+ [DI] [DI]
+ / \ / \
+ [Extr<0>][Extr<1>] [Extr<0>][Extr<1>]
+ | | | |
+roots: A C B D
+The roots in deinterleave4 order are: A, B, C, D.
+Returns true if `DI` is the top of an IR tree that represents a theoretical
+vector.deinterleave4 intrinsic. When true is returned, \p `DeinterleavedValues`
+is populated with the results such an intrinsic would return (i.e.
+{A, B, C, D} = vector.deinterleave4(...)).
+*/
+bool getDeinterleave4Values(
+ Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
+ SmallVectorImpl<Instruction *> &DeInterleaveDeadInsts) {
+ if (!DI->hasNUses(2))
+ return false;
+ auto *Extr1 = dyn_cast<ExtractValueInst>(*(DI->user_begin()));
+ auto *Extr2 = dyn_cast<ExtractValueInst>(*(++DI->user_begin()));
+ if (!Extr1 || !Extr2)
+ return false;
+
+ if (!Extr1->hasOneUse() || !Extr2->hasOneUse())
+ return false;
+ auto *DI1 = *(Extr1->user_begin());
+ auto *DI2 = *(Extr2->user_begin());
+
+ if (!DI1->hasNUses(2) || !DI2->hasNUses(2))
+ return false;
+ // Leaf nodes of the deinterleave tree:
+ auto *A = dyn_cast<ExtractValueInst>(*(DI1->user_begin()));
+ auto *C = dyn_cast<ExtractValueInst>(*(++DI1->user_begin()));
+ auto *B = dyn_cast<ExtractValueInst>(*(DI2->user_begin()));
+ auto *D = dyn_cast<ExtractValueInst>(*(++DI2->user_begin()));
+ // Make sure that A, B, C, and D are ExtractValue instructions before getting
+ // the extract index
+ if (!A || !B || !C || !D)
+ return false;
+
+ DeinterleavedValues.resize(4);
+ // Place the values into the vector in the order of deinterleave4:
+ DeinterleavedValues[0x3 &
+ ((A->getIndices()[0] * 2) + Extr1->getIndices()[0])] = A;
+ DeinterleavedValues[0x3 &
+ ((B->getIndices()[0] * 2) + Extr2->getIndices()[0])] = B;
+ DeinterleavedValues[0x3 &
+ ((C->getIndices()[0] * 2) + Extr1->getIndices()[0])] = C;
+ DeinterleavedValues[0x3 &
+ ((D->getIndices()[0] * 2) + Extr2->getIndices()[0])] = D;
+ if (!DeinterleavedValues[0] || !DeinterleavedValues[1] ||
+ !DeinterleavedValues[2] || !DeinterleavedValues[3])
+ return false;
+
+ // Make sure that A, B, C, and D match the deinterleave tree pattern
+ if (!match(DeinterleavedValues[0], m_ExtractValue<0>(m_Deinterleave2(
+ m_ExtractValue<0>(m_Specific(DI))))) ||
+ !match(DeinterleavedValues[1], m_ExtractValue<0>(m_Deinterleave2(
+ m_ExtractValue<1>(m_Specific(DI))))) ||
+ !match(DeinterleavedValues[2], m_ExtractValue<1>(m_Deinterleave2(
+ m_ExtractValue<0>(m_Specific(DI))))) ||
+ !match(DeinterleavedValues[3], m_ExtractValue<1>(m_Deinterleave2(
+ m_ExtractValue<1>(m_Specific(DI)))))) {
+ LLVM_DEBUG(dbgs() << "matching deinterleave4 failed\n");
+ return false;
+ }
+
+ // These values will not be used anymore;
+ // an ld4 will be created instead of the nested DI1 and DI2
+ DeInterleaveDeadInsts.insert(DeInterleaveDeadInsts.end(),
+ DeinterleavedValues.begin(),
+ DeinterleavedValues.end());
+ DeInterleaveDeadInsts.push_back(cast<Instruction>(DI1));
+ DeInterleaveDeadInsts.push_back(cast<Instruction>(Extr1));
+ DeInterleaveDeadInsts.push_back(cast<Instruction>(DI2));
+ DeInterleaveDeadInsts.push_back(cast<Instruction>(Extr2));
+
+ return true;
+}
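A quick worked example of the slot computation above: with i the index of the
parent extract (Extr1 or Extr2) and j the index of the leaf extract, a leaf is
placed at slot (2*j + i) & 0x3, which yields exactly the A, B, C, D order from
the diagram:

  slot(A) = (2*0 + 0) & 0x3 = 0
  slot(B) = (2*0 + 1) & 0x3 = 1
  slot(C) = (2*1 + 0) & 0x3 = 2
  slot(D) = (2*1 + 1) & 0x3 = 3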
+
+bool getDeinterleavedValues(
+ Value *DI, SmallVectorImpl<Instruction *> &DeinterleavedValues,
+ SmallVectorImpl<Instruction *> &DeInterleaveDeadInsts) {
+ if (getDeinterleave4Values(DI, DeinterleavedValues, DeInterleaveDeadInsts))
+ return true;
+ return getDeinterleave2Values(DI, DeinterleavedValues, DeInterleaveDeadInsts);
+}
+
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
- IntrinsicInst *DI, LoadInst *LI) const {
+ IntrinsicInst *DI, LoadInst *LI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const {
// Only deinterleave2 supported at present.
if (DI->getIntrinsicID() != Intrinsic::vector_deinterleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ SmallVector<Instruction *, 4> DeinterleavedValues;
+ SmallVector<Instruction *, 8> DeInterleaveDeadInsts;
- VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
- const DataLayout &DL = DI->getDataLayout();
+ if (!getDeinterleavedValues(DI, DeinterleavedValues, DeInterleaveDeadInsts)) {
+ LLVM_DEBUG(dbgs() << "Matching ld2 and ld4 patterns failed\n");
+ return false;
+ }
+ unsigned Factor = DeinterleavedValues.size();
+ assert((Factor == 2 || Factor == 4) &&
+ "Currently supported Factor is 2 or 4 only");
+ VectorType *VTy = cast<VectorType>(DeinterleavedValues[0]->getType());
+
+ const DataLayout &DL = DI->getModule()->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;
@@ -17223,7 +17354,6 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
return false;
unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
-
VectorType *LdTy =
VectorType::get(VTy->getElementType(),
VTy->getElementCount().divideCoefficientBy(NumLoads));
@@ -17233,18 +17363,15 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
UseScalable, LdTy, PtrTy);
IRBuilder<> Builder(LI);
-
Value *Pred = nullptr;
if (UseScalable)
Pred =
Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
Value *BaseAddr = LI->getPointerOperand();
- Value *Result;
if (NumLoads > 1) {
- Value *Left = PoisonValue::get(VTy);
- Value *Right = PoisonValue::get(VTy);
-
+ // Create multiple legal small ldN.
+ SmallVector<Value *, 4> ExtractedLdValues(Factor, PoisonValue::get(VTy));
for (unsigned I = 0; I < NumLoads; ++I) {
Value *Offset = Builder.getInt64(I * Factor);
@@ -17254,40 +17381,96 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
else
LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
-
Value *Idx =
Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
- Left = Builder.CreateInsertVector(
- VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
- Right = Builder.CreateInsertVector(
- VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
+ for (unsigned J = 0; J < Factor; ++J) {
+ ExtractedLdValues[J] = Builder.CreateInsertVector(
+ VTy, ExtractedLdValues[J], Builder.CreateExtractValue(LdN, J), Idx);
+ }
+ LLVM_DEBUG(dbgs() << "LdN4 res: "; LdN->dump());
}
-
- Result = PoisonValue::get(DI->getType());
- Result = Builder.CreateInsertValue(Result, Left, 0);
- Result = Builder.CreateInsertValue(Result, Right, 1);
+ // Replace the output of the deinterleave2 intrinsic with the output of ld2/ld4
+ for (unsigned J = 0; J < Factor; ++J)
+ DeinterleavedValues[J]->replaceAllUsesWith(ExtractedLdValues[J]);
} else {
+ Value *Result;
if (UseScalable)
Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
else
Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
+ // Replace the output of the deinterleave2 intrinsic with the output of ld2/ld4
+ for (unsigned I = 0; I < Factor; I++) {
+ Value *NewExtract = Builder.CreateExtractValue(Result, I);
+ DeinterleavedValues[I]->replaceAllUsesWith(NewExtract);
+ }
}
-
- DI->replaceAllUsesWith(Result);
+ DeadInsts.insert(DeadInsts.end(), DeInterleaveDeadInsts.begin(),
+ DeInterleaveDeadInsts.end());
return true;
}
+/*
+InterleaveIntrinsic tree.
+ A C B D
+ \ / \ /
+ [II] [II]
+ \ /
+ [II]
+
+The values in interleave4 order are: A, B, C, D.
+Returns true if `II` is the root of an IR tree that represents a theoretical
+vector.interleave4 intrinsic. When true is returned, \p `InterleavedValues`
+is populated with the inputs such an intrinsic would take (i.e.
+vector.interleave4(A, B, C, D)).
+*/
+bool getValuesToInterleave(
+ Value *II, SmallVectorImpl<Value *> &InterleavedValues,
+ SmallVectorImpl<Instruction *> &InterleaveDeadInsts) {
+ Value *A, *B, *C, *D;
+ // Try to match interleave of Factor 4
+ if (match(II, m_Interleave2(m_Interleave2(m_Value(A), m_Value(C)),
+ m_Interleave2(m_Value(B), m_Value(D))))) {
+ InterleavedValues.push_back(A);
+ InterleavedValues.push_back(B);
+ InterleavedValues.push_back(C);
+ InterleavedValues.push_back(D);
+ // The intermediate interleave2 instructions will not be needed anymore
+ InterleaveDeadInsts.push_back(
+ cast<Instruction>(cast<Instruction>(II)->getOperand(0)));
+ InterleaveDeadInsts.push_back(
+ cast<Instruction>(cast<Instruction>(II)->getOperand(1)));
+ return true;
+ }
+
+ // Try to match interleave of Factor 2
+ if (match(II, m_Interleave2(m_Value(A), m_Value(B)))) {
+ InterleavedValues.push_back(A);
+ InterleavedValues.push_back(B);
+ return true;
+ }
+
+ return false;
+}
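For reference, the factor-4 store pattern matched here looks like the
following IR (a hypothetical sketch; %a through %d stand for the four lanes
being stored):

  %lo = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
  %hi = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %b, <vscale x 4 x i32> %d)
  %interleaved = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %lo, <vscale x 8 x i32> %hi)
  store <vscale x 16 x i32> %interleaved, ptr %dst, align 4
  ; lowered by this patch to something like:
  ; call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i1> %pred, ptr %dst)

Note that A and C feed the low interleave and B and D the high one, matching
the tree in the comment above.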
+
bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
- IntrinsicInst *II, StoreInst *SI) const {
+ IntrinsicInst *II, StoreInst *SI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const {
// Only interleave2 supported at present.
if (II->getIntrinsicID() != Intrinsic::vector_interleave2)
return false;
- // Only a factor of 2 supported at present.
- const unsigned Factor = 2;
+ SmallVector<Value *, 4> InterleavedValues;
+ SmallVector<Instruction *, 2> InterleaveDeadInsts;
+ if (!getValuesToInterleave(II, InterleavedValues, InterleaveDeadInsts)) {
+ LLVM_DEBUG(dbgs() << "Matching st2 and st4 patterns failed\n");
+ return false;
+ }
+ unsigned Factor = InterleavedValues.size();
+ assert((Factor == 2 || Factor == 4) &&
+ "Currently supported Factor is 2 or 4 only");
+ VectorType *VTy = cast<VectorType>(InterleavedValues[0]->getType());
+ const DataLayout &DL = II->getModule()->getDataLayout();
- VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
- const DataLayout &DL = II->getDataLayout();
bool UseScalable;
if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
return false;
@@ -17316,27 +17499,28 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
Pred =
Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
- Value *L = II->getOperand(0);
- Value *R = II->getOperand(1);
-
+ auto ExtractedValues = InterleavedValues;
+ if (UseScalable)
+ InterleavedValues.push_back(Pred);
+ InterleavedValues.push_back(BaseAddr);
for (unsigned I = 0; I < NumStores; ++I) {
Value *Address = BaseAddr;
if (NumStores > 1) {
Value *Offset = Builder.getInt64(I * Factor);
Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
-
Value *Idx =
Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
- L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
- R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
+ for (unsigned J = 0; J < Factor; J++) {
+ InterleavedValues[J] =
+ Builder.CreateExtractVector(StTy, ExtractedValues[J], Idx);
+ }
+ // Update the address argument for this chunk
+ InterleavedValues[InterleavedValues.size() - 1] = Address;
}
-
- if (UseScalable)
- Builder.CreateCall(StNFunc, {L, R, Pred, Address});
- else
- Builder.CreateCall(StNFunc, {L, R, Address});
+ Builder.CreateCall(StNFunc, InterleavedValues);
}
-
+ DeadInsts.insert(DeadInsts.end(), InterleaveDeadInsts.begin(),
+ InterleaveDeadInsts.end());
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 81e15185f985d5..ace682fb89379d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -703,11 +703,13 @@ class AArch64TargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- LoadInst *LI) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *DI, LoadInst *LI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const override;
- bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI) const override;
+ bool lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, StoreInst *SI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const override;
bool isLegalAddImmediate(int64_t) const override;
bool isLegalAddScalableImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 872c288f5ad8cd..fbf60187435ac9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21365,8 +21365,9 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
- LoadInst *LI) const {
+bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *DI, LoadInst *LI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const {
assert(LI->isSimple());
IRBuilder<> Builder(LI);
@@ -21415,8 +21416,9 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
return true;
}
-bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI) const {
+bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, StoreInst *SI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const {
assert(SI->isSimple());
IRBuilder<> Builder(SI);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index d1d0760d8ffd1c..2298998b47357d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -879,11 +879,13 @@ class RISCVTargetLowering : public TargetLowering {
bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
unsigned Factor) const override;
- bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
- LoadInst *LI) const override;
+ bool lowerDeinterleaveIntrinsicToLoad(
+ IntrinsicInst *II, LoadInst *LI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const override;
- bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
- StoreInst *SI) const override;
+ bool lowerInterleaveIntrinsicToStore(
+ IntrinsicInst *II, StoreInst *SI,
+ SmallVectorImpl<Instruction *> &DeadInsts) const override;
bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
index 24d624c221f46b..09e2c53465cd73 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
@@ -6,123 +6,165 @@
target triple = "aarch64-linux-gnu"
-define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2(ptr %ptr) {
-; NEON-LABEL: define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2
+define void @deinterleave_i8_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_i8_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
; NEON-NEXT: [[LDN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <16 x i8>, <16 x i8> } [[LDN]]
+; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 0
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 1
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_i8_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <32 x i8>, ptr [[PTR]], align 1
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <32 x i8>, ptr %ptr, align 1
%deinterleave = tail call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %load)
- ret { <16 x i8>, <16 x i8> } %deinterleave
+ %extract1 = extractvalue { <16 x i8>, <16 x i8> } %deinterleave, 0
+ %extract2 = extractvalue { <16 x i8>, <16 x i8> } %deinterleave, 1
+ ret void
}
-define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2(ptr %ptr) {
-; NEON-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2
+define void @deinterleave_i16_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_i16_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
; NEON-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <8 x i16>, <8 x i16> } [[LDN]]
+; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 1
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_i16_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <16 x i16>, ptr [[PTR]], align 2
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <16 x i16>, ptr %ptr, align 2
%deinterleave = tail call { <8 x i16>, <8 x i16> } @llvm.vector.deinterleave2.v16i16(<16 x i16> %load)
- ret { <8 x i16>, <8 x i16> } %deinterleave
+ %extract1 = extractvalue { <8 x i16>, <8 x i16> } %deinterleave, 0
+ %extract2 = extractvalue { <8 x i16>, <8 x i16> } %deinterleave, 1
+ ret void
}
-define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2(ptr %ptr) {
-; NEON-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2
+define void @deinterleave_8xi32_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_8xi32_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <4 x i32>, <4 x i32> } [[LDN]]
+; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_8xi32_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <8 x i32>, ptr [[PTR]], align 4
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <8 x i32>, ptr %ptr, align 4
%deinterleave = tail call { <4 x i32>, <4 x i32> } @llvm.vector.deinterleave2.v8i32(<8 x i32> %load)
- ret { <4 x i32>, <4 x i32> } %deinterleave
+ %extract1 = extractvalue { <4 x i32>, <4 x i32> } %deinterleave, 0
+ %extract2 = extractvalue { <4 x i32>, <4 x i32> } %deinterleave, 1
+ ret void
}
-define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2(ptr %ptr) {
-; NEON-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2
+define void @deinterleave_i64_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_i64_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
; NEON-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <2 x i64>, <2 x i64> } [[LDN]]
+; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_i64_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x i64>, ptr [[PTR]], align 8
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <4 x i64>, ptr %ptr, align 8
%deinterleave = tail call { <2 x i64>, <2 x i64> } @llvm.vector.deinterleave2.v4i64(<4 x i64> %load)
- ret { <2 x i64>, <2 x i64> } %deinterleave
+ %extract1 = extractvalue { <2 x i64>, <2 x i64> } %deinterleave, 0
+ %extract2 = extractvalue { <2 x i64>, <2 x i64> } %deinterleave, 1
+ ret void
}
-define { <4 x float>, <4 x float> } @deinterleave_float_factor2(ptr %ptr) {
-; NEON-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2
+define void @deinterleave_float_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_float_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
; NEON-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <4 x float>, <4 x float> } [[LDN]]
+; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_float_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <8 x float>, ptr [[PTR]], align 4
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <4 x float>, <4 x float> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <4 x float>, <4 x float> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <4 x float>, <4 x float> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <8 x float>, ptr %ptr, align 4
%deinterleave = tail call { <4 x float>, <4 x float> } @llvm.vector.deinterleave2.v8f32(<8 x float> %load)
- ret { <4 x float>, <4 x float> } %deinterleave
+ %extract1 = extractvalue { <4 x float>, <4 x float> } %deinterleave, 0
+ %extract2 = extractvalue { <4 x float>, <4 x float> } %deinterleave, 1
+ ret void
}
-define { <2 x double>, <2 x double> } @deinterleave_double_factor2(ptr %ptr) {
-; NEON-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2
+define void @deinterleave_double_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_double_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
; NEON-NEXT: [[LDN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <2 x double>, <2 x double> } [[LDN]]
+; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 0
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 1
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_double_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x double>, ptr [[PTR]], align 8
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <2 x double>, <2 x double> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <2 x double>, <2 x double> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <2 x double>, <2 x double> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <4 x double>, ptr %ptr, align 8
%deinterleave = tail call { <2 x double>, <2 x double> } @llvm.vector.deinterleave2.v4f64(<4 x double> %load)
- ret { <2 x double>, <2 x double> } %deinterleave
+ %extract1 = extractvalue { <2 x double>, <2 x double> } %deinterleave, 0
+ %extract2 = extractvalue { <2 x double>, <2 x double> } %deinterleave, 1
+ ret void
}
-define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2(ptr %ptr) {
-; NEON-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2
+define void @deinterleave_ptr_factor2(ptr %ptr) {
+; NEON-LABEL: define void @deinterleave_ptr_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
; NEON-NEXT: [[LDN:%.*]] = call { <2 x ptr>, <2 x ptr> } @llvm.aarch64.neon.ld2.v2p0.p0(ptr [[PTR]])
-; NEON-NEXT: ret { <2 x ptr>, <2 x ptr> } [[LDN]]
+; NEON-NEXT: [[TMP1:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 0
+; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 1
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_ptr_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x ptr>, ptr [[PTR]], align 8
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <4 x ptr>, ptr %ptr, align 8
%deinterleave = tail call { <2 x ptr>, <2 x ptr> } @llvm.vector.deinterleave2.v4p0(<4 x ptr> %load)
- ret { <2 x ptr>, <2 x ptr> } %deinterleave
+ %extract1 = extractvalue { <2 x ptr>, <2 x ptr> } %deinterleave, 0
+ %extract2 = extractvalue { <2 x ptr>, <2 x ptr> } %deinterleave, 1
+ ret void
}
define void @interleave_i8_factor2(ptr %ptr, <16 x i8> %l, <16 x i8> %r) {
@@ -244,8 +286,8 @@ define void @interleave_ptr_factor2(ptr %ptr, <2 x ptr> %l, <2 x ptr> %r) {
ret void
}
-define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
-; NEON-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2
+define void @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
+; NEON-LABEL: define void @deinterleave_wide_i16_factor2
; NEON-SAME: (ptr [[PTR:%.*]]) {
; NEON-NEXT: [[TMP1:%.*]] = getelementptr <8 x i16>, ptr [[PTR]], i64 0
; NEON-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP1]])
@@ -259,19 +301,21 @@ define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2(ptr %ptr) #0 {
; NEON-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP3]], <8 x i16> [[TMP7]], i64 8)
; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 1
; NEON-NEXT: [[TMP10:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP5]], <8 x i16> [[TMP9]], i64 8)
-; NEON-NEXT: [[TMP11:%.*]] = insertvalue { <16 x i16>, <16 x i16> } poison, <16 x i16> [[TMP8]], 0
-; NEON-NEXT: [[TMP12:%.*]] = insertvalue { <16 x i16>, <16 x i16> } [[TMP11]], <16 x i16> [[TMP10]], 1
-; NEON-NEXT: ret { <16 x i16>, <16 x i16> } [[TMP12]]
+; NEON-NEXT: ret void
;
-; SVE-FIXED-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2
+; SVE-FIXED-LABEL: define void @deinterleave_wide_i16_factor2
; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <32 x i16>, ptr [[PTR]], align 2
; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16> [[LOAD]])
-; SVE-FIXED-NEXT: ret { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]]
+; SVE-FIXED-NEXT: [[EXTRACT1:%.*]] = extractvalue { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]], 0
+; SVE-FIXED-NEXT: [[EXTRACT2:%.*]] = extractvalue { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]], 1
+; SVE-FIXED-NEXT: ret void
;
%load = load <32 x i16>, ptr %ptr, align 2
%deinterleave = tail call { <16 x i16>, <16 x i16> } @llvm.vector.deinterleave2.v32i16(<32 x i16> %load)
- ret { <16 x i16>, <16 x i16> } %deinterleave
+ %extract1 = extractvalue { <16 x i16>, <16 x i16> } %deinterleave, 0
+ %extract2 = extractvalue { <16 x i16>, <16 x i16> } %deinterleave, 1
+ ret void
}
define void @interleave_wide_ptr_factor2(ptr %ptr, <8 x ptr> %l, <8 x ptr> %r) {
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
index 2a05718cc4161c..e5b56eb54f927d 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll
@@ -4,81 +4,109 @@
target triple = "aarch64-linux-gnu"
-define { <vscale x 16 x i8>, <vscale x 16 x i8> } @deinterleave_nxi8_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 16 x i8>, <vscale x 16 x i8> } @deinterleave_nxi8_factor2
+define void @deinterleave_nxi8_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi8_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 1
+; CHECK-NEXT: ret void
;
%load = load <vscale x 32 x i8>, ptr %ptr, align 1
%deinterleave = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %load)
- ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave
+ %extract1 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave, 1
+ %extract2 = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave, 0
+ ret void
}
-define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 8 x i16>, <vscale x 8 x i16> } @deinterleave_nxi16_factor2
+define void @deinterleave_nxi16_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi16_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
+; CHECK-NEXT: ret void
;
%load = load <vscale x 16 x i16>, ptr %ptr, align 2
%deinterleave = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %load)
- ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave
+ %extract1 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave, 1
+ ret void
}
-define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 4 x i32>, <vscale x 4 x i32> } @deinterleave_nx8xi32_factor2
+define void @deinterleave_nx8xi32_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nx8xi32_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: ret void
;
%load = load <vscale x 8 x i32>, ptr %ptr, align 4
%deinterleave = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load)
- ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave
+ %extract1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave, 1
+ ret void
}
-define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 2 x i64>, <vscale x 2 x i64> } @deinterleave_nxi64_factor2
+define void @deinterleave_nxi64_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi64_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: ret void
;
%load = load <vscale x 4 x i64>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %load)
- ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %deinterleave
+ %extract1 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %deinterleave, 1
+ ret void
}
-define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 4 x float>, <vscale x 4 x float> } @deinterleave_nxfloat_factor2
+define void @deinterleave_nxfloat_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxfloat_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2.sret.nxv4f32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
+; CHECK-NEXT: ret void
;
%load = load <vscale x 8 x float>, ptr %ptr, align 4
%deinterleave = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %load)
- ret { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleave
+ %extract1 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleave, 1
+ ret void
}
-define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 2 x double>, <vscale x 2 x double> } @deinterleave_nxdouble_factor2
+define void @deinterleave_nxdouble_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxdouble_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
+; CHECK-NEXT: ret void
;
%load = load <vscale x 4 x double>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %load)
- ret { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleave
+ %extract1 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleave, 1
+ ret void
}
-define { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @deinterleave_nxptr_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @deinterleave_nxptr_factor2
+define void @deinterleave_nxptr_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxptr_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.aarch64.sve.ld2.sret.nxv2p0(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
-; CHECK-NEXT: ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } [[LDN]], 1
+; CHECK-NEXT: ret void
;
%load = load <vscale x 4 x ptr>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr> %load)
- ret { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleave
+ %extract1 = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 2 x ptr>, <vscale x 2 x ptr> } %deinterleave, 1
+ ret void
}
define void @interleave_nxi8_factor2(ptr %ptr, <vscale x 16 x i8> %l, <vscale x 16 x i8> %r) #0 {
@@ -160,8 +188,8 @@ define void @interleave_nxptr_factor2(ptr %ptr, <vscale x 2 x ptr> %l, <vscale x
;;; Check that we 'legalize' operations that are wider than the target supports.
-define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_factor2
+define void @deinterleave_wide_nxi32_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_wide_nxi32_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[PTR]], i64 0
; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
@@ -187,17 +215,17 @@ define { <vscale x 16 x i32>, <vscale x 16 x i32> } @deinterleave_wide_nxi32_fac
; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP13]], <vscale x 4 x i32> [[TMP17]], i64 12)
; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN3]], 1
; CHECK-NEXT: [[TMP20:%.*]] = call <vscale x 16 x i32> @llvm.vector.insert.nxv16i32.nxv4i32(<vscale x 16 x i32> [[TMP15]], <vscale x 4 x i32> [[TMP19]], i64 12)
-; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } poison, <vscale x 16 x i32> [[TMP18]], 0
-; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP21]], <vscale x 16 x i32> [[TMP20]], 1
-; CHECK-NEXT: ret { <vscale x 16 x i32>, <vscale x 16 x i32> } [[TMP22]]
+; CHECK-NEXT: ret void
;
%load = load <vscale x 32 x i32>, ptr %ptr, align 4
%deinterleave = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
- ret { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave
+ %extract1 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave, 1
+ ret void
}
-define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdouble_factor2
+define void @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_wide_nxdouble_factor2
; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 0
; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP1]])
@@ -211,13 +239,13 @@ define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_wide_nxdou
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP7]], i64 2)
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 1
; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP9]], i64 2)
-; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } poison, <vscale x 4 x double> [[TMP8]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP11]], <vscale x 4 x double> [[TMP10]], 1
-; CHECK-NEXT: ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
+; CHECK-NEXT: ret void
;
%load = load <vscale x 8 x double>, ptr %ptr, align 8
%deinterleave = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %load)
- ret { <vscale x 4 x double>, <vscale x 4 x double> } %deinterleave
+ %extract1 = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %deinterleave, 0
+ %extract2 = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %deinterleave, 1
+ ret void
}
define void @interleave_wide_nxdouble_factor2(ptr %ptr, <vscale x 4 x double> %l, <vscale x 4 x double> %r) #0 {
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
new file mode 100644
index 00000000000000..06ecff67298813
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave4.ll
@@ -0,0 +1,138 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
+
+
+define void @deinterleave4(ptr %src) {
+; CHECK-LABEL: define void @deinterleave4
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT: [[SUM:%.*]] = add <vscale x 4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[SUB:%.*]] = sub <vscale x 4 x i32> [[TMP3]], [[TMP4]]
+; CHECK-NEXT: ret void
+;
+
+ %load = load <vscale x 16 x i32>, ptr %src, align 4
+ %deinterleave_src = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 1
+ %deinterleave_half1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 1
+ %deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+ %sum = add <vscale x 4 x i32> %5, %7
+ %sub = sub <vscale x 4 x i32> %6, %8
+ ret void
+}
+
+define void @wide_deinterleave4(ptr %src) {
+; CHECK-LABEL: define void @wide_deinterleave4
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 0
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 4
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP10]])
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP11]], i64 4)
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP13]], i64 4)
+; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 2
+; CHECK-NEXT: [[TMP16:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP15]], i64 4)
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 3
+; CHECK-NEXT: [[TMP18:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP17]], i64 4)
+; CHECK-NEXT: [[SUM:%.*]] = add <vscale x 8 x i32> [[TMP12]], [[TMP14]]
+; CHECK-NEXT: [[SUB:%.*]] = sub <vscale x 8 x i32> [[TMP16]], [[TMP18]]
+; CHECK-NEXT: ret void
+;
+ %load = load <vscale x 32 x i32>, ptr %src, align 4
+ %deinterleave_src = tail call { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32> %load)
+ %3 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave_src, 0
+ %4 = extractvalue { <vscale x 16 x i32>, <vscale x 16 x i32> } %deinterleave_src, 1
+ %deinterleave_half1 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %3)
+ %5 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half1, 0
+ %6 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half1, 1
+ %deinterleave_half2 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %4)
+ %7 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half2, 0
+ %8 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_half2, 1
+ %sum = add <vscale x 8 x i32> %5, %7
+ %sub = sub <vscale x 8 x i32> %6, %8
+ ret void
+}
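+
+; Note on the expansion checked above: <vscale x 16 x i32> results are wider
+; than one SVE register, so the factor-4 deinterleave is lowered to two
+; sve.ld4 calls, and the four results of each call are glued back together
+; with llvm.vector.insert at element offsets 0 and 4.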
+
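+; A factor-4 deinterleave tree and an unrelated single deinterleave2 in the
+; same function: the former should be lowered to sve.ld4 and the latter to
+; sve.ld2.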
+define void @mix_deinterleave4_deinterleave2(ptr %src) {
+; CHECK-LABEL: define void @mix_deinterleave4_deinterleave2
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[SRC]])
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT: ret void
+;
+
+ %load = load <vscale x 16 x i32>, ptr %src, align 4
+ %deinterleave_src = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 1
+ %deinterleave_half1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 0
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 1
+ %deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 0
+ %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+
+ %load2 = load <vscale x 8 x i32>, ptr %src, align 4
+ %deinterleave_src2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load2)
+ %ld2_1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_src2, 0
+ %ld2_2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_src2, 1
+ ret void
+}
+
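+; Negative test: only one result of each inner deinterleave2 is extracted, so
+; the tree is not treated as a complete factor-4 deinterleave. As checked
+; above, only the leaf deinterleave2 attached to the load is lowered (to a
+; pair of sve.ld2 calls); the inner deinterleave2 calls are left in place.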
+define void @negative_deinterleave4_test(ptr %src) {
+; CHECK-LABEL: define void @negative_deinterleave4_test
+; CHECK-SAME: (ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 0
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[SRC]], i64 2
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP7]], i64 4)
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP9]], i64 4)
+; CHECK-NEXT: [[DEINTERLEAVE_HALF1:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP8]])
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE_HALF1]], 0
+; CHECK-NEXT: [[DEINTERLEAVE_HALF2:%.*]] = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[TMP10]])
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[DEINTERLEAVE_HALF2]], 1
+; CHECK-NEXT: ret void
+;
+ %load = load <vscale x 16 x i32>, ptr %src, align 4
+ %deinterleave_src = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %load)
+ %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 0
+ %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %deinterleave_src, 1
+ %deinterleave_half1 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+ %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half1, 0
+ %deinterleave_half2 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+ %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave_half2, 1
+
+ ret void
+}
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
new file mode 100644
index 00000000000000..ba9bff093678c2
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleave4.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -passes=interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -S | FileCheck %s
+
+
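+; interleave2(interleave2(a, c), interleave2(b, d)) feeding a store is the
+; canonical factor-4 interleave; it should be lowered to a single sve.st4
+; taking a, b, c and d directly.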
+define void @interleave4(ptr %dst, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
+; CHECK-LABEL: define void @interleave4
+; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST]])
+; CHECK-NEXT: ret void
+;
+ %interleaved.half1 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
+ %interleaved.half2 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %b, <vscale x 4 x i32> %d)
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.half1, <vscale x 8 x i32> %interleaved.half2)
+ store <vscale x 16 x i32> %interleaved.vec, ptr %dst, align 4
+ ret void
+}
+
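+; The same pattern with operands twice the legal SVE register width: the
+; store is expected to split into two sve.st4 calls, each fed by
+; llvm.vector.extract of the low/high halves of a, b, c and d.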
+define void @wide_interleave4(ptr %dst, <vscale x 8 x i32> %a, <vscale x 8 x i32> %b, <vscale x 8 x i32> %c, <vscale x 8 x i32> %d) {
+; CHECK-LABEL: define void @wide_interleave4
+; CHECK-SAME: (ptr [[DST:%.*]], <vscale x 8 x i32> [[A:%.*]], <vscale x 8 x i32> [[B:%.*]], <vscale x 8 x i32> [[C:%.*]], <vscale x 8 x i32> [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[DST]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[A]], i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[B]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[C]], i64 0)
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[D]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP2]], <vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP1]])
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr <vscale x 4 x i32>, ptr [[DST]], i64 4
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[A]], i64 4)
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[B]], i64 4)
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[C]], i64 4)
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[D]], i64 4)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP7]], <vscale x 4 x i32> [[TMP8]], <vscale x 4 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP10]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[TMP6]])
+; CHECK-NEXT: ret void
+;
+ %interleaved.half1 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %a, <vscale x 8 x i32> %c)
+ %interleaved.half2 = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %b, <vscale x 8 x i32> %d)
+ %interleaved.vec = tail call <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32> %interleaved.half1, <vscale x 16 x i32> %interleaved.half2)
+ store <vscale x 32 x i32> %interleaved.vec, ptr %dst, align 4
+ ret void
+}
+
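+; A factor-4 interleave tree and an independent interleave2 in one function:
+; the former should become sve.st4 and the latter sve.st2.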
+define void @mix_interleave4_interleave2(ptr %dst1, ptr %dst2, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) {
+; CHECK-LABEL: define void @mix_interleave4_interleave2
+; CHECK-SAME: (ptr [[DST1:%.*]], ptr [[DST2:%.*]], <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], <vscale x 4 x i32> [[D:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[B]], <vscale x 4 x i32> [[C]], <vscale x 4 x i32> [[D]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST1]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> [[A]], <vscale x 4 x i32> [[C]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[DST2]])
+; CHECK-NEXT: ret void
+;
+ %interleaved.half1 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
+ %interleaved.half2 = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %b, <vscale x 4 x i32> %d)
+ %interleaved.vec = tail call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.half1, <vscale x 8 x i32> %interleaved.half2)
+ store <vscale x 16 x i32> %interleaved.vec, ptr %dst1, align 4
+
+ %interleaved = tail call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %c)
+ store <vscale x 8 x i32> %interleaved, ptr %dst2, align 4
+ ret void
+}
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
index 73f26814f3a4bc..8821255a86b2f8 100644
--- a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
@@ -1,36 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt < %s -interleaved-access -S | FileCheck %s
; RUN: opt < %s -passes=interleaved-access -S | FileCheck %s
target triple = "aarch64-linux-gnu"
define void @load_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: @load_factor2(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
-; CHECK-NEXT: [[EXT1:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16(<vscale x 8 x i16> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
-; CHECK-NEXT: [[EXT2:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16(<vscale x 8 x i16> [[TMP3]], i64 0)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16(<vscale x 8 x i16> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.vector.extract.v16i16.nxv8i16(<vscale x 8 x i16> [[TMP4]], i64 0)
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <32 x i16>, ptr %ptr, align 4
%v0 = shufflevector <32 x i16> %interleaved.vec, <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14,
- i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+ i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
%v1 = shufflevector <32 x i16> %interleaved.vec, <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15,
- i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+ i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
ret void
}
define void @load_factor3(ptr %ptr) #0 {
-; CHECK-LABEL: @load_factor3(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3.sret.nxv4i32(<vscale x 4 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
-; CHECK-NEXT: [[EXT1:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
-; CHECK-NEXT: [[EXT2:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
-; CHECK-NEXT: [[EXT3:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP4]], i64 0)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld3.sret.nxv4i32(<vscale x 4 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.vector.extract.v8i32.nxv4i32(<vscale x 4 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <24 x i32>, ptr %ptr, align 4
%v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
%v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
@@ -39,18 +44,20 @@ define void @load_factor3(ptr %ptr) #0 {
}
define void @load_factor4(ptr %ptr) #0 {
-; CHECK-LABEL: @load_factor4(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
-; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
-; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT4:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <16 x i64>, ptr %ptr, align 4
%v0 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
%v1 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -60,54 +67,64 @@ define void @load_factor4(ptr %ptr) #0 {
}
define void @store_factor2(ptr %ptr, <16 x i16> %v0, <16 x i16> %v1) #0 {
-; CHECK-LABEL: @store_factor2(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> %v0, <16 x i16> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> %v0, <16 x i16> %v1, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP2]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> [[INS1]], <vscale x 8 x i16> [[INS2]], <vscale x 8 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <16 x i16> [[V0:%.*]], <16 x i16> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[V0]], <16 x i16> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[V0]], <16 x i16> [[V1]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v16i16(<vscale x 8 x i16> undef, <16 x i16> [[TMP4]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%interleaved.vec = shufflevector <16 x i16> %v0, <16 x i16> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23,
- i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+ i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
store <32 x i16> %interleaved.vec, ptr %ptr, align 4
ret void
}
define void @store_factor3(ptr %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) #0 {
-; CHECK-LABEL: @store_factor3(
-; CHECK: [[PTRUE:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; CHECK-NEXT: [[INS3:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP3]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32> [[INS1]], <vscale x 4 x i32> [[INS2]], <vscale x 4 x i32> [[INS3]], <vscale x 4 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i32> [[V0:%.*]], <8 x i32> [[V1:%.*]], <8 x i32> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V0]], <8 x i32> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V2]], <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i32> [[S0]], <16 x i32> [[S1]], <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v8i32(<vscale x 4 x i32> undef, <8 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP7]], <vscale x 4 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
- i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%s1 = shufflevector <8 x i32> %v2, <8 x i32> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
- i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
%interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19,
- i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+ i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
store <24 x i32> %interleaved.vec, ptr %ptr, align 4
ret void
}
define void @store_factor4(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2, <4 x i64> %v3) #0 {
-; CHECK-LABEL: @store_factor4(
-; CHECK: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INS4:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i64> [[INS3]], <vscale x 2 x i64> [[INS4]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[V0:%.*]], <4 x i64> [[V1:%.*]], <4 x i64> [[V2:%.*]], <4 x i64> [[V3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x i64> [[V2]], <4 x i64> [[V3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[S0]], <8 x i64> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13, i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
@@ -116,16 +133,18 @@ define void @store_factor4(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2
}
define void @load_ptrvec_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: @load_ptrvec_factor2(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TOP1:%.*]] = inttoptr <4 x i64> [[EXT1]] to <4 x ptr>
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TOP2:%.*]] = inttoptr <4 x i64> [[EXT2]] to <4 x ptr>
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_ptrvec_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr <4 x i64> [[TMP3]] to <4 x ptr>
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr <4 x i64> [[TMP6]] to <4 x ptr>
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <8 x ptr>, ptr %ptr, align 4
%v0 = shufflevector <8 x ptr> %interleaved.vec, <8 x ptr> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%v1 = shufflevector <8 x ptr> %interleaved.vec, <8 x ptr> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -133,19 +152,21 @@ define void @load_ptrvec_factor2(ptr %ptr) #0 {
}
define void @load_ptrvec_factor3(ptr %ptr) #0 {
-; CHECK-LABEL: @load_ptrvec_factor3(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
-; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TOP1:%.*]] = inttoptr <4 x i64> [[EXT1]] to <4 x ptr>
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TOP2:%.*]] = inttoptr <4 x i64> [[EXT2]] to <4 x ptr>
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TOP3:%.*]] = inttoptr <4 x i64> [[EXT3]] to <4 x ptr>
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_ptrvec_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld3.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr <4 x i64> [[TMP3]] to <4 x ptr>
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr <4 x i64> [[TMP6]] to <4 x ptr>
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr <4 x i64> [[TMP9]] to <4 x ptr>
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <12 x ptr>, ptr %ptr, align 4
%v0 = shufflevector <12 x ptr> %interleaved.vec, <12 x ptr> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
%v1 = shufflevector <12 x ptr> %interleaved.vec, <12 x ptr> poison, <4 x i32> <i32 1, i32 4, i32 7, i32 10>
@@ -154,22 +175,24 @@ define void @load_ptrvec_factor3(ptr %ptr) #0 {
}
define void @load_ptrvec_factor4(ptr %ptr) #0 {
-; CHECK-LABEL: @load_ptrvec_factor4(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
-; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TOP1:%.*]] = inttoptr <4 x i64> [[EXT1]] to <4 x ptr>
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
-; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TOP2:%.*]] = inttoptr <4 x i64> [[EXT2]] to <4 x ptr>
-; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TOP3:%.*]] = inttoptr <4 x i64> [[EXT3]] to <4 x ptr>
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT4:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TOP4:%.*]] = inttoptr <4 x i64> [[EXT4]] to <4 x ptr>
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_ptrvec_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld4.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = inttoptr <4 x i64> [[TMP3]] to <4 x ptr>
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 2
+; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
+; CHECK-NEXT: [[TMP7:%.*]] = inttoptr <4 x i64> [[TMP6]] to <4 x ptr>
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = inttoptr <4 x i64> [[TMP9]] to <4 x ptr>
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP11]], i64 0)
+; CHECK-NEXT: [[TMP13:%.*]] = inttoptr <4 x i64> [[TMP12]] to <4 x ptr>
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <16 x ptr>, ptr %ptr, align 4
%v0 = shufflevector <16 x ptr> %interleaved.vec, <16 x ptr> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
%v1 = shufflevector <16 x ptr> %interleaved.vec, <16 x ptr> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@@ -179,34 +202,40 @@ define void @load_ptrvec_factor4(ptr %ptr) #0 {
}
define void @store_ptrvec_factor2(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1) #0 {
-; CHECK-LABEL: @store_ptrvec_factor2(
-; CHECK-NEXT: [[TOI1:%.*]] = ptrtoint <4 x ptr> %v0 to <4 x i64>
-; CHECK-NEXT: [[TOI2:%.*]] = ptrtoint <4 x ptr> %v1 to <4 x i64>
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TOI1]], <4 x i64> [[TOI2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TOI1]], <4 x i64> [[TOI2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_ptrvec_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x ptr> [[V0:%.*]], <4 x ptr> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <4 x ptr> [[V0]] to <4 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint <4 x ptr> [[V1]] to <4 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP1]], <4 x i64> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i1> [[TMP3]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%interleaved.vec = shufflevector <4 x ptr> %v0, <4 x ptr> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
store <8 x ptr> %interleaved.vec, ptr %ptr, align 4
ret void
}
define void @store_ptrvec_factor3(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1, <4 x ptr> %v2) #0 {
-; CHECK-LABEL: @store_ptrvec_factor3(
-; CHECK: [[TOI1:%.*]] = ptrtoint <8 x ptr> %s0 to <8 x i64>
-; CHECK-NEXT: [[TOI2:%.*]] = ptrtoint <8 x ptr> %s1 to <8 x i64>
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i64> [[INS3]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_ptrvec_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x ptr> [[V0:%.*]], <4 x ptr> [[V1:%.*]], <4 x ptr> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <4 x ptr> [[V0]], <4 x ptr> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x ptr> [[V2]], <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <8 x ptr> [[S0]] to <8 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint <8 x ptr> [[S1]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i1> [[TMP3]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%s0 = shufflevector <4 x ptr> %v0, <4 x ptr> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%s1 = shufflevector <4 x ptr> %v2, <4 x ptr> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%interleaved.vec = shufflevector <8 x ptr> %s0, <8 x ptr> %s1, <12 x i32> <i32 0, i32 4, i32 8, i32 1, i32 5, i32 9, i32 2, i32 6, i32 10, i32 3, i32 7, i32 11>
@@ -215,45 +244,51 @@ define void @store_ptrvec_factor3(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1, <4 x p
}
define void @store_ptrvec_factor4(ptr %ptr, <4 x ptr> %v0, <4 x ptr> %v1, <4 x ptr> %v2, <4 x ptr> %v3) #0 {
-; CHECK-LABEL: @store_ptrvec_factor4(
-; CHECK: [[TOI1:%.*]] = ptrtoint <8 x ptr> %s0 to <8 x i64>
-; CHECK-NEXT: [[TOI2:%.*]] = ptrtoint <8 x ptr> %s1 to <8 x i64>
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP1]], i64 0)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INS4:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i64> [[INS3]], <vscale x 2 x i64> [[INS4]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_ptrvec_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x ptr> [[V0:%.*]], <4 x ptr> [[V1:%.*]], <4 x ptr> [[V2:%.*]], <4 x ptr> [[V3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <4 x ptr> [[V0]], <4 x ptr> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x ptr> [[V2]], <4 x ptr> [[V3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <8 x ptr> [[S0]] to <8 x i64>
+; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint <8 x ptr> [[S1]] to <8 x i64>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> [[TMP2]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP10]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP11]], <vscale x 2 x i1> [[TMP3]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%s0 = shufflevector <4 x ptr> %v0, <4 x ptr> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%s1 = shufflevector <4 x ptr> %v2, <4 x ptr> %v3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
%interleaved.vec = shufflevector <8 x ptr> %s0, <8 x ptr> %s1, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 1, i32 5, i32 9, i32 13,
- i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
+ i32 2, i32 6, i32 10, i32 14, i32 3, i32 7, i32 11, i32 15>
store <16 x ptr> %interleaved.vec, ptr %ptr, align 4
ret void
}
define void @load_factor2_wide(ptr %ptr) #0 {
-; CHECK-LABEL: @load_factor2_wide(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr %ptr, i32 8
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[PTRUE]], ptr [[TMP4]])
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
-; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
-; CHECK-NEXT: [[EXT4:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP6]], i64 0)
-; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[EXT1]], <4 x i64> [[EXT3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[EXT2]], <4 x i64> [[EXT4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @load_factor2_wide(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[PTR]], i32 8
+; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP7]], i64 0)
+; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP9]], i64 0)
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <16 x i64>, ptr %ptr, align 4
%v0 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
%v1 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
@@ -261,20 +296,22 @@ define void @load_factor2_wide(ptr %ptr) #0 {
}
define void @store_factor2_wide(ptr %ptr, <8 x i64> %v0, <8 x i64> %v1) #0 {
-; CHECK-LABEL: @store_factor2_wide(
-; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; CHECK-NEXT: [[INS1:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
-; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT: [[INS2:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP3]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[INS1]], <vscale x 2 x i64> [[INS2]], <vscale x 2 x i1> [[PTRUE]], ptr %ptr)
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[INS3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[INS4:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr %ptr, i32 8
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[INS3]], <vscale x 2 x i64> [[INS4]], <vscale x 2 x i1> [[PTRUE]], ptr [[TMP6]])
-; CHECK-NEXT: ret void
+; CHECK-LABEL: define void @store_factor2_wide(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x i64> [[V0:%.*]], <8 x i64> [[V1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[V0]], <8 x i64> [[V1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP8]], i64 0)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[PTR]], i32 8
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP7]], <vscale x 2 x i64> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[TMP10]])
+; CHECK-NEXT: ret void
+;
%interleaved.vec = shufflevector <8 x i64> %v0, <8 x i64> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
store <16 x i64> %interleaved.vec, ptr %ptr, align 4
ret void
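; [Editorial sketch, not part of this patch: the store-side scalable analogue
; pairs llvm.vector.interleave2 with a plain store, which the pass lowers to
; a single sve.st2. Function name hypothetical.]
define void @store_factor2_scalable_sketch(ptr %ptr, <vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1) "target-features"="+sve" {
  %interleaved.vec = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1)
  store <vscale x 4 x i64> %interleaved.vec, ptr %ptr, align 4
  ret void
}
declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)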
@@ -282,9 +319,27 @@ define void @store_factor2_wide(ptr %ptr, <8 x i64> %v0, <8 x i64> %v1) #0 {
; Check that neon is used for illegal multiples of 128-bit types
define void @load_384bit(ptr %ptr) #0 {
-; CHECK-LABEL: @load_384bit(
-; CHECK: llvm.aarch64.neon.ld2
-; CHECK-NOT: llvm.aarch64.sve.ld2
+; CHECK-LABEL: define void @load_384bit(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i64, ptr [[PTR]], i32 4
+; CHECK-NEXT: [[LDN1:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[TMP3]])
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN1]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN1]], 0
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[TMP3]], i32 4
+; CHECK-NEXT: [[LDN2:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[TMP6]])
+; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN2]], 1
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN2]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> [[TMP10]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i64> [[TMP12]], <4 x i64> [[TMP13]], <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <12 x i64>, ptr %ptr, align 4
%v0 = shufflevector <12 x i64> %interleaved.vec, <12 x i64> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10>
%v1 = shufflevector <12 x i64> %interleaved.vec, <12 x i64> poison, <6 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
@@ -293,9 +348,13 @@ define void @load_384bit(ptr %ptr) #0 {
; Check that neon is used for 128-bit vectors
define void @load_128bit(ptr %ptr) #0 {
-; CHECK-LABEL: @load_128bit(
-; CHECK: llvm.aarch64.neon.ld2
-; CHECK-NOT: llvm.aarch64.sve.ld2
+; CHECK-LABEL: define void @load_128bit(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]])
+; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <4 x i64>, ptr %ptr, align 4
%v0 = shufflevector <4 x i64> %interleaved.vec, <4 x i64> poison, <2 x i32> <i32 0, i32 2>
%v1 = shufflevector <4 x i64> %interleaved.vec, <4 x i64> poison, <2 x i32> <i32 1, i32 3>
@@ -304,8 +363,16 @@ define void @load_128bit(ptr %ptr) #0 {
; Check that correct ptrues are generated for min != max case
define void @load_min_not_max(ptr %ptr) #1 {
-; CHECK-LABEL: @load_min_not_max(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @load_min_not_max(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <8 x i64>, ptr %ptr, align 4
%v0 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%v1 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -313,8 +380,16 @@ define void @load_min_not_max(ptr %ptr) #1 {
}
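; [Editorial sketch, not part of this patch: the ptrue immediate selects an
; SVE predicate pattern; 31 is SV_ALL (all lanes) and 4 is SV_VL4 (exactly
; four lanes). With vscale_range(2,2) a register is exactly 256 bits, so
; SV_ALL covers precisely the fixed-width payload; when min != max the
; register may be wider, so the predicate is pinned to the payload's element
; count instead, as the tests around here check.]
define void @ptrue_patterns_sketch() "target-features"="+sve" {
  %all = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) ; SV_ALL
  %vl4 = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)  ; SV_VL4
  ret void
}
declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)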
define void @store_min_not_max(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1) #1 {
-; CHECK-LABEL: @store_min_not_max(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @store_min_not_max(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[V0:%.*]], <4 x i64> [[V1:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%interleaved.vec = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
store <8 x i64> %interleaved.vec, ptr %ptr, align 4
ret void
@@ -322,8 +397,16 @@ define void @store_min_not_max(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1) #1 {
; Check that correct ptrues are generated for min > type case
define void @load_min_ge_type(ptr %ptr) #2 {
-; CHECK-LABEL: @load_min_ge_type(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @load_min_ge_type(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64(<vscale x 2 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: ret void
+;
%interleaved.vec = load <8 x i64>, ptr %ptr, align 4
%v0 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%v1 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
@@ -331,25 +414,34 @@ define void @load_min_ge_type(ptr %ptr) #2 {
}
define void @store_min_ge_type(ptr %ptr, <4 x i64> %v0, <4 x i64> %v1) #2 {
-; CHECK-LABEL: @store_min_ge_type(
-; CHECK: call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-LABEL: define void @store_min_ge_type(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x i64> [[V0:%.*]], <4 x i64> [[V1:%.*]]) #[[ATTR2]] {
+; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 4)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[V0]], <4 x i64> [[V1]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64> undef, <4 x i64> [[TMP4]], i64 0)
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[TMP3]], <vscale x 2 x i64> [[TMP5]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: ret void
+;
%interleaved.vec = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
store <8 x i64> %interleaved.vec, ptr %ptr, align 4
ret void
}
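; [Editorial sketch, not part of this test file: the headline change of this
; commit recognizes a tree of three deinterleave2 calls as one factor-4
; deinterleave and emits a single sve.ld4. Results 0/1 of %lo carry fields 0
; and 2, results 0/1 of %hi carry fields 1 and 3. Names hypothetical; the
; added sve-deinterleave4.ll test exercises this shape.]
define void @deinterleave4_sketch(ptr %ptr) "target-features"="+sve" {
  %wide.vec = load <vscale x 8 x i64>, ptr %ptr, align 8
  %root = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> %wide.vec)
  %evens = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %root, 0
  %odds = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } %root, 1
  %lo = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %evens)
  %hi = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %odds)
  ret void
}
declare { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)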
define void @load_double_factor4(ptr %ptr) #0 {
-; CHECK-LABEL: @load_double_factor4(
+; CHECK-LABEL: define void @load_double_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4.sret.nxv2f64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 3
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 2
-; CHECK-NEXT: [[TMP6:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
-; CHECK-NEXT: [[TMP8:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP7]], i64 0)
-; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP9]], i64 0)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld4.sret.nxv2f64(<vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 3
+; CHECK-NEXT: [[TMP3:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 2
+; CHECK-NEXT: [[TMP5:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
+; CHECK-NEXT: [[TMP7:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP6]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
+; CHECK-NEXT: [[TMP9:%.*]] = call <4 x double> @llvm.vector.extract.v4f64.nxv2f64(<vscale x 2 x double> [[TMP8]], i64 0)
; CHECK-NEXT: ret void
;
%interleaved.vec = load <16 x double>, ptr %ptr, align 4
@@ -361,15 +453,16 @@ define void @load_double_factor4(ptr %ptr) #0 {
}
define void @load_float_factor3(ptr %ptr) #0 {
-; CHECK-LABEL: @load_float_factor3(
+; CHECK-LABEL: define void @load_float_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3.sret.nxv4f32(<vscale x 4 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 2
-; CHECK-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP5]], i64 0)
-; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
-; CHECK-NEXT: [[TMP8:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP7]], i64 0)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld3.sret.nxv4f32(<vscale x 4 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 2
+; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP4]], i64 0)
+; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
+; CHECK-NEXT: [[TMP7:%.*]] = call <8 x float> @llvm.vector.extract.v8f32.nxv4f32(<vscale x 4 x float> [[TMP6]], i64 0)
; CHECK-NEXT: ret void
;
%interleaved.vec = load <24 x float>, ptr %ptr, align 4
@@ -380,13 +473,14 @@ define void @load_float_factor3(ptr %ptr) #0 {
}
define void @load_half_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: @load_half_factor2(
+; CHECK-LABEL: define void @load_half_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2.sret.nxv8f16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP5]], i64 0)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.ld2.sret.nxv8f16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x half> @llvm.vector.extract.v16f16.nxv8f16(<vscale x 8 x half> [[TMP4]], i64 0)
; CHECK-NEXT: ret void
;
%interleaved.vec = load <32 x half>, ptr %ptr, align 4
@@ -396,13 +490,14 @@ define void @load_half_factor2(ptr %ptr) #0 {
}
define void @load_bfloat_factor2(ptr %ptr) #0 {
-; CHECK-LABEL: @load_bfloat_factor2(
+; CHECK-LABEL: define void @load_bfloat_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2.sret.nxv8bf16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
-; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 1
-; CHECK-NEXT: [[TMP4:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP3]], i64 0)
-; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 0
-; CHECK-NEXT: [[TMP6:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP5]], i64 0)
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.ld2.sret.nxv8bf16(<vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 1
+; CHECK-NEXT: [[TMP3:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[LDN]], 0
+; CHECK-NEXT: [[TMP5:%.*]] = call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16(<vscale x 8 x bfloat> [[TMP4]], i64 0)
; CHECK-NEXT: ret void
;
%interleaved.vec = load <32 x bfloat>, ptr %ptr, align 4
@@ -412,9 +507,10 @@ define void @load_bfloat_factor2(ptr %ptr) #0 {
}
define void @store_double_factor4(ptr %ptr, <4 x double> %v0, <4 x double> %v1, <4 x double> %v2, <4 x double> %v3) #0 {
-; CHECK-LABEL: @store_double_factor4(
-; CHECK-NEXT: [[S0:%.*]] = shufflevector <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x double> [[V2:%.*]], <4 x double> [[V3:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-LABEL: define void @store_double_factor4(
+; CHECK-SAME: ptr [[PTR:%.*]], <4 x double> [[V0:%.*]], <4 x double> [[V1:%.*]], <4 x double> [[V2:%.*]], <4 x double> [[V3:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <4 x double> [[V0]], <4 x double> [[V1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <4 x double> [[V2]], <4 x double> [[V3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v4f64(<vscale x 2 x double> undef, <4 x double> [[TMP2]], i64 0)
@@ -424,7 +520,7 @@ define void @store_double_factor4(ptr %ptr, <4 x double> %v0, <4 x double> %v1,
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v4f64(<vscale x 2 x double> undef, <4 x double> [[TMP6]], i64 0)
; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[S0]], <8 x double> [[S1]], <4 x i32> <i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v4f64(<vscale x 2 x double> undef, <4 x double> [[TMP8]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP7]], <vscale x 2 x double> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double> [[TMP3]], <vscale x 2 x double> [[TMP5]], <vscale x 2 x double> [[TMP7]], <vscale x 2 x double> [[TMP9]], <vscale x 2 x i1> [[TMP1]], ptr [[PTR]])
; CHECK-NEXT: ret void
;
%s0 = shufflevector <4 x double> %v0, <4 x double> %v1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -435,9 +531,10 @@ define void @store_double_factor4(ptr %ptr, <4 x double> %v0, <4 x double> %v1,
}
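; [Editorial sketch, not part of this test file: the store-side counterpart
; matches a tree of three interleave2 calls as one factor-4 interleave and
; emits a single sve.st4. Interleaving (v0,v2) and (v1,v3) and then
; interleaving the two halves yields lanes in v0,v1,v2,v3 order. Names
; hypothetical; the added sve-interleave4.ll test exercises this shape.]
define void @interleave4_sketch(ptr %ptr, <vscale x 2 x i64> %v0, <vscale x 2 x i64> %v1, <vscale x 2 x i64> %v2, <vscale x 2 x i64> %v3) "target-features"="+sve" {
  %half1 = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %v0, <vscale x 2 x i64> %v2)
  %half2 = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> %v1, <vscale x 2 x i64> %v3)
  %interleaved.vec = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> %half1, <vscale x 4 x i64> %half2)
  store <vscale x 8 x i64> %interleaved.vec, ptr %ptr, align 8
  ret void
}
declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
declare <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64>, <vscale x 4 x i64>)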
define void @store_float_factor3(ptr %ptr, <8 x float> %v0, <8 x float> %v1, <8 x float> %v2) #0 {
-; CHECK-LABEL: @store_float_factor3(
-; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[V0:%.*]], <8 x float> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[V2:%.*]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-LABEL: define void @store_float_factor3(
+; CHECK-SAME: ptr [[PTR:%.*]], <8 x float> [[V0:%.*]], <8 x float> [[V1:%.*]], <8 x float> [[V2:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x float> [[V0]], <8 x float> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x float> [[V2]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[S0]], <16 x float> [[S1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[TMP2]], i64 0)
@@ -445,7 +542,7 @@ define void @store_float_factor3(ptr %ptr, <8 x float> %v0, <8 x float> %v1, <8
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[TMP4]], i64 0)
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[S0]], <16 x float> [[S1]], <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> undef, <8 x float> [[TMP6]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP7]], <vscale x 4 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float> [[TMP3]], <vscale x 4 x float> [[TMP5]], <vscale x 4 x float> [[TMP7]], <vscale x 4 x i1> [[TMP1]], ptr [[PTR]])
; CHECK-NEXT: ret void
;
%s0 = shufflevector <8 x float> %v0, <8 x float> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
@@ -459,13 +556,14 @@ define void @store_float_factor3(ptr %ptr, <8 x float> %v0, <8 x float> %v1, <8
}
define void @store_half_factor2(ptr %ptr, <16 x half> %v0, <16 x half> %v1) #0 {
-; CHECK-LABEL: @store_half_factor2(
+; CHECK-LABEL: define void @store_half_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <16 x half> [[V0:%.*]], <16 x half> [[V1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[V0:%.*]], <16 x half> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[V0]], <16 x half> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v16f16(<vscale x 8 x half> undef, <16 x half> [[TMP2]], i64 0)
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x half> [[V0]], <16 x half> [[V1]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x half> @llvm.vector.insert.nxv8f16.v16f16(<vscale x 8 x half> undef, <16 x half> [[TMP4]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half> [[TMP3]], <vscale x 8 x half> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
; CHECK-NEXT: ret void
;
%interleaved.vec = shufflevector <16 x half> %v0, <16 x half> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23,
@@ -476,13 +574,14 @@ define void @store_half_factor2(ptr %ptr, <16 x half> %v0, <16 x half> %v1) #0 {
define void @store_bfloat_factor2(ptr %ptr, <16 x bfloat> %v0, <16 x bfloat> %v1) #0 {
-; CHECK-LABEL: @store_bfloat_factor2(
+; CHECK-LABEL: define void @store_bfloat_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]], <16 x bfloat> [[V0:%.*]], <16 x bfloat> [[V1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x bfloat> [[V0:%.*]], <16 x bfloat> [[V1:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x bfloat> [[V0]], <16 x bfloat> [[V1]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; CHECK-NEXT: [[TMP3:%.*]] = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v16bf16(<vscale x 8 x bfloat> undef, <16 x bfloat> [[TMP2]], i64 0)
; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x bfloat> [[V0]], <16 x bfloat> [[V1]], <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v16bf16(<vscale x 8 x bfloat> undef, <16 x bfloat> [[TMP4]], i64 0)
-; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR:%.*]])
+; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat> [[TMP3]], <vscale x 8 x bfloat> [[TMP5]], <vscale x 8 x i1> [[TMP1]], ptr [[PTR]])
; CHECK-NEXT: ret void
;
%interleaved.vec = shufflevector <16 x bfloat> %v0, <16 x bfloat> %v1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23,
@@ -492,8 +591,9 @@ define void @store_bfloat_factor2(ptr %ptr, <16 x bfloat> %v0, <16 x bfloat> %v1
}
; Ensure vscale_range property does not affect scalable vector types.
-define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_factor2(ptr %ptr) #2 {
-; CHECK-LABEL: define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_factor2(
+define void @deinterleave_nxptr_factor2(ptr %ptr) #2 {
+; CHECK-LABEL: define void @deinterleave_nxptr_factor2(
+; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR2]] {
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr <vscale x 2 x double>, ptr [[PTR]], i64 0
; CHECK-NEXT: [[LDN1:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[TMP1]])
; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN1]], 0
@@ -506,13 +606,13 @@ define { <vscale x 4 x double>, <vscale x 4 x double> } @deinterleave_nxptr_fact
; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP3]], <vscale x 2 x double> [[TMP7]], i64 2)
; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN2]], 1
; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> [[TMP5]], <vscale x 2 x double> [[TMP9]], i64 2)
-; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } poison, <vscale x 4 x double> [[TMP8]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP11]], <vscale x 4 x double> [[TMP10]], 1
-; CHECK-NEXT: ret { <vscale x 4 x double>, <vscale x 4 x double> } [[TMP12]]
+; CHECK-NEXT: ret void
;
%wide.vec = load <vscale x 8 x double>, ptr %ptr, align 8
%ldN = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %wide.vec)
- ret { <vscale x 4 x double>, <vscale x 4 x double> } %ldN
+ %extract1 = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %ldN, 0
+ %extract2 = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %ldN, 1
+ ret void
}
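; [Editorial sketch, not part of this patch: in the test above each
; deinterleave result spans two SVE registers, so the pass loads with two
; sve.ld2 calls and reassembles each result with llvm.vector.insert, which
; writes a subvector at a fixed lane offset; a self-contained sketch of that
; concatenation step.]
define <vscale x 4 x double> @concat_halves_sketch(<vscale x 2 x double> %lo, <vscale x 2 x double> %hi) {
  %tmp = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> poison, <vscale x 2 x double> %lo, i64 0)
  %res = call <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double> %tmp, <vscale x 2 x double> %hi, i64 2)
  ret <vscale x 4 x double> %res
}
declare <vscale x 4 x double> @llvm.vector.insert.nxv4f64.nxv2f64(<vscale x 4 x double>, <vscale x 2 x double>, i64)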
attributes #0 = { vscale_range(2,2) "target-features"="+sve" }