[llvm] 5a1e16f - [IR][RISCV] Add llvm.vector.(de)interleave3/5/7 (#124825)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 5 15:30:38 PST 2025
Author: Min-Yih Hsu
Date: 2025-02-05T15:30:33-08:00
New Revision: 5a1e16f6de26c21cdfae1de05bd075d57029a3e1
URL: https://github.com/llvm/llvm-project/commit/5a1e16f6de26c21cdfae1de05bd075d57029a3e1
DIFF: https://github.com/llvm/llvm-project/commit/5a1e16f6de26c21cdfae1de05bd075d57029a3e1.diff
LOG: [IR][RISCV] Add llvm.vector.(de)interleave3/5/7 (#124825)
These three intrinsics are similar to llvm.vector.(de)interleave2 but
work with 3/5/7 vector operands or results.
For RISC-V, these intrinsics are important for supporting segmented
load/store with factors of 2 to 8: factors 2/4/8 can be synthesized
from (de)interleave2; factor 6 can be synthesized from factors 2 and
3; factors 3, 5, and 7 have their own intrinsics added by this patch.
This patch only adds codegen support for these intrinsics; we still
need to teach the vectorizer to generate them, as well as teach
InterleavedAccessPass to use them.
---------
Co-authored-by: Craig Topper <craig.topper at sifive.com>
Added:
Modified:
llvm/include/llvm/IR/DerivedTypes.h
llvm/include/llvm/IR/Intrinsics.h
llvm/include/llvm/IR/Intrinsics.td
llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
llvm/lib/IR/Intrinsics.cpp
llvm/lib/Target/RISCV/RISCVISelLowering.cpp
llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index b44f4f8c8687dc..60606d34c32c31 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -536,6 +536,15 @@ class VectorType : public Type {
EltCnt.divideCoefficientBy(2));
}
+ static VectorType *getOneNthElementsVectorType(VectorType *VTy,
+ unsigned Denominator) {
+ auto EltCnt = VTy->getElementCount();
+ assert(EltCnt.isKnownMultipleOf(Denominator) &&
+ "Cannot take one-nth of a vector");
+ return VectorType::get(VTy->getScalarType(),
+ EltCnt.divideCoefficientBy(Denominator));
+ }
+
/// This static method returns a VectorType with twice as many elements as the
/// input type and the same element type.
static VectorType *getDoubleElementsVectorType(VectorType *VTy) {
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 82f72131b9d2f4..65a7fc0ce2c1c9 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -148,6 +148,9 @@ namespace Intrinsic {
ExtendArgument,
TruncArgument,
HalfVecArgument,
+ OneThirdVecArgument,
+ OneFifthVecArgument,
+ OneSeventhVecArgument,
SameVecWidthArgument,
VecOfAnyPtrsToElt,
VecElementArgument,
@@ -159,6 +162,9 @@ namespace Intrinsic {
AArch64Svcount,
} Kind;
+ // These three have to be contiguous.
+ static_assert(OneFifthVecArgument == OneThirdVecArgument + 1 &&
+ OneSeventhVecArgument == OneFifthVecArgument + 1);
union {
unsigned Integer_Width;
unsigned Float_Width;
@@ -178,15 +184,17 @@ namespace Intrinsic {
unsigned getArgumentNumber() const {
assert(Kind == Argument || Kind == ExtendArgument ||
Kind == TruncArgument || Kind == HalfVecArgument ||
- Kind == SameVecWidthArgument || Kind == VecElementArgument ||
- Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
- Kind == VecOfBitcastsToInt);
+ Kind == OneThirdVecArgument || Kind == OneFifthVecArgument ||
+ Kind == OneSeventhVecArgument || Kind == SameVecWidthArgument ||
+ Kind == VecElementArgument || Kind == Subdivide2Argument ||
+ Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
return Argument_Info >> 3;
}
ArgKind getArgumentKind() const {
assert(Kind == Argument || Kind == ExtendArgument ||
Kind == TruncArgument || Kind == HalfVecArgument ||
- Kind == SameVecWidthArgument ||
+ Kind == OneThirdVecArgument || Kind == OneFifthVecArgument ||
+ Kind == OneSeventhVecArgument || Kind == SameVecWidthArgument ||
Kind == VecElementArgument || Kind == Subdivide2Argument ||
Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
return (ArgKind)(Argument_Info & 7);
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index ee877349a33149..d4ce4b1d199d7b 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -327,6 +327,9 @@ def IIT_I4 : IIT_Int<4, 58>;
def IIT_AARCH64_SVCOUNT : IIT_VT<aarch64svcount, 59>;
def IIT_V6 : IIT_Vec<6, 60>;
def IIT_V10 : IIT_Vec<10, 61>;
+def IIT_ONE_THIRD_VEC_ARG : IIT_Base<62>;
+def IIT_ONE_FIFTH_VEC_ARG : IIT_Base<63>;
+def IIT_ONE_SEVENTH_VEC_ARG : IIT_Base<64>;
}
defvar IIT_all_FixedTypes = !filter(iit, IIT_all,
@@ -467,6 +470,15 @@ class LLVMVectorElementType<int num> : LLVMMatchType<num, IIT_VEC_ELEMENT>;
class LLVMHalfElementsVectorType<int num>
: LLVMMatchType<num, IIT_HALF_VEC_ARG>;
+class LLVMOneThirdElementsVectorType<int num>
+ : LLVMMatchType<num, IIT_ONE_THIRD_VEC_ARG>;
+
+class LLVMOneFifthElementsVectorType<int num>
+ : LLVMMatchType<num, IIT_ONE_FIFTH_VEC_ARG>;
+
+class LLVMOneSeventhElementsVectorType<int num>
+ : LLVMMatchType<num, IIT_ONE_SEVENTH_VEC_ARG>;
+
// Match the type of another intrinsic parameter that is expected to be a
// vector type (i.e. <N x iM>) but with each element subdivided to
// form a vector with more elements that are smaller than the original.
@@ -2728,6 +2740,54 @@ def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType
[llvm_anyvector_ty],
[IntrNoMem]>;
+def int_vector_interleave3 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMOneThirdElementsVectorType<0>,
+ LLVMOneThirdElementsVectorType<0>,
+ LLVMOneThirdElementsVectorType<0>],
+ [IntrNoMem]>;
+
+def int_vector_deinterleave3 : DefaultAttrsIntrinsic<[LLVMOneThirdElementsVectorType<0>,
+ LLVMOneThirdElementsVectorType<0>,
+ LLVMOneThirdElementsVectorType<0>],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+
+def int_vector_interleave5 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMOneFifthElementsVectorType<0>,
+ LLVMOneFifthElementsVectorType<0>,
+ LLVMOneFifthElementsVectorType<0>,
+ LLVMOneFifthElementsVectorType<0>,
+ LLVMOneFifthElementsVectorType<0>],
+ [IntrNoMem]>;
+
+def int_vector_deinterleave5 : DefaultAttrsIntrinsic<[LLVMOneFifthElementsVectorType<0>,
+ LLVMOneFifthElementsVectorType<0>,
+ LLVMOneFifthElementsVectorType<0>,
+ LLVMOneFifthElementsVectorType<0>,
+ LLVMOneFifthElementsVectorType<0>],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+
+def int_vector_interleave7 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMOneSeventhElementsVectorType<0>,
+ LLVMOneSeventhElementsVectorType<0>,
+ LLVMOneSeventhElementsVectorType<0>,
+ LLVMOneSeventhElementsVectorType<0>,
+ LLVMOneSeventhElementsVectorType<0>,
+ LLVMOneSeventhElementsVectorType<0>,
+ LLVMOneSeventhElementsVectorType<0>],
+ [IntrNoMem]>;
+
+def int_vector_deinterleave7 : DefaultAttrsIntrinsic<[LLVMOneSeventhElementsVectorType<0>,
+ LLVMOneSeventhElementsVectorType<0>,
+ LLVMOneSeventhElementsVectorType<0>,
+ LLVMOneSeventhElementsVectorType<0>,
+ LLVMOneSeventhElementsVectorType<0>,
+ LLVMOneSeventhElementsVectorType<0>,
+ LLVMOneSeventhElementsVectorType<0>],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+
//===-------------- Intrinsics to perform partial reduction ---------------===//
def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>],
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index f1a91a782bbf93..a0f29496df7777 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -5881,15 +5881,19 @@ SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SPLICE(SDNode *N) {
}
SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_INTERLEAVE_DEINTERLEAVE(SDNode *N) {
- SDLoc dl(N);
+ SDLoc DL(N);
+ unsigned Factor = N->getNumOperands();
+
+ SmallVector<SDValue, 8> Ops(Factor);
+ for (unsigned i = 0; i != Factor; i++)
+ Ops[i] = GetPromotedInteger(N->getOperand(i));
+
+ SmallVector<EVT, 8> ResVTs(Factor, Ops[0].getValueType());
+ SDValue Res = DAG.getNode(N->getOpcode(), DL, DAG.getVTList(ResVTs), Ops);
+
+ for (unsigned i = 0; i != Factor; i++)
+ SetPromotedInteger(SDValue(N, i), Res.getValue(i));
- SDValue V0 = GetPromotedInteger(N->getOperand(0));
- SDValue V1 = GetPromotedInteger(N->getOperand(1));
- EVT ResVT = V0.getValueType();
- SDValue Res = DAG.getNode(N->getOpcode(), dl,
- DAG.getVTList(ResVT, ResVT), V0, V1);
- SetPromotedInteger(SDValue(N, 0), Res.getValue(0));
- SetPromotedInteger(SDValue(N, 1), Res.getValue(1));
return SDValue();
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 1000235ab4061f..e00be6c3958765 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -1668,6 +1668,15 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
return;
}
+ if (getTypeAction(SubVecVT) == TargetLowering::TypeWidenVector &&
+ Vec.isUndef() && SubVecVT.getVectorElementType() == MVT::i1) {
+ SDValue WideSubVec = GetWidenedVector(SubVec);
+ if (WideSubVec.getValueType() == VecVT) {
+ std::tie(Lo, Hi) = DAG.SplitVector(WideSubVec, SDLoc(WideSubVec));
+ return;
+ }
+ }
+
// Spill the vector to the stack.
// In cases where the vector is illegal it will be broken down into parts
// and stored in parts - we should use the alignment for the smallest part.
@@ -3183,34 +3192,53 @@ void DAGTypeLegalizer::SplitVecRes_VP_REVERSE(SDNode *N, SDValue &Lo,
}
void DAGTypeLegalizer::SplitVecRes_VECTOR_DEINTERLEAVE(SDNode *N) {
+ unsigned Factor = N->getNumOperands();
+
+ SmallVector<SDValue, 8> Ops(Factor * 2);
+ for (unsigned i = 0; i != Factor; ++i) {
+ SDValue OpLo, OpHi;
+ GetSplitVector(N->getOperand(i), OpLo, OpHi);
+ Ops[i * 2] = OpLo;
+ Ops[i * 2 + 1] = OpHi;
+ }
+
+ SmallVector<EVT, 8> VTs(Factor, Ops[0].getValueType());
- SDValue Op0Lo, Op0Hi, Op1Lo, Op1Hi;
- GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi);
- GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi);
- EVT VT = Op0Lo.getValueType();
SDLoc DL(N);
- SDValue ResLo = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL,
- DAG.getVTList(VT, VT), Op0Lo, Op0Hi);
- SDValue ResHi = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL,
- DAG.getVTList(VT, VT), Op1Lo, Op1Hi);
+ SDValue ResLo = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTs,
+ ArrayRef(Ops).slice(0, Factor));
+ SDValue ResHi = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTs,
+ ArrayRef(Ops).slice(Factor, Factor));
- SetSplitVector(SDValue(N, 0), ResLo.getValue(0), ResHi.getValue(0));
- SetSplitVector(SDValue(N, 1), ResLo.getValue(1), ResHi.getValue(1));
+ for (unsigned i = 0; i != Factor; ++i)
+ SetSplitVector(SDValue(N, i), ResLo.getValue(i), ResHi.getValue(i));
}
void DAGTypeLegalizer::SplitVecRes_VECTOR_INTERLEAVE(SDNode *N) {
- SDValue Op0Lo, Op0Hi, Op1Lo, Op1Hi;
- GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi);
- GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi);
- EVT VT = Op0Lo.getValueType();
+ unsigned Factor = N->getNumOperands();
+
+ SmallVector<SDValue, 8> Ops(Factor * 2);
+ for (unsigned i = 0; i != Factor; ++i) {
+ SDValue OpLo, OpHi;
+ GetSplitVector(N->getOperand(i), OpLo, OpHi);
+ Ops[i] = OpLo;
+ Ops[i + Factor] = OpHi;
+ }
+
+ SmallVector<EVT, 8> VTs(Factor, Ops[0].getValueType());
+
SDLoc DL(N);
- SDValue Res[] = {DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
- DAG.getVTList(VT, VT), Op0Lo, Op1Lo),
- DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
- DAG.getVTList(VT, VT), Op0Hi, Op1Hi)};
+ SDValue Res[] = {DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTs,
+ ArrayRef(Ops).slice(0, Factor)),
+ DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTs,
+ ArrayRef(Ops).slice(Factor, Factor))};
- SetSplitVector(SDValue(N, 0), Res[0].getValue(0), Res[0].getValue(1));
- SetSplitVector(SDValue(N, 1), Res[1].getValue(0), Res[1].getValue(1));
+ for (unsigned i = 0; i != Factor; ++i) {
+ unsigned IdxLo = 2 * i;
+ unsigned IdxHi = 2 * i + 1;
+ SetSplitVector(SDValue(N, i), Res[IdxLo / Factor].getValue(IdxLo % Factor),
+ Res[IdxHi / Factor].getValue(IdxHi % Factor));
+ }
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 428e7a316d247b..4e1ce6af3abc84 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8251,10 +8251,28 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
visitCallBrLandingPad(I);
return;
case Intrinsic::vector_interleave2:
- visitVectorInterleave(I);
+ visitVectorInterleave(I, 2);
+ return;
+ case Intrinsic::vector_interleave3:
+ visitVectorInterleave(I, 3);
+ return;
+ case Intrinsic::vector_interleave5:
+ visitVectorInterleave(I, 5);
+ return;
+ case Intrinsic::vector_interleave7:
+ visitVectorInterleave(I, 7);
return;
case Intrinsic::vector_deinterleave2:
- visitVectorDeinterleave(I);
+ visitVectorDeinterleave(I, 2);
+ return;
+ case Intrinsic::vector_deinterleave3:
+ visitVectorDeinterleave(I, 3);
+ return;
+ case Intrinsic::vector_deinterleave5:
+ visitVectorDeinterleave(I, 5);
+ return;
+ case Intrinsic::vector_deinterleave7:
+ visitVectorDeinterleave(I, 7);
return;
case Intrinsic::experimental_vector_compress:
setValue(&I, DAG.getNode(ISD::VECTOR_COMPRESS, sdl,
@@ -12565,26 +12583,31 @@ void SelectionDAGBuilder::visitVectorReverse(const CallInst &I) {
setValue(&I, DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), Mask));
}
-void SelectionDAGBuilder::visitVectorDeinterleave(const CallInst &I) {
+void SelectionDAGBuilder::visitVectorDeinterleave(const CallInst &I,
+ unsigned Factor) {
auto DL = getCurSDLoc();
SDValue InVec = getValue(I.getOperand(0));
- EVT OutVT =
- InVec.getValueType().getHalfNumVectorElementsVT(*DAG.getContext());
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(),
+ ValueVTs);
+
+ EVT OutVT = ValueVTs[0];
unsigned OutNumElts = OutVT.getVectorMinNumElements();
- // ISD Node needs the input vectors split into two equal parts
- SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec,
- DAG.getVectorIdxConstant(0, DL));
- SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec,
- DAG.getVectorIdxConstant(OutNumElts, DL));
+ SmallVector<SDValue, 4> SubVecs(Factor);
+ for (unsigned i = 0; i != Factor; ++i) {
+ assert(ValueVTs[i] == OutVT && "Expected VTs to be the same");
+ SubVecs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, InVec,
+ DAG.getVectorIdxConstant(OutNumElts * i, DL));
+ }
- // Use VECTOR_SHUFFLE for fixed-length vectors to benefit from existing
- // legalisation and combines.
- if (OutVT.isFixedLengthVector()) {
- SDValue Even = DAG.getVectorShuffle(OutVT, DL, Lo, Hi,
+ // Use VECTOR_SHUFFLE for fixed-length vectors with factor of 2 to benefit
+ // from existing legalisation and combines.
+ if (OutVT.isFixedLengthVector() && Factor == 2) {
+ SDValue Even = DAG.getVectorShuffle(OutVT, DL, SubVecs[0], SubVecs[1],
createStrideMask(0, 2, OutNumElts));
- SDValue Odd = DAG.getVectorShuffle(OutVT, DL, Lo, Hi,
+ SDValue Odd = DAG.getVectorShuffle(OutVT, DL, SubVecs[0], SubVecs[1],
createStrideMask(1, 2, OutNumElts));
SDValue Res = DAG.getMergeValues({Even, Odd}, getCurSDLoc());
setValue(&I, Res);
@@ -12592,32 +12615,43 @@ void SelectionDAGBuilder::visitVectorDeinterleave(const CallInst &I) {
}
SDValue Res = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL,
- DAG.getVTList(OutVT, OutVT), Lo, Hi);
+ DAG.getVTList(ValueVTs), SubVecs);
setValue(&I, Res);
}
-void SelectionDAGBuilder::visitVectorInterleave(const CallInst &I) {
+void SelectionDAGBuilder::visitVectorInterleave(const CallInst &I,
+ unsigned Factor) {
auto DL = getCurSDLoc();
- EVT InVT = getValue(I.getOperand(0)).getValueType();
- SDValue InVec0 = getValue(I.getOperand(0));
- SDValue InVec1 = getValue(I.getOperand(1));
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT InVT = getValue(I.getOperand(0)).getValueType();
EVT OutVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
- // Use VECTOR_SHUFFLE for fixed-length vectors to benefit from existing
- // legalisation and combines.
- if (OutVT.isFixedLengthVector()) {
+ SmallVector<SDValue, 8> InVecs(Factor);
+ for (unsigned i = 0; i < Factor; ++i) {
+ InVecs[i] = getValue(I.getOperand(i));
+ assert(InVecs[i].getValueType() == InVecs[0].getValueType() &&
+ "Expected VTs to be the same");
+ }
+
+ // Use VECTOR_SHUFFLE for fixed-length vectors with factor of 2 to benefit
+ // from existing legalisation and combines.
+ if (OutVT.isFixedLengthVector() && Factor == 2) {
unsigned NumElts = InVT.getVectorMinNumElements();
- SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, InVec0, InVec1);
+ SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, InVecs);
setValue(&I, DAG.getVectorShuffle(OutVT, DL, V, DAG.getUNDEF(OutVT),
createInterleaveMask(NumElts, 2)));
return;
}
- SDValue Res = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
- DAG.getVTList(InVT, InVT), InVec0, InVec1);
- Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Res.getValue(0),
- Res.getValue(1));
+ SmallVector<EVT, 8> ValueVTs(Factor, InVT);
+ SDValue Res =
+ DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, DAG.getVTList(ValueVTs), InVecs);
+
+ SmallVector<SDValue, 8> Results(Factor);
+ for (unsigned i = 0; i < Factor; ++i)
+ Results[i] = Res.getValue(i);
+
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Results);
setValue(&I, Res);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index ed85deef64fa79..ece48c9bedf722 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -659,8 +659,8 @@ class SelectionDAGBuilder {
void visitVectorReduce(const CallInst &I, unsigned Intrinsic);
void visitVectorReverse(const CallInst &I);
void visitVectorSplice(const CallInst &I);
- void visitVectorInterleave(const CallInst &I);
- void visitVectorDeinterleave(const CallInst &I);
+ void visitVectorInterleave(const CallInst &I, unsigned Factor);
+ void visitVectorDeinterleave(const CallInst &I, unsigned Factor);
void visitStepVector(const CallInst &I);
void visitUserOp1(const Instruction &I) {
diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp
index be8f33dc22f546..6e3c49b6d50b9a 100644
--- a/llvm/lib/IR/Intrinsics.cpp
+++ b/llvm/lib/IR/Intrinsics.cpp
@@ -363,6 +363,24 @@ DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
IITDescriptor::get(IITDescriptor::HalfVecArgument, ArgInfo));
return;
}
+ case IIT_ONE_THIRD_VEC_ARG: {
+ unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(
+ IITDescriptor::get(IITDescriptor::OneThirdVecArgument, ArgInfo));
+ return;
+ }
+ case IIT_ONE_FIFTH_VEC_ARG: {
+ unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(
+ IITDescriptor::get(IITDescriptor::OneFifthVecArgument, ArgInfo));
+ return;
+ }
+ case IIT_ONE_SEVENTH_VEC_ARG: {
+ unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(
+ IITDescriptor::get(IITDescriptor::OneSeventhVecArgument, ArgInfo));
+ return;
+ }
case IIT_SAME_VEC_WIDTH_ARG: {
unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
OutputTable.push_back(
@@ -556,6 +574,12 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
case IITDescriptor::HalfVecArgument:
return VectorType::getHalfElementsVectorType(
cast<VectorType>(Tys[D.getArgumentNumber()]));
+ case IITDescriptor::OneThirdVecArgument:
+ case IITDescriptor::OneFifthVecArgument:
+ case IITDescriptor::OneSeventhVecArgument:
+ return VectorType::getOneNthElementsVectorType(
+ cast<VectorType>(Tys[D.getArgumentNumber()]),
+ 3 + (D.Kind - IITDescriptor::OneThirdVecArgument) * 2);
case IITDescriptor::SameVecWidthArgument: {
Type *EltTy = DecodeFixedType(Infos, Tys, Context);
Type *Ty = Tys[D.getArgumentNumber()];
@@ -932,6 +956,16 @@ matchIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor> &Infos,
return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
VectorType::getHalfElementsVectorType(
cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+ case IITDescriptor::OneThirdVecArgument:
+ case IITDescriptor::OneFifthVecArgument:
+ case IITDescriptor::OneSeventhVecArgument:
+ // If this is a forward reference, defer the check for later.
+ if (D.getArgumentNumber() >= ArgTys.size())
+ return IsDeferredCheck || DeferCheck(Ty);
+ return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
+ VectorType::getOneNthElementsVectorType(
+ cast<VectorType>(ArgTys[D.getArgumentNumber()]),
+ 3 + (D.Kind - IITDescriptor::OneThirdVecArgument) * 2) != Ty;
case IITDescriptor::SameVecWidthArgument: {
if (D.getArgumentNumber() >= ArgTys.size()) {
// Defer check and subsequent check for the vector element type.
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 0284099c517b43..69decb4229139f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1189,6 +1189,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
{ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS, ISD::VECTOR_REVERSE}, VT,
Custom);
+ setOperationAction({ISD::VECTOR_INTERLEAVE, ISD::VECTOR_DEINTERLEAVE},
+ VT, Custom);
+
setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::EXTRACT_VECTOR_ELT},
VT, Custom);
@@ -1344,6 +1347,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VECTOR_SHUFFLE, ISD::VECTOR_COMPRESS},
VT, Custom);
+ setOperationAction({ISD::VECTOR_INTERLEAVE, ISD::VECTOR_DEINTERLEAVE},
+ VT, Custom);
+
setOperationAction({ISD::LOAD, ISD::STORE, ISD::MLOAD, ISD::MSTORE,
ISD::MGATHER, ISD::MSCATTER},
VT, Custom);
@@ -11030,78 +11036,161 @@ SDValue RISCVTargetLowering::lowerVECTOR_DEINTERLEAVE(SDValue Op,
SDLoc DL(Op);
MVT VecVT = Op.getSimpleValueType();
- assert(VecVT.isScalableVector() &&
- "vector_interleave on non-scalable vector!");
+ const unsigned Factor = Op->getNumValues();
+ assert(Factor <= 8);
// 1 bit element vectors need to be widened to e8
if (VecVT.getVectorElementType() == MVT::i1)
return widenVectorOpsToi8(Op, DL, DAG);
- // If the VT is LMUL=8, we need to split and reassemble.
- if (VecVT.getSizeInBits().getKnownMinValue() ==
+ // Convert to scalable vectors first.
+ if (VecVT.isFixedLengthVector()) {
+ MVT ContainerVT = getContainerForFixedLengthVector(VecVT);
+ SmallVector<SDValue, 8> Ops(Factor);
+ for (unsigned i = 0U; i < Factor; ++i)
+ Ops[i] = convertToScalableVector(ContainerVT, Op.getOperand(i), DAG,
+ Subtarget);
+
+ SmallVector<EVT, 8> VTs(Factor, ContainerVT);
+ SDValue NewDeinterleave =
+ DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTs, Ops);
+
+ SmallVector<SDValue, 8> Res(Factor);
+ for (unsigned i = 0U; i < Factor; ++i)
+ Res[i] = convertFromScalableVector(VecVT, NewDeinterleave.getValue(i),
+ DAG, Subtarget);
+ return DAG.getMergeValues(Res, DL);
+ }
+
+ // If concatenating would exceed LMUL=8, we need to split.
+ if ((VecVT.getSizeInBits().getKnownMinValue() * Factor) >
(8 * RISCV::RVVBitsPerBlock)) {
- auto [Op0Lo, Op0Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
- auto [Op1Lo, Op1Hi] = DAG.SplitVectorOperand(Op.getNode(), 1);
- EVT SplitVT = Op0Lo.getValueType();
-
- SDValue ResLo = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL,
- DAG.getVTList(SplitVT, SplitVT), Op0Lo, Op0Hi);
- SDValue ResHi = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL,
- DAG.getVTList(SplitVT, SplitVT), Op1Lo, Op1Hi);
-
- SDValue Even = DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT,
- ResLo.getValue(0), ResHi.getValue(0));
- SDValue Odd = DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, ResLo.getValue(1),
- ResHi.getValue(1));
- return DAG.getMergeValues({Even, Odd}, DL);
+ SmallVector<SDValue, 8> Ops(Factor * 2);
+ for (unsigned i = 0; i != Factor; ++i) {
+ auto [OpLo, OpHi] = DAG.SplitVectorOperand(Op.getNode(), i);
+ Ops[i * 2] = OpLo;
+ Ops[i * 2 + 1] = OpHi;
+ }
+
+ SmallVector<EVT, 8> VTs(Factor, Ops[0].getValueType());
+
+ SDValue Lo = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTs,
+ ArrayRef(Ops).slice(0, Factor));
+ SDValue Hi = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTs,
+ ArrayRef(Ops).slice(Factor, Factor));
+
+ SmallVector<SDValue, 8> Res(Factor);
+ for (unsigned i = 0; i != Factor; ++i)
+ Res[i] = DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo.getValue(i),
+ Hi.getValue(i));
+
+ return DAG.getMergeValues(Res, DL);
}
- // Concatenate the two vectors as one vector to deinterleave
+ SmallVector<SDValue, 8> Ops(Op->op_values());
+
+ // Concatenate the vectors as one vector to deinterleave
MVT ConcatVT =
MVT::getVectorVT(VecVT.getVectorElementType(),
- VecVT.getVectorElementCount().multiplyCoefficientBy(2));
- SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT,
- Op.getOperand(0), Op.getOperand(1));
+ VecVT.getVectorElementCount().multiplyCoefficientBy(
+ PowerOf2Ceil(Factor)));
+ if (Ops.size() < PowerOf2Ceil(Factor))
+ Ops.append(PowerOf2Ceil(Factor) - Factor, DAG.getUNDEF(VecVT));
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, Ops);
+
+ if (Factor == 2) {
+ // We can deinterleave through vnsrl.wi if the element type is smaller than
+ // ELEN
+ if (VecVT.getScalarSizeInBits() < Subtarget.getELen()) {
+ SDValue Even = getDeinterleaveShiftAndTrunc(DL, VecVT, Concat, 2, 0, DAG);
+ SDValue Odd = getDeinterleaveShiftAndTrunc(DL, VecVT, Concat, 2, 1, DAG);
+ return DAG.getMergeValues({Even, Odd}, DL);
+ }
+
+ // For the indices, use the vmv.v.x of an i8 constant to fill the largest
+ // possibly mask vector, then extract the required subvector. Doing this
+ // (instead of a vid, vmsne sequence) reduces LMUL, and allows the mask
+ // creation to be rematerialized during register allocation to reduce
+ // register pressure if needed.
+
+ MVT MaskVT = ConcatVT.changeVectorElementType(MVT::i1);
+
+ SDValue EvenSplat = DAG.getConstant(0b01010101, DL, MVT::nxv8i8);
+ EvenSplat = DAG.getBitcast(MVT::nxv64i1, EvenSplat);
+ SDValue EvenMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskVT,
+ EvenSplat, DAG.getVectorIdxConstant(0, DL));
+
+ SDValue OddSplat = DAG.getConstant(0b10101010, DL, MVT::nxv8i8);
+ OddSplat = DAG.getBitcast(MVT::nxv64i1, OddSplat);
+ SDValue OddMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskVT, OddSplat,
+ DAG.getVectorIdxConstant(0, DL));
+
+ // vcompress the even and odd elements into two separate vectors
+ SDValue EvenWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat,
+ EvenMask, DAG.getUNDEF(ConcatVT));
+ SDValue OddWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat,
+ OddMask, DAG.getUNDEF(ConcatVT));
+
+ // Extract the result half of the gather for even and odd
+ SDValue Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, EvenWide,
+ DAG.getVectorIdxConstant(0, DL));
+ SDValue Odd = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, OddWide,
+ DAG.getVectorIdxConstant(0, DL));
- // We can deinterleave through vnsrl.wi if the element type is smaller than
- // ELEN
- if (VecVT.getScalarSizeInBits() < Subtarget.getELen()) {
- SDValue Even = getDeinterleaveShiftAndTrunc(DL, VecVT, Concat, 2, 0, DAG);
- SDValue Odd = getDeinterleaveShiftAndTrunc(DL, VecVT, Concat, 2, 1, DAG);
return DAG.getMergeValues({Even, Odd}, DL);
}
- // For the indices, use the vmv.v.x of an i8 constant to fill the largest
- // possibly mask vector, then extract the required subvector. Doing this
- // (instead of a vid, vmsne sequence) reduces LMUL, and allows the mask
- // creation to be rematerialized during register allocation to reduce
- // register pressure if needed.
+ // Store with unit-stride store and load it back with segmented load.
+ MVT XLenVT = Subtarget.getXLenVT();
+ SDValue VL = getDefaultScalableVLOps(ConcatVT, DL, DAG, Subtarget).second;
+ SDValue Passthru = DAG.getUNDEF(ConcatVT);
- MVT MaskVT = ConcatVT.changeVectorElementType(MVT::i1);
+ // Allocate a stack slot.
+ Align Alignment = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(ConcatVT.getStoreSize(), Alignment);
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
- SDValue EvenSplat = DAG.getConstant(0b01010101, DL, MVT::nxv8i8);
- EvenSplat = DAG.getBitcast(MVT::nxv64i1, EvenSplat);
- SDValue EvenMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskVT, EvenSplat,
- DAG.getVectorIdxConstant(0, DL));
+ SDValue StoreOps[] = {DAG.getEntryNode(),
+ DAG.getTargetConstant(Intrinsic::riscv_vse, DL, XLenVT),
+ Concat, StackPtr, VL};
- SDValue OddSplat = DAG.getConstant(0b10101010, DL, MVT::nxv8i8);
- OddSplat = DAG.getBitcast(MVT::nxv64i1, OddSplat);
- SDValue OddMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskVT, OddSplat,
- DAG.getVectorIdxConstant(0, DL));
+ SDValue Chain = DAG.getMemIntrinsicNode(
+ ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), StoreOps,
+ ConcatVT.getVectorElementType(), PtrInfo, Alignment,
+ MachineMemOperand::MOStore, MemoryLocation::UnknownSize);
- // vcompress the even and odd elements into two separate vectors
- SDValue EvenWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat,
- EvenMask, DAG.getUNDEF(ConcatVT));
- SDValue OddWide = DAG.getNode(ISD::VECTOR_COMPRESS, DL, ConcatVT, Concat,
- OddMask, DAG.getUNDEF(ConcatVT));
+ static const Intrinsic::ID VlsegIntrinsicsIds[] = {
+ Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3, Intrinsic::riscv_vlseg4,
+ Intrinsic::riscv_vlseg5, Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7,
+ Intrinsic::riscv_vlseg8};
- // Extract the result half of the gather for even and odd
- SDValue Even = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, EvenWide,
- DAG.getVectorIdxConstant(0, DL));
- SDValue Odd = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VecVT, OddWide,
- DAG.getVectorIdxConstant(0, DL));
+ SDValue LoadOps[] = {
+ Chain,
+ DAG.getTargetConstant(VlsegIntrinsicsIds[Factor - 2], DL, XLenVT),
+ Passthru,
+ StackPtr,
+ VL,
+ DAG.getTargetConstant(Log2_64(VecVT.getScalarSizeInBits()), DL, XLenVT)};
+
+ unsigned Sz =
+ Factor * VecVT.getVectorMinNumElements() * VecVT.getScalarSizeInBits();
+ EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, Factor);
+
+ SDValue Load = DAG.getMemIntrinsicNode(
+ ISD::INTRINSIC_W_CHAIN, DL, DAG.getVTList({VecTupTy, MVT::Other}),
+ LoadOps, ConcatVT.getVectorElementType(), PtrInfo, Alignment,
+ MachineMemOperand::MOLoad, MemoryLocation::UnknownSize);
- return DAG.getMergeValues({Even, Odd}, DL);
+ SmallVector<SDValue, 8> Res(Factor);
+
+ for (unsigned i = 0U; i < Factor; ++i)
+ Res[i] = DAG.getNode(RISCVISD::TUPLE_EXTRACT, DL, VecVT, Load,
+ DAG.getVectorIdxConstant(i, DL));
+
+ return DAG.getMergeValues(Res, DL);
}
SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op,
@@ -11109,36 +11198,125 @@ SDValue RISCVTargetLowering::lowerVECTOR_INTERLEAVE(SDValue Op,
SDLoc DL(Op);
MVT VecVT = Op.getSimpleValueType();
- assert(VecVT.isScalableVector() &&
- "vector_interleave on non-scalable vector!");
+ const unsigned Factor = Op.getNumOperands();
+ assert(Factor <= 8);
// i1 vectors need to be widened to i8
if (VecVT.getVectorElementType() == MVT::i1)
return widenVectorOpsToi8(Op, DL, DAG);
+ // Convert to scalable vectors first.
+ if (VecVT.isFixedLengthVector()) {
+ MVT ContainerVT = getContainerForFixedLengthVector(VecVT);
+ SmallVector<SDValue, 8> Ops(Factor);
+ for (unsigned i = 0U; i < Factor; ++i)
+ Ops[i] = convertToScalableVector(ContainerVT, Op.getOperand(i), DAG,
+ Subtarget);
+
+ SmallVector<EVT, 8> VTs(Factor, ContainerVT);
+ SDValue NewInterleave = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTs, Ops);
+
+ SmallVector<SDValue, 8> Res(Factor);
+ for (unsigned i = 0U; i < Factor; ++i)
+ Res[i] = convertFromScalableVector(VecVT, NewInterleave.getValue(i), DAG,
+ Subtarget);
+ return DAG.getMergeValues(Res, DL);
+ }
+
MVT XLenVT = Subtarget.getXLenVT();
SDValue VL = DAG.getRegister(RISCV::X0, XLenVT);
- // If the VT is LMUL=8, we need to split and reassemble.
- if (VecVT.getSizeInBits().getKnownMinValue() == (8 * RISCV::RVVBitsPerBlock)) {
- auto [Op0Lo, Op0Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
- auto [Op1Lo, Op1Hi] = DAG.SplitVectorOperand(Op.getNode(), 1);
- EVT SplitVT = Op0Lo.getValueType();
+ // If the VT is larger than LMUL=8, we need to split and reassemble.
+ if ((VecVT.getSizeInBits().getKnownMinValue() * Factor) >
+ (8 * RISCV::RVVBitsPerBlock)) {
+ SmallVector<SDValue, 8> Ops(Factor * 2);
+ for (unsigned i = 0; i != Factor; ++i) {
+ auto [OpLo, OpHi] = DAG.SplitVectorOperand(Op.getNode(), i);
+ Ops[i] = OpLo;
+ Ops[i + Factor] = OpHi;
+ }
+
+ SmallVector<EVT, 8> VTs(Factor, Ops[0].getValueType());
- SDValue ResLo = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
- DAG.getVTList(SplitVT, SplitVT), Op0Lo, Op1Lo);
- SDValue ResHi = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL,
- DAG.getVTList(SplitVT, SplitVT), Op0Hi, Op1Hi);
+ SDValue Res[] = {DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTs,
+ ArrayRef(Ops).take_front(Factor)),
+ DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTs,
+ ArrayRef(Ops).drop_front(Factor))};
- SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT,
- ResLo.getValue(0), ResLo.getValue(1));
- SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT,
- ResHi.getValue(0), ResHi.getValue(1));
- return DAG.getMergeValues({Lo, Hi}, DL);
+ SmallVector<SDValue, 8> Concats(Factor);
+ for (unsigned i = 0; i != Factor; ++i) {
+ unsigned IdxLo = 2 * i;
+ unsigned IdxHi = 2 * i + 1;
+ Concats[i] = DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT,
+ Res[IdxLo / Factor].getValue(IdxLo % Factor),
+ Res[IdxHi / Factor].getValue(IdxHi % Factor));
+ }
+
+ return DAG.getMergeValues(Concats, DL);
}
SDValue Interleaved;
+ // Spill to the stack using a segment store for simplicity.
+ if (Factor != 2) {
+ EVT MemVT =
+ EVT::getVectorVT(*DAG.getContext(), VecVT.getVectorElementType(),
+ VecVT.getVectorElementCount() * Factor);
+
+ // Allocate a stack slot.
+ Align Alignment = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(MemVT.getStoreSize(), Alignment);
+ EVT PtrVT = StackPtr.getValueType();
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+ static const Intrinsic::ID IntrIds[] = {
+ Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3,
+ Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5,
+ Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7,
+ Intrinsic::riscv_vsseg8,
+ };
+
+ unsigned Sz =
+ Factor * VecVT.getVectorMinNumElements() * VecVT.getScalarSizeInBits();
+ EVT VecTupTy = MVT::getRISCVVectorTupleVT(Sz, Factor);
+
+ SDValue StoredVal = DAG.getUNDEF(VecTupTy);
+ for (unsigned i = 0; i < Factor; i++)
+ StoredVal = DAG.getNode(RISCVISD::TUPLE_INSERT, DL, VecTupTy, StoredVal,
+ Op.getOperand(i), DAG.getConstant(i, DL, XLenVT));
+
+ SDValue Ops[] = {DAG.getEntryNode(),
+ DAG.getTargetConstant(IntrIds[Factor - 2], DL, XLenVT),
+ StoredVal,
+ StackPtr,
+ VL,
+ DAG.getTargetConstant(Log2_64(VecVT.getScalarSizeInBits()),
+ DL, XLenVT)};
+
+ SDValue Chain = DAG.getMemIntrinsicNode(
+ ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Ops,
+ VecVT.getVectorElementType(), PtrInfo, Alignment,
+ MachineMemOperand::MOStore, MemoryLocation::UnknownSize);
+
+ SmallVector<SDValue, 8> Loads(Factor);
+
+ SDValue Increment =
+ DAG.getVScale(DL, PtrVT,
+ APInt(PtrVT.getFixedSizeInBits(),
+ VecVT.getStoreSize().getKnownMinValue()));
+ for (unsigned i = 0; i != Factor; ++i) {
+ if (i != 0)
+ StackPtr = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, Increment);
+
+ Loads[i] = DAG.getLoad(VecVT, DL, Chain, StackPtr, PtrInfo);
+ }
+
+ return DAG.getMergeValues(Loads, DL);
+ }
+
// If the element type is smaller than ELEN, then we can interleave with
// vwaddu.vv and vwmaccu.vx
if (VecVT.getScalarSizeInBits() < Subtarget.getELen()) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index 2fc5b40a89afad..7115eacf849207 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh | FileCheck %s
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh | FileCheck %s
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh | FileCheck %s --check-prefixes=CHECK,RV64
; Integers
@@ -152,11 +152,268 @@ define {<8 x i64>, <8 x i64>} @vector_deinterleave_v8i64_v16i64(<16 x i64> %vec)
ret {<8 x i64>, <8 x i64>} %retval
}
-declare {<16 x i1>, <16 x i1>} @llvm.vector.deinterleave2.v32i1(<32 x i1>)
-declare {<16 x i8>, <16 x i8>} @llvm.vector.deinterleave2.v32i8(<32 x i8>)
-declare {<8 x i16>, <8 x i16>} @llvm.vector.deinterleave2.v16i16(<16 x i16>)
-declare {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v8i32(<8 x i32>)
-declare {<2 x i64>, <2 x i64>} @llvm.vector.deinterleave2.v4i64(<4 x i64>)
+define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x i32> %v) {
+; CHECK-LABEL: vector_deinterleave3_v2i32_v6i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 2
+; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v12, v8, 4
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v9, v12
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %res = call {<2 x i32>, <2 x i32>, <2 x i32>} @llvm.vector.deinterleave3.v6i32(<6 x i32> %v)
+ ret {<2 x i32>, <2 x i32>, <2 x i32>} %res
+}
+
+
+define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave5_v2i16_v10i16(<10 x i16> %v) {
+; CHECK-LABEL: vector_deinterleave5_v2i16_v10i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 6
+; CHECK-NEXT: vslidedown.vi v11, v8, 4
+; CHECK-NEXT: vslidedown.vi v12, v8, 2
+; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v14, v8, 8
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v11, v10, a1
+; CHECK-NEXT: vslideup.vx v8, v12, a1
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v11, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v9, v14
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %res = call {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @llvm.vector.deinterleave5.v10i16(<10 x i16> %v)
+ ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res
+}
+
+define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave7_v14i8_v2i8(<14 x i8> %v) {
+; RV32-LABEL: vector_deinterleave7_v14i8_v2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: .cfi_offset s1, -12
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
+; RV32-NEXT: addi a0, sp, 32
+; RV32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: csrr s1, vlenb
+; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 10
+; RV32-NEXT: vslidedown.vi v11, v8, 8
+; RV32-NEXT: vslidedown.vi v9, v8, 2
+; RV32-NEXT: srli s0, s1, 3
+; RV32-NEXT: add a0, s0, s0
+; RV32-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
+; RV32-NEXT: vslideup.vx v11, v10, s0
+; RV32-NEXT: vmv1r.v v10, v8
+; RV32-NEXT: vslideup.vx v10, v9, s0
+; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 12
+; RV32-NEXT: srli a0, s1, 2
+; RV32-NEXT: add a1, a0, s0
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
+; RV32-NEXT: vslideup.vx v11, v9, a0
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: slli a2, a2, 1
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 32
+; RV32-NEXT: vs1r.v v11, (a2) # Unknown-size Folded Spill
+; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v9, v8, 4
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
+; RV32-NEXT: vslideup.vx v10, v9, a0
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 32
+; RV32-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill
+; RV32-NEXT: li a1, 3
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __mulsi3
+; RV32-NEXT: add s0, a0, s0
+; RV32-NEXT: addi a1, sp, 32
+; RV32-NEXT: vl1r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v8, 6
+; RV32-NEXT: srli s1, s1, 1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 32
+; RV32-NEXT: vl1r.v v9, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, s0, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v9, v8, a0
+; RV32-NEXT: add a0, s1, s1
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a1, a1, 1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 32
+; RV32-NEXT: vl1r.v v8, (a1) # Unknown-size Folded Reload
+; RV32-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; RV32-NEXT: vslideup.vx v9, v8, s1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 1
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 32
+; RV32-NEXT: vs1r.v v9, (a0)
+; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; RV32-NEXT: vlseg7e8.v v8, (a0)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 2
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 48
+; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore ra
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: .cfi_restore s1
+; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_deinterleave7_v14i8_v2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -64
+; RV64-NEXT: .cfi_def_cfa_offset 64
+; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: .cfi_offset s1, -24
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 4 * vlenb
+; RV64-NEXT: addi a0, sp, 32
+; RV64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: csrr s1, vlenb
+; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 10
+; RV64-NEXT: vslidedown.vi v11, v8, 8
+; RV64-NEXT: vslidedown.vi v9, v8, 2
+; RV64-NEXT: srli s0, s1, 3
+; RV64-NEXT: add a0, s0, s0
+; RV64-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
+; RV64-NEXT: vslideup.vx v11, v10, s0
+; RV64-NEXT: vmv1r.v v10, v8
+; RV64-NEXT: vslideup.vx v10, v9, s0
+; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 12
+; RV64-NEXT: srli a0, s1, 2
+; RV64-NEXT: add a1, a0, s0
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
+; RV64-NEXT: vslideup.vx v11, v9, a0
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: slli a2, a2, 1
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 32
+; RV64-NEXT: vs1r.v v11, (a2) # Unknown-size Folded Spill
+; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v9, v8, 4
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
+; RV64-NEXT: vslideup.vx v10, v9, a0
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 32
+; RV64-NEXT: vs1r.v v10, (a0) # Unknown-size Folded Spill
+; RV64-NEXT: li a1, 3
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __muldi3
+; RV64-NEXT: add s0, a0, s0
+; RV64-NEXT: addi a1, sp, 32
+; RV64-NEXT: vl1r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v8, 6
+; RV64-NEXT: srli s1, s1, 1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 32
+; RV64-NEXT: vl1r.v v9, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetvli zero, s0, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v9, v8, a0
+; RV64-NEXT: add a0, s1, s1
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a1, a1, 1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 32
+; RV64-NEXT: vl1r.v v8, (a1) # Unknown-size Folded Reload
+; RV64-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; RV64-NEXT: vslideup.vx v9, v8, s1
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a1, a0, 1
+; RV64-NEXT: add a0, a1, a0
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 32
+; RV64-NEXT: vs1r.v v9, (a0)
+; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; RV64-NEXT: vlseg7e8.v v8, (a0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 2
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 64
+; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore ra
+; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: .cfi_restore s1
+; RV64-NEXT: addi sp, sp, 64
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+ %res = call {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @llvm.vector.deinterleave7.v14i8(<14 x i8> %v)
+ ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res
+}
+
; Floats
@@ -267,9 +524,125 @@ define {<4 x double>, <4 x double>} @vector_deinterleave_v4f64_v8f64(<8 x double
ret {<4 x double>, <4 x double>} %retval
}
-declare {<2 x half>,<2 x half>} @llvm.vector.deinterleave2.v4f16(<4 x half>)
-declare {<4 x half>, <4 x half>} @llvm.vector.deinterleave2.v8f16(<8 x half>)
-declare {<2 x float>, <2 x float>} @llvm.vector.deinterleave2.v4f32(<4 x float>)
-declare {<8 x half>, <8 x half>} @llvm.vector.deinterleave2.v16f16(<16 x half>)
-declare {<4 x float>, <4 x float>} @llvm.vector.deinterleave2.v8f32(<8 x float>)
-declare {<2 x double>, <2 x double>} @llvm.vector.deinterleave2.v4f64(<4 x double>)
+define {<2 x float>, <2 x float>, <2 x float>} @vector_deinterleave3_v632_v2f32(<6 x float> %v) {
+; CHECK-LABEL: vector_deinterleave3_v632_v2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 2
+; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v12, v8, 4
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v9, v12
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %res = call {<2 x float>, <2 x float>, <2 x float>} @llvm.vector.deinterleave3.v6f32(<6 x float> %v)
+ ret {<2 x float>, <2 x float>, <2 x float>} %res
+}
+
+
+define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_deinterleave5_v10f16_v2f16(<10 x half> %v) {
+; CHECK-LABEL: vector_deinterleave5_v10f16_v2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 6
+; CHECK-NEXT: vslidedown.vi v11, v8, 4
+; CHECK-NEXT: vslidedown.vi v12, v8, 2
+; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v14, v8, 8
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v11, v10, a1
+; CHECK-NEXT: vslideup.vx v8, v12, a1
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v11, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v9, v14
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %res = call {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @llvm.vector.deinterleave5.v10f16(<10 x half> %v)
+ ret {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} %res
+}
+
+define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @vector_deinterleave7_v7f16_v1f16(<7 x half> %v) {
+; CHECK-LABEL: vector_deinterleave7_v7f16_v1f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v8, 3
+; CHECK-NEXT: vslidedown.vi v10, v8, 2
+; CHECK-NEXT: vslidedown.vi v11, v8, 1
+; CHECK-NEXT: vmv1r.v v12, v8
+; CHECK-NEXT: vslidedown.vi v14, v8, 5
+; CHECK-NEXT: vslidedown.vi v15, v8, 6
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: add a3, a0, a0
+; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v10, v9, a1
+; CHECK-NEXT: vslideup.vx v12, v11, a1
+; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v12, v10, a0
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v13, v8, 4
+; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v13, v14, a1
+; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v13, v15, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs2r.v v12, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vlseg7e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %res = call {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @llvm.vector.deinterleave7.v7f16(<7 x half> %v)
+ ret {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} %res
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index bb71c2973bb57d..81b6de9e662d59 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -1,8 +1,8 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s
-; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s
-; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s
-; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV64
; Integers
@@ -104,11 +104,6 @@ define {<vscale x 4 x i64>, <vscale x 4 x i64>} @vector_deinterleave_nxv4i64_nxv
ret {<vscale x 4 x i64>, <vscale x 4 x i64>} %retval
}
-declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
-declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
define {<vscale x 64 x i1>, <vscale x 64 x i1>} @vector_deinterleave_nxv64i1_nxv128i1(<vscale x 128 x i1> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv64i1_nxv128i1:
@@ -228,11 +223,6 @@ define {<vscale x 8 x i64>, <vscale x 8 x i64>} @vector_deinterleave_nxv8i64_nxv
ret {<vscale x 8 x i64>, <vscale x 8 x i64>} %retval
}
-declare {<vscale x 64 x i1>, <vscale x 64 x i1>} @llvm.vector.deinterleave2.nxv128i1(<vscale x 128 x i1>)
-declare {<vscale x 64 x i8>, <vscale x 64 x i8>} @llvm.vector.deinterleave2.nxv128i8(<vscale x 128 x i8>)
-declare {<vscale x 32 x i16>, <vscale x 32 x i16>} @llvm.vector.deinterleave2.nxv64i16(<vscale x 64 x i16>)
-declare {<vscale x 16 x i32>, <vscale x 16 x i32>} @llvm.vector.deinterleave2.nxv32i32(<vscale x 32 x i32>)
-declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
; Floats
@@ -358,12 +348,6 @@ define {<vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f
ret {<vscale x 2 x double>, <vscale x 2 x double>} %retval
}
-declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
define {<vscale x 32 x bfloat>, <vscale x 32 x bfloat>} @vector_deinterleave_nxv32bf16_nxv64bf16(<vscale x 64 x bfloat> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv32bf16_nxv64bf16:
@@ -461,6 +445,629 @@ define {<vscale x 8 x double>, <vscale x 8 x double>} @vector_deinterleave_nxv8f
ret {<vscale x 8 x double>, <vscale x 8 x double>} %retval
}
-declare {<vscale x 32 x half>, <vscale x 32 x half>} @llvm.vector.deinterleave2.nxv64f16(<vscale x 64 x half>)
-declare {<vscale x 16 x float>, <vscale x 16 x float>} @llvm.vector.deinterleave2.nxv32f32(<vscale x 32 x float>)
-declare {<vscale x 8 x double>, <vscale x 8 x double>} @llvm.vector.deinterleave2.nxv16f64(<vscale x 16 x double>)
+define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv48i1(<vscale x 48 x i1> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv48i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a1, a0, 2
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v8, v0, a1
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v16, v10, 1, v0
+; CHECK-NEXT: srli a0, a0, 1
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v9, v0, a0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v18, v10, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v20, v10, 1, v0
+; CHECK-NEXT: vs8r.v v16, (a1)
+; CHECK-NEXT: vlseg3e8.v v8, (a1)
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: vmsne.vi v8, v10, 0
+; CHECK-NEXT: vmsne.vi v9, v12, 0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave3.nxv48i1(<vscale x 48 x i1> %vec)
+ ret {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} %retval
+}
+
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv48i8(<vscale x 48 x i8> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv48i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT: vlseg3e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave3.nxv48i8(<vscale x 48 x i8> %vec)
+ ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv24i16(<vscale x 24 x i16> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv24i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vlseg3e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave3.nxv24i16(<vscale x 24 x i16> %vec)
+ ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv12i32(<vscale x 12 x i32> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv12i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT: vlseg3e32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> %vec)
+ ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv6i64(<vscale x 6 x i64> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv6i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: vlseg3e64.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> %vec)
+ ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
+define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv80i1(<vscale x 80 x i1> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv80i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vmerge.vim v16, v12, 1, v0
+; CHECK-NEXT: srli a1, a0, 2
+; CHECK-NEXT: srli a2, a0, 1
+; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a1
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: vslidedown.vx v10, v9, a2
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v18, v12, 1, v0
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v9, v9, a0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v20, v12, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v14, v12, 1, v0
+; CHECK-NEXT: vmv1r.v v10, v15
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v12, v12, 1, v0
+; CHECK-NEXT: vmv1r.v v11, v12
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v8, v21
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv1r.v v9, v14
+; CHECK-NEXT: vs8r.v v16, (a0)
+; CHECK-NEXT: vmv1r.v v12, v13
+; CHECK-NEXT: vs8r.v v8, (a1)
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlseg5e8.v v8, (a0)
+; CHECK-NEXT: vlseg5e8.v v14, (a1)
+; CHECK-NEXT: vmv2r.v v20, v8
+; CHECK-NEXT: vmv2r.v v22, v10
+; CHECK-NEXT: vmv1r.v v21, v14
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v20, 0
+; CHECK-NEXT: vmv1r.v v14, v9
+; CHECK-NEXT: vmsne.vi v8, v14, 0
+; CHECK-NEXT: vmv1r.v v23, v16
+; CHECK-NEXT: vmsne.vi v9, v22, 0
+; CHECK-NEXT: vmv1r.v v16, v11
+; CHECK-NEXT: vmsne.vi v10, v16, 0
+; CHECK-NEXT: vmv1r.v v13, v18
+; CHECK-NEXT: vmsne.vi v11, v12, 0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave5.nxv80i1(<vscale x 80 x i1> %vec)
+ ret {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} %retval
+}
+
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv80i8(<vscale x 80 x i8> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv80i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v26, v15
+; CHECK-NEXT: vmv1r.v v27, v16
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv1r.v v25, v14
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv1r.v v28, v17
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg5e8.v v12, (a0)
+; CHECK-NEXT: vlseg5e8.v v18, (a1)
+; CHECK-NEXT: vmv2r.v v8, v12
+; CHECK-NEXT: vmv1r.v v9, v18
+; CHECK-NEXT: vmv1r.v v18, v13
+; CHECK-NEXT: vmv2r.v v12, v14
+; CHECK-NEXT: vmv1r.v v13, v20
+; CHECK-NEXT: vmv1r.v v20, v15
+; CHECK-NEXT: vmv1r.v v17, v22
+; CHECK-NEXT: vmv2r.v v10, v18
+; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave5.nxv80i8(<vscale x 80 x i8> %vec)
+ ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv40i16(<vscale x 40 x i16> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv40i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v26, v15
+; CHECK-NEXT: vmv1r.v v27, v16
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv1r.v v25, v14
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv1r.v v28, v17
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg5e16.v v12, (a0)
+; CHECK-NEXT: vlseg5e16.v v18, (a1)
+; CHECK-NEXT: vmv2r.v v8, v12
+; CHECK-NEXT: vmv1r.v v9, v18
+; CHECK-NEXT: vmv1r.v v18, v13
+; CHECK-NEXT: vmv2r.v v12, v14
+; CHECK-NEXT: vmv1r.v v13, v20
+; CHECK-NEXT: vmv1r.v v20, v15
+; CHECK-NEXT: vmv1r.v v17, v22
+; CHECK-NEXT: vmv2r.v v10, v18
+; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave5.nxv40i16(<vscale x 40 x i16> %vec)
+ ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv20i32(<vscale x 20 x i32> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv20i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v26, v15
+; CHECK-NEXT: vmv1r.v v27, v16
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv1r.v v25, v14
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv1r.v v28, v17
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg5e32.v v12, (a0)
+; CHECK-NEXT: vlseg5e32.v v18, (a1)
+; CHECK-NEXT: vmv2r.v v8, v12
+; CHECK-NEXT: vmv1r.v v9, v18
+; CHECK-NEXT: vmv1r.v v18, v13
+; CHECK-NEXT: vmv2r.v v12, v14
+; CHECK-NEXT: vmv1r.v v13, v20
+; CHECK-NEXT: vmv1r.v v20, v15
+; CHECK-NEXT: vmv1r.v v17, v22
+; CHECK-NEXT: vmv2r.v v10, v18
+; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave5.nxv20i32(<vscale x 20 x i32> %vec)
+ ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv10i64(<vscale x 10 x i64> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv10i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v26, v15
+; CHECK-NEXT: vmv1r.v v27, v16
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv1r.v v25, v14
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv1r.v v28, v17
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg5e64.v v12, (a0)
+; CHECK-NEXT: vlseg5e64.v v18, (a1)
+; CHECK-NEXT: vmv2r.v v8, v12
+; CHECK-NEXT: vmv1r.v v9, v18
+; CHECK-NEXT: vmv1r.v v18, v13
+; CHECK-NEXT: vmv2r.v v12, v14
+; CHECK-NEXT: vmv1r.v v13, v20
+; CHECK-NEXT: vmv1r.v v20, v15
+; CHECK-NEXT: vmv1r.v v17, v22
+; CHECK-NEXT: vmv2r.v v10, v18
+; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave5.nxv10i64(<vscale x 10 x i64> %vec)
+ ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
+define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv112i1(<vscale x 112 x i1> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv112i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: vmerge.vim v16, v12, 1, v0
+; CHECK-NEXT: srli a2, a1, 2
+; CHECK-NEXT: srli a0, a1, 1
+; CHECK-NEXT: srli a3, a1, 3
+; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: vslidedown.vx v10, v9, a0
+; CHECK-NEXT: slli a3, a3, 1
+; CHECK-NEXT: vslidedown.vx v11, v8, a2
+; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v18, v12, 1, v0
+; CHECK-NEXT: sub a1, a1, a3
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v9, v9, a1
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v20, v12, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v22, v12, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v14, v12, 1, v0
+; CHECK-NEXT: vmv1r.v v10, v15
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vmerge.vim v24, v12, 1, v0
+; CHECK-NEXT: vmv1r.v v11, v24
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v8, a0
+; CHECK-NEXT: vmv1r.v v8, v23
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v9, v14
+; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v14, v12, 1, v0
+; CHECK-NEXT: vmv1r.v v12, v25
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv1r.v v13, v14
+; CHECK-NEXT: vs8r.v v16, (a0)
+; CHECK-NEXT: vmv1r.v v14, v15
+; CHECK-NEXT: vs8r.v v8, (a1)
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
+; CHECK-NEXT: vlseg7e8.v v16, (a1)
+; CHECK-NEXT: vmv2r.v v24, v8
+; CHECK-NEXT: vmv2r.v v26, v10
+; CHECK-NEXT: vmv2r.v v28, v12
+; CHECK-NEXT: vmv1r.v v25, v16
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v24, 0
+; CHECK-NEXT: vmv1r.v v16, v9
+; CHECK-NEXT: vmsne.vi v8, v16, 0
+; CHECK-NEXT: vmv1r.v v27, v18
+; CHECK-NEXT: vmsne.vi v9, v26, 0
+; CHECK-NEXT: vmv1r.v v18, v11
+; CHECK-NEXT: vmsne.vi v10, v18, 0
+; CHECK-NEXT: vmv1r.v v29, v20
+; CHECK-NEXT: vmsne.vi v11, v28, 0
+; CHECK-NEXT: vmv1r.v v20, v13
+; CHECK-NEXT: vmsne.vi v12, v20, 0
+; CHECK-NEXT: vmv1r.v v15, v22
+; CHECK-NEXT: vmsne.vi v13, v14, 0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave7.nxv112i1(<vscale x 112 x i1> %vec)
+ ret {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} %retval
+}
+
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv112i8(<vscale x 112 x i8> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv112i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v30, v21
+; CHECK-NEXT: vmv1r.v v28, v19
+; CHECK-NEXT: vmv1r.v v29, v20
+; CHECK-NEXT: vmv1r.v v26, v17
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v27, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv1r.v v24, v15
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv1r.v v25, v16
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg7e8.v v14, (a0)
+; CHECK-NEXT: vlseg7e8.v v22, (a1)
+; CHECK-NEXT: vmv2r.v v8, v14
+; CHECK-NEXT: vmv1r.v v9, v22
+; CHECK-NEXT: vmv1r.v v22, v15
+; CHECK-NEXT: vmv2r.v v12, v16
+; CHECK-NEXT: vmv1r.v v13, v24
+; CHECK-NEXT: vmv1r.v v24, v17
+; CHECK-NEXT: vmv2r.v v16, v18
+; CHECK-NEXT: vmv1r.v v17, v26
+; CHECK-NEXT: vmv1r.v v26, v19
+; CHECK-NEXT: vmv1r.v v21, v28
+; CHECK-NEXT: vmv2r.v v10, v22
+; CHECK-NEXT: vmv2r.v v14, v24
+; CHECK-NEXT: vmv2r.v v18, v26
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave7.nxv112i8(<vscale x 112 x i8> %vec)
+ ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv56i16(<vscale x 56 x i16> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv56i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v30, v21
+; CHECK-NEXT: vmv1r.v v28, v19
+; CHECK-NEXT: vmv1r.v v29, v20
+; CHECK-NEXT: vmv1r.v v26, v17
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v27, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv1r.v v24, v15
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv1r.v v25, v16
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg7e16.v v14, (a0)
+; CHECK-NEXT: vlseg7e16.v v22, (a1)
+; CHECK-NEXT: vmv2r.v v8, v14
+; CHECK-NEXT: vmv1r.v v9, v22
+; CHECK-NEXT: vmv1r.v v22, v15
+; CHECK-NEXT: vmv2r.v v12, v16
+; CHECK-NEXT: vmv1r.v v13, v24
+; CHECK-NEXT: vmv1r.v v24, v17
+; CHECK-NEXT: vmv2r.v v16, v18
+; CHECK-NEXT: vmv1r.v v17, v26
+; CHECK-NEXT: vmv1r.v v26, v19
+; CHECK-NEXT: vmv1r.v v21, v28
+; CHECK-NEXT: vmv2r.v v10, v22
+; CHECK-NEXT: vmv2r.v v14, v24
+; CHECK-NEXT: vmv2r.v v18, v26
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave7.nxv56i16(<vscale x 56 x i16> %vec)
+ ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv28i32(<vscale x 28 x i32> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv28i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v30, v21
+; CHECK-NEXT: vmv1r.v v28, v19
+; CHECK-NEXT: vmv1r.v v29, v20
+; CHECK-NEXT: vmv1r.v v26, v17
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v27, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv1r.v v24, v15
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv1r.v v25, v16
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg7e32.v v14, (a0)
+; CHECK-NEXT: vlseg7e32.v v22, (a1)
+; CHECK-NEXT: vmv2r.v v8, v14
+; CHECK-NEXT: vmv1r.v v9, v22
+; CHECK-NEXT: vmv1r.v v22, v15
+; CHECK-NEXT: vmv2r.v v12, v16
+; CHECK-NEXT: vmv1r.v v13, v24
+; CHECK-NEXT: vmv1r.v v24, v17
+; CHECK-NEXT: vmv2r.v v16, v18
+; CHECK-NEXT: vmv1r.v v17, v26
+; CHECK-NEXT: vmv1r.v v26, v19
+; CHECK-NEXT: vmv1r.v v21, v28
+; CHECK-NEXT: vmv2r.v v10, v22
+; CHECK-NEXT: vmv2r.v v14, v24
+; CHECK-NEXT: vmv2r.v v18, v26
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave7.nxv28i32(<vscale x 28 x i32> %vec)
+ ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv14i64(<vscale x 14 x i64> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv14i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v30, v21
+; CHECK-NEXT: vmv1r.v v28, v19
+; CHECK-NEXT: vmv1r.v v29, v20
+; CHECK-NEXT: vmv1r.v v26, v17
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v27, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv1r.v v24, v15
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv1r.v v25, v16
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg7e64.v v14, (a0)
+; CHECK-NEXT: vlseg7e64.v v22, (a1)
+; CHECK-NEXT: vmv2r.v v8, v14
+; CHECK-NEXT: vmv1r.v v9, v22
+; CHECK-NEXT: vmv1r.v v22, v15
+; CHECK-NEXT: vmv2r.v v12, v16
+; CHECK-NEXT: vmv1r.v v13, v24
+; CHECK-NEXT: vmv1r.v v24, v17
+; CHECK-NEXT: vmv2r.v v16, v18
+; CHECK-NEXT: vmv1r.v v17, v26
+; CHECK-NEXT: vmv1r.v v26, v19
+; CHECK-NEXT: vmv1r.v v21, v28
+; CHECK-NEXT: vmv2r.v v10, v22
+; CHECK-NEXT: vmv2r.v v14, v24
+; CHECK-NEXT: vmv2r.v v18, v26
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave7.nxv14i64(<vscale x 14 x i64> %vec)
+ ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
index 08aa02c7e869a1..1e4cb064801635 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
@@ -122,10 +122,237 @@ define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) {
ret <4 x i64> %res
}
-declare <32 x i1> @llvm.vector.interleave2.v32i1(<16 x i1>, <16 x i1>)
-declare <16 x i16> @llvm.vector.interleave2.v16i16(<8 x i16>, <8 x i16>)
-declare <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32>, <4 x i32>)
-declare <4 x i64> @llvm.vector.interleave2.v4i64(<2 x i64>, <2 x i64>)
+define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) {
+; CHECK-LABEL: vector_interleave3_v6i32_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 1
+; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsseg3e32.v v8, (a0)
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: vle32.v v9, (a2)
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: vle32.v v10, (a1)
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 2
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave3_v6i32_v2i32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: .cfi_def_cfa_offset 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a1, a1, 1
+; ZVBB-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vsseg3e32.v v8, (a0)
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: vle32.v v9, (a2)
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: add a1, a2, a1
+; ZVBB-NEXT: vle32.v v10, (a1)
+; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v9, 2
+; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v10, 4
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: .cfi_def_cfa sp, 16
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: .cfi_def_cfa_offset 0
+; ZVBB-NEXT: ret
+ %res = call <6 x i32> @llvm.vector.interleave3.v6i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c)
+ ret <6 x i32> %res
+}
+
+
+define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e) {
+; CHECK-LABEL: vector_interleave5_v10i16_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vsseg5e16.v v8, (a0)
+; CHECK-NEXT: add a4, a3, a1
+; CHECK-NEXT: vle16.v v9, (a2)
+; CHECK-NEXT: vle16.v v10, (a4)
+; CHECK-NEXT: vle16.v v11, (a3)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: add a1, a4, a1
+; CHECK-NEXT: vle16.v v12, (a1)
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v11, v10, 2
+; CHECK-NEXT: vslideup.vi v8, v9, 2
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v11, 4
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v12, 8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave5_v10i16_v2i16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: .cfi_def_cfa_offset 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e16, mf4, ta, ma
+; ZVBB-NEXT: vsseg5e16.v v8, (a0)
+; ZVBB-NEXT: add a4, a3, a1
+; ZVBB-NEXT: vle16.v v9, (a2)
+; ZVBB-NEXT: vle16.v v10, (a4)
+; ZVBB-NEXT: vle16.v v11, (a3)
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: add a1, a4, a1
+; ZVBB-NEXT: vle16.v v12, (a1)
+; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVBB-NEXT: vslideup.vi v11, v10, 2
+; ZVBB-NEXT: vslideup.vi v8, v9, 2
+; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v11, 4
+; ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v12, 8
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: .cfi_def_cfa sp, 16
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: .cfi_def_cfa_offset 0
+; ZVBB-NEXT: ret
+ %res = call <10 x i16> @llvm.vector.interleave5.v10i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e)
+ ret <10 x i16> %res
+}
+
+define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g) {
+; CHECK-LABEL: vector_interleave7_v14i8_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 3
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: add a4, a3, a1
+; CHECK-NEXT: vsetvli a5, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vsseg7e8.v v8, (a0)
+; CHECK-NEXT: vle8.v v9, (a4)
+; CHECK-NEXT: add a4, a4, a1
+; CHECK-NEXT: vle8.v v10, (a2)
+; CHECK-NEXT: add a2, a4, a1
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: vle8.v v11, (a2)
+; CHECK-NEXT: vle8.v v12, (a4)
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v13, (a1)
+; CHECK-NEXT: vle8.v v14, (a3)
+; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vi v12, v11, 2
+; CHECK-NEXT: vslideup.vi v8, v10, 2
+; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vi v12, v13, 4
+; CHECK-NEXT: vslideup.vi v8, v14, 4
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 6
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v12, 8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave7_v14i8_v2i8:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: .cfi_def_cfa_offset 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a1, a1, 3
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: add a4, a3, a1
+; ZVBB-NEXT: vsetvli a5, zero, e8, mf8, ta, ma
+; ZVBB-NEXT: vsseg7e8.v v8, (a0)
+; ZVBB-NEXT: vle8.v v9, (a4)
+; ZVBB-NEXT: add a4, a4, a1
+; ZVBB-NEXT: vle8.v v10, (a2)
+; ZVBB-NEXT: add a2, a4, a1
+; ZVBB-NEXT: add a1, a2, a1
+; ZVBB-NEXT: vle8.v v11, (a2)
+; ZVBB-NEXT: vle8.v v12, (a4)
+; ZVBB-NEXT: vle8.v v8, (a0)
+; ZVBB-NEXT: vle8.v v13, (a1)
+; ZVBB-NEXT: vle8.v v14, (a3)
+; ZVBB-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
+; ZVBB-NEXT: vslideup.vi v12, v11, 2
+; ZVBB-NEXT: vslideup.vi v8, v10, 2
+; ZVBB-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
+; ZVBB-NEXT: vslideup.vi v12, v13, 4
+; ZVBB-NEXT: vslideup.vi v8, v14, 4
+; ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v9, 6
+; ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v12, 8
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: .cfi_def_cfa sp, 16
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: .cfi_def_cfa_offset 0
+; ZVBB-NEXT: ret
+ %res = call <14 x i8> @llvm.vector.interleave7.v14i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g)
+ ret <14 x i8> %res
+}
+
; Floats
@@ -270,13 +497,240 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double>
ret <4 x double> %res
}
+define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; CHECK-LABEL: vector_interleave3_v632_v2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 1
+; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsseg3e32.v v8, (a0)
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: vle32.v v9, (a2)
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: vle32.v v10, (a1)
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 2
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave3_v632_v2f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: .cfi_def_cfa_offset 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a1, a1, 1
+; ZVBB-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vsseg3e32.v v8, (a0)
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: vle32.v v9, (a2)
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: add a1, a2, a1
+; ZVBB-NEXT: vle32.v v10, (a1)
+; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v9, 2
+; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v10, 4
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: .cfi_def_cfa sp, 16
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: .cfi_def_cfa_offset 0
+; ZVBB-NEXT: ret
+ %res = call <6 x float> @llvm.vector.interleave3.v6f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
+ ret <6 x float> %res
+}
+
-declare <4 x half> @llvm.vector.interleave2.v4f16(<2 x half>, <2 x half>)
-declare <8 x half> @llvm.vector.interleave2.v8f16(<4 x half>, <4 x half>)
-declare <4 x float> @llvm.vector.interleave2.v4f32(<2 x float>, <2 x float>)
-declare <16 x half> @llvm.vector.interleave2.v16f16(<8 x half>, <8 x half>)
-declare <8 x float> @llvm.vector.interleave2.v8f32(<4 x float>, <4 x float>)
-declare <4 x double> @llvm.vector.interleave2.v4f64(<2 x double>, <2 x double>)
+define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e) {
+; CHECK-LABEL: vector_interleave5_v10f16_v2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vsseg5e16.v v8, (a0)
+; CHECK-NEXT: add a4, a3, a1
+; CHECK-NEXT: vle16.v v9, (a2)
+; CHECK-NEXT: vle16.v v10, (a4)
+; CHECK-NEXT: vle16.v v11, (a3)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: add a1, a4, a1
+; CHECK-NEXT: vle16.v v12, (a1)
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v11, v10, 2
+; CHECK-NEXT: vslideup.vi v8, v9, 2
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v11, 4
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v12, 8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave5_v10f16_v2f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: .cfi_def_cfa_offset 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e16, mf4, ta, ma
+; ZVBB-NEXT: vsseg5e16.v v8, (a0)
+; ZVBB-NEXT: add a4, a3, a1
+; ZVBB-NEXT: vle16.v v9, (a2)
+; ZVBB-NEXT: vle16.v v10, (a4)
+; ZVBB-NEXT: vle16.v v11, (a3)
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: add a1, a4, a1
+; ZVBB-NEXT: vle16.v v12, (a1)
+; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVBB-NEXT: vslideup.vi v11, v10, 2
+; ZVBB-NEXT: vslideup.vi v8, v9, 2
+; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v11, 4
+; ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v12, 8
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: .cfi_def_cfa sp, 16
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: .cfi_def_cfa_offset 0
+; ZVBB-NEXT: ret
+ %res = call <10 x half> @llvm.vector.interleave5.v10f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e)
+ ret <10 x half> %res
+}
+
+define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g) {
+; CHECK-LABEL: vector_interleave7_v7f16_v1f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: add a4, a3, a1
+; CHECK-NEXT: vsetvli a5, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vsseg7e16.v v8, (a0)
+; CHECK-NEXT: vle16.v v9, (a4)
+; CHECK-NEXT: add a4, a4, a1
+; CHECK-NEXT: vle16.v v10, (a2)
+; CHECK-NEXT: add a2, a4, a1
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: vle16.v v11, (a2)
+; CHECK-NEXT: vle16.v v12, (a4)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vle16.v v13, (a1)
+; CHECK-NEXT: vle16.v v14, (a3)
+; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma
+; CHECK-NEXT: vslideup.vi v12, v11, 1
+; CHECK-NEXT: vslideup.vi v8, v10, 1
+; CHECK-NEXT: vsetivli zero, 3, e16, mf2, tu, ma
+; CHECK-NEXT: vslideup.vi v12, v13, 2
+; CHECK-NEXT: vslideup.vi v8, v14, 2
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 3
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v12, 4
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave7_v7f16_v1f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: .cfi_def_cfa_offset 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: add a4, a3, a1
+; ZVBB-NEXT: vsetvli a5, zero, e16, mf4, ta, ma
+; ZVBB-NEXT: vsseg7e16.v v8, (a0)
+; ZVBB-NEXT: vle16.v v9, (a4)
+; ZVBB-NEXT: add a4, a4, a1
+; ZVBB-NEXT: vle16.v v10, (a2)
+; ZVBB-NEXT: add a2, a4, a1
+; ZVBB-NEXT: add a1, a2, a1
+; ZVBB-NEXT: vle16.v v11, (a2)
+; ZVBB-NEXT: vle16.v v12, (a4)
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: vle16.v v13, (a1)
+; ZVBB-NEXT: vle16.v v14, (a3)
+; ZVBB-NEXT: vsetivli zero, 2, e16, mf2, tu, ma
+; ZVBB-NEXT: vslideup.vi v12, v11, 1
+; ZVBB-NEXT: vslideup.vi v8, v10, 1
+; ZVBB-NEXT: vsetivli zero, 3, e16, mf2, tu, ma
+; ZVBB-NEXT: vslideup.vi v12, v13, 2
+; ZVBB-NEXT: vslideup.vi v8, v14, 2
+; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v9, 3
+; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v12, 4
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: .cfi_def_cfa sp, 16
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: .cfi_def_cfa_offset 0
+; ZVBB-NEXT: ret
+ %res = call <7 x half> @llvm.vector.interleave7.v7f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g)
+ ret <7 x half> %res
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; RV32: {{.*}}
; RV64: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index 53929d17bb5a1d..6aa62c22569256 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh,+zvfbfmin | FileCheck %s
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh,+zvfbfmin | FileCheck %s
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfhmin,+zvfbfmin | FileCheck %s
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfhmin,+zvfbfmin | FileCheck %s
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvbb,+zvfh,+zvfbfmin | FileCheck %s --check-prefix=ZVBB
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvbb,+zvfh,+zvfbfmin | FileCheck %s --check-prefix=ZVBB
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc < %s -mtriple=riscv32 -mattr=+m,+v,+zvbb,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=ZVBB,ZVBB-RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+zvbb,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=ZVBB,ZVBB-RV64
; Integers
@@ -151,11 +151,6 @@ define <vscale x 4 x i64> @vector_interleave_nxv4i64_nxv2i64(<vscale x 2 x i64>
ret <vscale x 4 x i64> %res
}
-declare <vscale x 32 x i1> @llvm.vector.interleave2.nxv32i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
-declare <vscale x 32 x i8> @llvm.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
-declare <vscale x 16 x i16> @llvm.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
-declare <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
-declare <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv64i1(<vscale x 64 x i1> %a, <vscale x 64 x i1> %b) {
; CHECK-LABEL: vector_interleave_nxv128i1_nxv64i1:
@@ -324,11 +319,6 @@ define <vscale x 16 x i64> @vector_interleave_nxv16i64_nxv8i64(<vscale x 8 x i64
ret <vscale x 16 x i64> %res
}
-declare <vscale x 128 x i1> @llvm.vector.interleave2.nxv128i1(<vscale x 64 x i1>, <vscale x 64 x i1>)
-declare <vscale x 128 x i8> @llvm.vector.interleave2.nxv128i8(<vscale x 64 x i8>, <vscale x 64 x i8>)
-declare <vscale x 64 x i16> @llvm.vector.interleave2.nxv64i16(<vscale x 32 x i16>, <vscale x 32 x i16>)
-declare <vscale x 32 x i32> @llvm.vector.interleave2.nxv32i32(<vscale x 16 x i32>, <vscale x 16 x i32>)
-declare <vscale x 16 x i64> @llvm.vector.interleave2.nxv16i64(<vscale x 8 x i64>, <vscale x 8 x i64>)
; Floats
@@ -565,12 +555,6 @@ define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x do
}
-declare <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
-declare <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
-declare <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
-declare <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
-declare <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
-declare <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
define <vscale x 64 x bfloat> @vector_interleave_nxv64bf16_nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) {
; CHECK-LABEL: vector_interleave_nxv64bf16_nxv32bf16:
@@ -734,6 +718,2856 @@ define <vscale x 8 x i32> @vector_interleave_nxv8i32_nxv4i32_poison2(<vscale x 4
ret <vscale x 8 x i32> %res
}
-declare <vscale x 64 x half> @llvm.vector.interleave2.nxv64f16(<vscale x 32 x half>, <vscale x 32 x half>)
-declare <vscale x 32 x float> @llvm.vector.interleave2.nxv32f32(<vscale x 16 x float>, <vscale x 16 x float>)
-declare <vscale x 16 x double> @llvm.vector.interleave2.nxv16f64(<vscale x 8 x double>, <vscale x 8 x double>)
+define <vscale x 48 x i1> @vector_interleave_nxv48i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c) nounwind {
+; CHECK-LABEL: vector_interleave_nxv48i1_nxv16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: vmerge.vim v16, v12, 1, v0
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v14, v12, 1, v0
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v18, v12, 1, v0
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: vsseg3e8.v v14, (a0)
+; CHECK-NEXT: vl2r.v v8, (a2)
+; CHECK-NEXT: srli a2, a1, 2
+; CHECK-NEXT: srli a1, a1, 1
+; CHECK-NEXT: vl2r.v v10, (a3)
+; CHECK-NEXT: vl2r.v v12, (a0)
+; CHECK-NEXT: add a0, a2, a2
+; CHECK-NEXT: vmsne.vi v14, v8, 0
+; CHECK-NEXT: vmsne.vi v8, v10, 0
+; CHECK-NEXT: vmsne.vi v0, v12, 0
+; CHECK-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v0, v8, a2
+; CHECK-NEXT: add a0, a1, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v0, v14, a1
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv48i1_nxv16i1:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT: vmv1r.v v10, v0
+; ZVBB-NEXT: vmv1r.v v0, v8
+; ZVBB-NEXT: vmv.v.i v12, 0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: vmerge.vim v16, v12, 1, v0
+; ZVBB-NEXT: slli a2, a1, 1
+; ZVBB-NEXT: vmv1r.v v0, v10
+; ZVBB-NEXT: vmerge.vim v14, v12, 1, v0
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: vmv1r.v v0, v9
+; ZVBB-NEXT: vmerge.vim v18, v12, 1, v0
+; ZVBB-NEXT: add a2, a3, a2
+; ZVBB-NEXT: vsseg3e8.v v14, (a0)
+; ZVBB-NEXT: vl2r.v v8, (a2)
+; ZVBB-NEXT: srli a2, a1, 2
+; ZVBB-NEXT: srli a1, a1, 1
+; ZVBB-NEXT: vl2r.v v10, (a3)
+; ZVBB-NEXT: vl2r.v v12, (a0)
+; ZVBB-NEXT: add a0, a2, a2
+; ZVBB-NEXT: vmsne.vi v14, v8, 0
+; ZVBB-NEXT: vmsne.vi v8, v10, 0
+; ZVBB-NEXT: vmsne.vi v0, v12, 0
+; ZVBB-NEXT: vsetvli zero, a0, e8, mf2, ta, ma
+; ZVBB-NEXT: vslideup.vx v0, v8, a2
+; ZVBB-NEXT: add a0, a1, a1
+; ZVBB-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v0, v14, a1
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 48 x i1> @llvm.vector.interleave3.nxv48i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c)
+ ret <vscale x 48 x i1> %res
+}
+
+
+define <vscale x 48 x i8> @vector_interleave_nxv48i8_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) nounwind {
+; CHECK-LABEL: vector_interleave_nxv48i8_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT: vsseg3e8.v v8, (a0)
+; CHECK-NEXT: vl2r.v v8, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2r.v v10, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2r.v v12, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv48i8_nxv16i8:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 1
+; ZVBB-NEXT: vsetvli a2, zero, e8, m2, ta, ma
+; ZVBB-NEXT: vsseg3e8.v v8, (a0)
+; ZVBB-NEXT: vl2r.v v8, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2r.v v10, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2r.v v12, (a0)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 48 x i8> @llvm.vector.interleave3.nxv48i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c)
+ ret <vscale x 48 x i8> %res
+}
+
+
+define <vscale x 24 x i16> @vector_interleave_nxv24i16_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c) nounwind {
+; CHECK-LABEL: vector_interleave_nxv24i16_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsseg3e16.v v8, (a0)
+; CHECK-NEXT: vl2re16.v v8, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2re16.v v10, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2re16.v v12, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv24i16_nxv8i16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 1
+; ZVBB-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; ZVBB-NEXT: vsseg3e16.v v8, (a0)
+; ZVBB-NEXT: vl2re16.v v8, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2re16.v v10, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2re16.v v12, (a0)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 24 x i16> @llvm.vector.interleave3.nxv24i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c)
+ ret <vscale x 24 x i16> %res
+}
+
+
+define <vscale x 12 x i32> @vector_interleave_nxv12i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) nounwind {
+; CHECK-LABEL: vector_interleave_nxv12i32_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: vsetvli a2, zero, e32, m2, ta, ma
+; CHECK-NEXT: vsseg3e32.v v8, (a0)
+; CHECK-NEXT: vl2re32.v v8, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2re32.v v10, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2re32.v v12, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv12i32_nxv4i32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 1
+; ZVBB-NEXT: vsetvli a2, zero, e32, m2, ta, ma
+; ZVBB-NEXT: vsseg3e32.v v8, (a0)
+; ZVBB-NEXT: vl2re32.v v8, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2re32.v v10, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2re32.v v12, (a0)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c)
+ ret <vscale x 12 x i32> %res
+}
+
+
+define <vscale x 6 x i64> @vector_interleave_nxv6i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) nounwind {
+; CHECK-LABEL: vector_interleave_nxv6i64_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; CHECK-NEXT: vsseg3e64.v v8, (a0)
+; CHECK-NEXT: vl2re64.v v8, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2re64.v v10, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2re64.v v12, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv6i64_nxv2i64:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 1
+; ZVBB-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; ZVBB-NEXT: vsseg3e64.v v8, (a0)
+; ZVBB-NEXT: vl2re64.v v8, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2re64.v v10, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2re64.v v12, (a0)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c)
+ ret <vscale x 6 x i64> %res
+}
+
+define <vscale x 80 x i1> @vector_interleave_nxv80i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e) nounwind {
+; CHECK-LABEL: vector_interleave_nxv80i1_nxv16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 10
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: addi a4, sp, 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 2
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: vmerge.vim v14, v12, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v18, v12, 1, v0
+; CHECK-NEXT: add a2, a4, a1
+; CHECK-NEXT: srli a3, a1, 2
+; CHECK-NEXT: vmv2r.v v20, v14
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v16, v12, 1, v0
+; CHECK-NEXT: vmv1r.v v21, v18
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v8, v12, 1, v0
+; CHECK-NEXT: vmv1r.v v22, v16
+; CHECK-NEXT: vmv1r.v v16, v19
+; CHECK-NEXT: add a5, a2, a1
+; CHECK-NEXT: vmv1r.v v23, v8
+; CHECK-NEXT: vmv1r.v v18, v9
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vmerge.vim v24, v12, 1, v0
+; CHECK-NEXT: vsetvli a6, zero, e8, m1, ta, ma
+; CHECK-NEXT: vsseg5e8.v v20, (a4)
+; CHECK-NEXT: vmv1r.v v19, v25
+; CHECK-NEXT: vsseg5e8.v v15, (a0)
+; CHECK-NEXT: vl1r.v v8, (a5)
+; CHECK-NEXT: add a5, a5, a1
+; CHECK-NEXT: vl1r.v v10, (a4)
+; CHECK-NEXT: add a4, a5, a1
+; CHECK-NEXT: vl1r.v v12, (a4)
+; CHECK-NEXT: add a4, a0, a1
+; CHECK-NEXT: vl1r.v v14, (a4)
+; CHECK-NEXT: add a4, a4, a1
+; CHECK-NEXT: vl1r.v v9, (a5)
+; CHECK-NEXT: add a5, a4, a1
+; CHECK-NEXT: vl1r.v v16, (a5)
+; CHECK-NEXT: add a5, a5, a1
+; CHECK-NEXT: srli a1, a1, 1
+; CHECK-NEXT: vl1r.v v11, (a2)
+; CHECK-NEXT: add a2, a3, a3
+; CHECK-NEXT: vl1r.v v15, (a4)
+; CHECK-NEXT: add a4, a1, a1
+; CHECK-NEXT: vl1r.v v13, (a0)
+; CHECK-NEXT: vl1r.v v17, (a5)
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmsne.vi v18, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v10, 0
+; CHECK-NEXT: vmsne.vi v8, v14, 0
+; CHECK-NEXT: vmsne.vi v9, v12, 0
+; CHECK-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v0, v18, a3
+; CHECK-NEXT: vslideup.vx v9, v8, a3
+; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v0, v9, a1
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmsne.vi v8, v16, 0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 10
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv80i1_nxv16i1:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 10
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT: vmv.v.i v12, 0
+; ZVBB-NEXT: addi a4, sp, 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 2
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: add a0, sp, a0
+; ZVBB-NEXT: addi a0, a0, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: vmerge.vim v14, v12, 1, v0
+; ZVBB-NEXT: vmv1r.v v0, v8
+; ZVBB-NEXT: vmerge.vim v18, v12, 1, v0
+; ZVBB-NEXT: add a2, a4, a1
+; ZVBB-NEXT: srli a3, a1, 2
+; ZVBB-NEXT: vmv2r.v v20, v14
+; ZVBB-NEXT: vmv1r.v v0, v9
+; ZVBB-NEXT: vmerge.vim v16, v12, 1, v0
+; ZVBB-NEXT: vmv1r.v v21, v18
+; ZVBB-NEXT: vmv1r.v v0, v10
+; ZVBB-NEXT: vmerge.vim v8, v12, 1, v0
+; ZVBB-NEXT: vmv1r.v v22, v16
+; ZVBB-NEXT: vmv1r.v v16, v19
+; ZVBB-NEXT: add a5, a2, a1
+; ZVBB-NEXT: vmv1r.v v23, v8
+; ZVBB-NEXT: vmv1r.v v18, v9
+; ZVBB-NEXT: vmv1r.v v0, v11
+; ZVBB-NEXT: vmerge.vim v24, v12, 1, v0
+; ZVBB-NEXT: vsetvli a6, zero, e8, m1, ta, ma
+; ZVBB-NEXT: vsseg5e8.v v20, (a4)
+; ZVBB-NEXT: vmv1r.v v19, v25
+; ZVBB-NEXT: vsseg5e8.v v15, (a0)
+; ZVBB-NEXT: vl1r.v v8, (a5)
+; ZVBB-NEXT: add a5, a5, a1
+; ZVBB-NEXT: vl1r.v v10, (a4)
+; ZVBB-NEXT: add a4, a5, a1
+; ZVBB-NEXT: vl1r.v v12, (a4)
+; ZVBB-NEXT: add a4, a0, a1
+; ZVBB-NEXT: vl1r.v v14, (a4)
+; ZVBB-NEXT: add a4, a4, a1
+; ZVBB-NEXT: vl1r.v v9, (a5)
+; ZVBB-NEXT: add a5, a4, a1
+; ZVBB-NEXT: vl1r.v v16, (a5)
+; ZVBB-NEXT: add a5, a5, a1
+; ZVBB-NEXT: srli a1, a1, 1
+; ZVBB-NEXT: vl1r.v v11, (a2)
+; ZVBB-NEXT: add a2, a3, a3
+; ZVBB-NEXT: vl1r.v v15, (a4)
+; ZVBB-NEXT: add a4, a1, a1
+; ZVBB-NEXT: vl1r.v v13, (a0)
+; ZVBB-NEXT: vl1r.v v17, (a5)
+; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT: vmsne.vi v18, v8, 0
+; ZVBB-NEXT: vmsne.vi v0, v10, 0
+; ZVBB-NEXT: vmsne.vi v8, v14, 0
+; ZVBB-NEXT: vmsne.vi v9, v12, 0
+; ZVBB-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
+; ZVBB-NEXT: vslideup.vx v0, v18, a3
+; ZVBB-NEXT: vslideup.vx v9, v8, a3
+; ZVBB-NEXT: vsetvli zero, a4, e8, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v0, v9, a1
+; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT: vmsne.vi v8, v16, 0
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 10
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 80 x i1> @llvm.vector.interleave5.nxv80i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e)
+ ret <vscale x 80 x i1> %res
+}
+
+
+define <vscale x 80 x i8> @vector_interleave_nxv80i8_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv80i8_nxv16i8:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 28
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; RV32-NEXT: vmv2r.v v20, v16
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv2r.v v18, v12
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv2r.v v16, v8
+; RV32-NEXT: vmv2r.v v22, v16
+; RV32-NEXT: vmv2r.v v24, v18
+; RV32-NEXT: vmv1r.v v26, v20
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v23, v10
+; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: add a5, a4, a2
+; RV32-NEXT: vmv1r.v v25, v14
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vmv1r.v v18, v11
+; RV32-NEXT: vsseg5e8.v v22, (a0)
+; RV32-NEXT: vmv1r.v v20, v15
+; RV32-NEXT: vsseg5e8.v v17, (a1)
+; RV32-NEXT: vl1r.v v16, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1r.v v17, (a6)
+; RV32-NEXT: add a6, a3, a2
+; RV32-NEXT: vl1r.v v10, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1r.v v11, (a6)
+; RV32-NEXT: vl1r.v v8, (a0)
+; RV32-NEXT: vl1r.v v9, (a3)
+; RV32-NEXT: vl1r.v v14, (a4)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 10
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1r.v v15, (a5)
+; RV32-NEXT: vl1r.v v12, (a6)
+; RV32-NEXT: vl1r.v v13, (a1)
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a0, a2
+; RV32-NEXT: vs2r.v v16, (a2)
+; RV32-NEXT: vs8r.v v8, (a0)
+; RV32-NEXT: vl8r.v v16, (a2)
+; RV32-NEXT: vl8r.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_nxv80i8_nxv16i8:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 28
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; RV64-NEXT: vmv2r.v v20, v16
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv2r.v v18, v12
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 2
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv2r.v v16, v8
+; RV64-NEXT: vmv2r.v v22, v16
+; RV64-NEXT: vmv2r.v v24, v18
+; RV64-NEXT: vmv1r.v v26, v20
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v23, v10
+; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: add a5, a4, a2
+; RV64-NEXT: vmv1r.v v25, v14
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vmv1r.v v18, v11
+; RV64-NEXT: vsseg5e8.v v22, (a0)
+; RV64-NEXT: vmv1r.v v20, v15
+; RV64-NEXT: vsseg5e8.v v17, (a1)
+; RV64-NEXT: vl1r.v v16, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1r.v v17, (a6)
+; RV64-NEXT: add a6, a3, a2
+; RV64-NEXT: vl1r.v v10, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1r.v v11, (a6)
+; RV64-NEXT: vl1r.v v8, (a0)
+; RV64-NEXT: vl1r.v v9, (a3)
+; RV64-NEXT: vl1r.v v14, (a4)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 10
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1r.v v15, (a5)
+; RV64-NEXT: vl1r.v v12, (a6)
+; RV64-NEXT: vl1r.v v13, (a1)
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: vs2r.v v16, (a2)
+; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vl8r.v v16, (a2)
+; RV64-NEXT: vl8r.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv80i8_nxv16i8:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a1, 28
+; ZVBB-RV32-NEXT: mul a0, a0, a1
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v20, v16
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv2r.v v18, v12
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: slli a2, a1, 2
+; ZVBB-RV32-NEXT: add a1, a2, a1
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv2r.v v16, v8
+; ZVBB-RV32-NEXT: vmv2r.v v22, v16
+; ZVBB-RV32-NEXT: vmv2r.v v24, v18
+; ZVBB-RV32-NEXT: vmv1r.v v26, v20
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v23, v10
+; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: add a5, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v25, v14
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vmv1r.v v18, v11
+; ZVBB-RV32-NEXT: vsseg5e8.v v22, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v20, v15
+; ZVBB-RV32-NEXT: vsseg5e8.v v17, (a1)
+; ZVBB-RV32-NEXT: vl1r.v v16, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1r.v v17, (a6)
+; ZVBB-RV32-NEXT: add a6, a3, a2
+; ZVBB-RV32-NEXT: vl1r.v v10, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1r.v v11, (a6)
+; ZVBB-RV32-NEXT: vl1r.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1r.v v9, (a3)
+; ZVBB-RV32-NEXT: vl1r.v v14, (a4)
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a3, 10
+; ZVBB-RV32-NEXT: mul a0, a0, a3
+; ZVBB-RV32-NEXT: add a0, sp, a0
+; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1r.v v15, (a5)
+; ZVBB-RV32-NEXT: vl1r.v v12, (a6)
+; ZVBB-RV32-NEXT: vl1r.v v13, (a1)
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a0, a2
+; ZVBB-RV32-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8r.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8r.v v8, (a0)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
+;
+; ZVBB-RV64-LABEL: vector_interleave_nxv80i8_nxv16i8:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a1, 28
+; ZVBB-RV64-NEXT: mul a0, a0, a1
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v20, v16
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv2r.v v18, v12
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: slli a2, a1, 2
+; ZVBB-RV64-NEXT: add a1, a2, a1
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv2r.v v16, v8
+; ZVBB-RV64-NEXT: vmv2r.v v22, v16
+; ZVBB-RV64-NEXT: vmv2r.v v24, v18
+; ZVBB-RV64-NEXT: vmv1r.v v26, v20
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v23, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: add a5, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v25, v14
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vmv1r.v v18, v11
+; ZVBB-RV64-NEXT: vsseg5e8.v v22, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v20, v15
+; ZVBB-RV64-NEXT: vsseg5e8.v v17, (a1)
+; ZVBB-RV64-NEXT: vl1r.v v16, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1r.v v17, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1r.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1r.v v11, (a6)
+; ZVBB-RV64-NEXT: vl1r.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1r.v v9, (a3)
+; ZVBB-RV64-NEXT: vl1r.v v14, (a4)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 10
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1r.v v15, (a5)
+; ZVBB-RV64-NEXT: vl1r.v v12, (a6)
+; ZVBB-RV64-NEXT: vl1r.v v13, (a1)
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8r.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8r.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
+ %res = call <vscale x 80 x i8> @llvm.vector.interleave5.nxv80i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e)
+ ret <vscale x 80 x i8> %res
+}
+
+
+define <vscale x 40 x i8> @vector_interleave_nxv40i8_nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, <vscale x 8 x i8> %c, <vscale x 8 x i8> %d, <vscale x 8 x i8> %e) nounwind {
+; CHECK-LABEL: vector_interleave_nxv40i8_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 2
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma
+; CHECK-NEXT: vsseg5e8.v v8, (a0)
+; CHECK-NEXT: vl1r.v v10, (a3)
+; CHECK-NEXT: add a3, a3, a1
+; CHECK-NEXT: vl1r.v v11, (a3)
+; CHECK-NEXT: vl1r.v v8, (a0)
+; CHECK-NEXT: vl1r.v v9, (a2)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl1r.v v12, (a1)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 2
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv40i8_nxv8i8:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 2
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e8, m1, ta, ma
+; ZVBB-NEXT: vsseg5e8.v v8, (a0)
+; ZVBB-NEXT: vl1r.v v10, (a3)
+; ZVBB-NEXT: add a3, a3, a1
+; ZVBB-NEXT: vl1r.v v11, (a3)
+; ZVBB-NEXT: vl1r.v v8, (a0)
+; ZVBB-NEXT: vl1r.v v9, (a2)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl1r.v v12, (a1)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 2
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 40 x i8> @llvm.vector.interleave5.nxv40i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, <vscale x 8 x i8> %c, <vscale x 8 x i8> %d, <vscale x 8 x i8> %e)
+ ret <vscale x 40 x i8> %res
+}
+
+
+define <vscale x 20 x i32> @vector_interleave_nxv20i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv20i32_nxv4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 28
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT: vmv2r.v v20, v16
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv2r.v v18, v12
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv2r.v v16, v8
+; RV32-NEXT: vmv2r.v v22, v16
+; RV32-NEXT: vmv2r.v v24, v18
+; RV32-NEXT: vmv1r.v v26, v20
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v23, v10
+; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: add a5, a4, a2
+; RV32-NEXT: vmv1r.v v25, v14
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vmv1r.v v18, v11
+; RV32-NEXT: vsseg5e32.v v22, (a0)
+; RV32-NEXT: vmv1r.v v20, v15
+; RV32-NEXT: vsseg5e32.v v17, (a1)
+; RV32-NEXT: vl1re32.v v16, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v17, (a6)
+; RV32-NEXT: add a6, a3, a2
+; RV32-NEXT: vl1re32.v v10, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v11, (a6)
+; RV32-NEXT: vl1re32.v v8, (a0)
+; RV32-NEXT: vl1re32.v v9, (a3)
+; RV32-NEXT: vl1re32.v v14, (a4)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 10
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v15, (a5)
+; RV32-NEXT: vl1re32.v v12, (a6)
+; RV32-NEXT: vl1re32.v v13, (a1)
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a0, a2
+; RV32-NEXT: vs2r.v v16, (a2)
+; RV32-NEXT: vs8r.v v8, (a0)
+; RV32-NEXT: vl8re32.v v16, (a2)
+; RV32-NEXT: vl8re32.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_nxv20i32_nxv4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 28
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; RV64-NEXT: vmv2r.v v20, v16
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv2r.v v18, v12
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 2
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv2r.v v16, v8
+; RV64-NEXT: vmv2r.v v22, v16
+; RV64-NEXT: vmv2r.v v24, v18
+; RV64-NEXT: vmv1r.v v26, v20
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v23, v10
+; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: add a5, a4, a2
+; RV64-NEXT: vmv1r.v v25, v14
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vmv1r.v v18, v11
+; RV64-NEXT: vsseg5e32.v v22, (a0)
+; RV64-NEXT: vmv1r.v v20, v15
+; RV64-NEXT: vsseg5e32.v v17, (a1)
+; RV64-NEXT: vl1re32.v v16, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v17, (a6)
+; RV64-NEXT: add a6, a3, a2
+; RV64-NEXT: vl1re32.v v10, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v11, (a6)
+; RV64-NEXT: vl1re32.v v8, (a0)
+; RV64-NEXT: vl1re32.v v9, (a3)
+; RV64-NEXT: vl1re32.v v14, (a4)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 10
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v15, (a5)
+; RV64-NEXT: vl1re32.v v12, (a6)
+; RV64-NEXT: vl1re32.v v13, (a1)
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: vs2r.v v16, (a2)
+; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vl8re32.v v16, (a2)
+; RV64-NEXT: vl8re32.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv20i32_nxv4i32:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a1, 28
+; ZVBB-RV32-NEXT: mul a0, a0, a1
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v20, v16
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv2r.v v18, v12
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: slli a2, a1, 2
+; ZVBB-RV32-NEXT: add a1, a2, a1
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv2r.v v16, v8
+; ZVBB-RV32-NEXT: vmv2r.v v22, v16
+; ZVBB-RV32-NEXT: vmv2r.v v24, v18
+; ZVBB-RV32-NEXT: vmv1r.v v26, v20
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v23, v10
+; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: add a5, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v25, v14
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vmv1r.v v18, v11
+; ZVBB-RV32-NEXT: vsseg5e32.v v22, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v20, v15
+; ZVBB-RV32-NEXT: vsseg5e32.v v17, (a1)
+; ZVBB-RV32-NEXT: vl1re32.v v16, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v17, (a6)
+; ZVBB-RV32-NEXT: add a6, a3, a2
+; ZVBB-RV32-NEXT: vl1re32.v v10, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v11, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re32.v v9, (a3)
+; ZVBB-RV32-NEXT: vl1re32.v v14, (a4)
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a3, 10
+; ZVBB-RV32-NEXT: mul a0, a0, a3
+; ZVBB-RV32-NEXT: add a0, sp, a0
+; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v15, (a5)
+; ZVBB-RV32-NEXT: vl1re32.v v12, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v13, (a1)
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a0, a2
+; ZVBB-RV32-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8re32.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re32.v v8, (a0)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
+;
+; ZVBB-RV64-LABEL: vector_interleave_nxv20i32_nxv4i32:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a1, 28
+; ZVBB-RV64-NEXT: mul a0, a0, a1
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v20, v16
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv2r.v v18, v12
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: slli a2, a1, 2
+; ZVBB-RV64-NEXT: add a1, a2, a1
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv2r.v v16, v8
+; ZVBB-RV64-NEXT: vmv2r.v v22, v16
+; ZVBB-RV64-NEXT: vmv2r.v v24, v18
+; ZVBB-RV64-NEXT: vmv1r.v v26, v20
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v23, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: add a5, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v25, v14
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vmv1r.v v18, v11
+; ZVBB-RV64-NEXT: vsseg5e32.v v22, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v20, v15
+; ZVBB-RV64-NEXT: vsseg5e32.v v17, (a1)
+; ZVBB-RV64-NEXT: vl1re32.v v16, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v17, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1re32.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v11, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re32.v v9, (a3)
+; ZVBB-RV64-NEXT: vl1re32.v v14, (a4)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 10
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v15, (a5)
+; ZVBB-RV64-NEXT: vl1re32.v v12, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v13, (a1)
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8re32.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re32.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
+ %res = call <vscale x 20 x i32> @llvm.vector.interleave5.nxv20i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e)
+ ret <vscale x 20 x i32> %res
+}
+
+
+define <vscale x 10 x i64> @vector_interleave_nxv10i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv10i64_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 28
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT: vmv2r.v v20, v16
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv2r.v v18, v12
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv2r.v v16, v8
+; RV32-NEXT: vmv2r.v v22, v16
+; RV32-NEXT: vmv2r.v v24, v18
+; RV32-NEXT: vmv1r.v v26, v20
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v23, v10
+; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: add a5, a4, a2
+; RV32-NEXT: vmv1r.v v25, v14
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vmv1r.v v18, v11
+; RV32-NEXT: vsseg5e64.v v22, (a0)
+; RV32-NEXT: vmv1r.v v20, v15
+; RV32-NEXT: vsseg5e64.v v17, (a1)
+; RV32-NEXT: vl1re64.v v16, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v17, (a6)
+; RV32-NEXT: add a6, a3, a2
+; RV32-NEXT: vl1re64.v v10, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v11, (a6)
+; RV32-NEXT: vl1re64.v v8, (a0)
+; RV32-NEXT: vl1re64.v v9, (a3)
+; RV32-NEXT: vl1re64.v v14, (a4)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 10
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v15, (a5)
+; RV32-NEXT: vl1re64.v v12, (a6)
+; RV32-NEXT: vl1re64.v v13, (a1)
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a0, a2
+; RV32-NEXT: vs2r.v v16, (a2)
+; RV32-NEXT: vs8r.v v8, (a0)
+; RV32-NEXT: vl8re64.v v16, (a2)
+; RV32-NEXT: vl8re64.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_nxv10i64_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 28
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv2r.v v20, v16
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv2r.v v18, v12
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 2
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv2r.v v16, v8
+; RV64-NEXT: vmv2r.v v22, v16
+; RV64-NEXT: vmv2r.v v24, v18
+; RV64-NEXT: vmv1r.v v26, v20
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v23, v10
+; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: add a5, a4, a2
+; RV64-NEXT: vmv1r.v v25, v14
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vmv1r.v v18, v11
+; RV64-NEXT: vsseg5e64.v v22, (a0)
+; RV64-NEXT: vmv1r.v v20, v15
+; RV64-NEXT: vsseg5e64.v v17, (a1)
+; RV64-NEXT: vl1re64.v v16, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v17, (a6)
+; RV64-NEXT: add a6, a3, a2
+; RV64-NEXT: vl1re64.v v10, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v11, (a6)
+; RV64-NEXT: vl1re64.v v8, (a0)
+; RV64-NEXT: vl1re64.v v9, (a3)
+; RV64-NEXT: vl1re64.v v14, (a4)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 10
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v15, (a5)
+; RV64-NEXT: vl1re64.v v12, (a6)
+; RV64-NEXT: vl1re64.v v13, (a1)
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: vs2r.v v16, (a2)
+; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vl8re64.v v16, (a2)
+; RV64-NEXT: vl8re64.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv10i64_nxv2i64:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a1, 28
+; ZVBB-RV32-NEXT: mul a0, a0, a1
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v20, v16
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv2r.v v18, v12
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: slli a2, a1, 2
+; ZVBB-RV32-NEXT: add a1, a2, a1
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv2r.v v16, v8
+; ZVBB-RV32-NEXT: vmv2r.v v22, v16
+; ZVBB-RV32-NEXT: vmv2r.v v24, v18
+; ZVBB-RV32-NEXT: vmv1r.v v26, v20
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v23, v10
+; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: add a5, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v25, v14
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vmv1r.v v18, v11
+; ZVBB-RV32-NEXT: vsseg5e64.v v22, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v20, v15
+; ZVBB-RV32-NEXT: vsseg5e64.v v17, (a1)
+; ZVBB-RV32-NEXT: vl1re64.v v16, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v17, (a6)
+; ZVBB-RV32-NEXT: add a6, a3, a2
+; ZVBB-RV32-NEXT: vl1re64.v v10, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v11, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re64.v v9, (a3)
+; ZVBB-RV32-NEXT: vl1re64.v v14, (a4)
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a3, 10
+; ZVBB-RV32-NEXT: mul a0, a0, a3
+; ZVBB-RV32-NEXT: add a0, sp, a0
+; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v15, (a5)
+; ZVBB-RV32-NEXT: vl1re64.v v12, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v13, (a1)
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a0, a2
+; ZVBB-RV32-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re64.v v8, (a0)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
+;
+; ZVBB-RV64-LABEL: vector_interleave_nxv10i64_nxv2i64:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a1, 28
+; ZVBB-RV64-NEXT: mul a0, a0, a1
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v20, v16
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv2r.v v18, v12
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: slli a2, a1, 2
+; ZVBB-RV64-NEXT: add a1, a2, a1
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv2r.v v16, v8
+; ZVBB-RV64-NEXT: vmv2r.v v22, v16
+; ZVBB-RV64-NEXT: vmv2r.v v24, v18
+; ZVBB-RV64-NEXT: vmv1r.v v26, v20
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v23, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: add a5, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v25, v14
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vmv1r.v v18, v11
+; ZVBB-RV64-NEXT: vsseg5e64.v v22, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v20, v15
+; ZVBB-RV64-NEXT: vsseg5e64.v v17, (a1)
+; ZVBB-RV64-NEXT: vl1re64.v v16, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v17, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1re64.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v11, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re64.v v9, (a3)
+; ZVBB-RV64-NEXT: vl1re64.v v14, (a4)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 10
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v15, (a5)
+; ZVBB-RV64-NEXT: vl1re64.v v12, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v13, (a1)
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re64.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
+ %res = call <vscale x 10 x i64> @llvm.vector.interleave5.nxv10i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e)
+ ret <vscale x 10 x i64> %res
+}
+
+define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f, <vscale x 16 x i1> %g) nounwind {
+; CHECK-LABEL: vector_interleave_nxv112i1_nxv16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 14
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v14, 0
+; CHECK-NEXT: addi a4, sp, 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 3
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: vmerge.vim v16, v14, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v22, v14, 1, v0
+; CHECK-NEXT: add a3, a4, a2
+; CHECK-NEXT: srli a1, a2, 2
+; CHECK-NEXT: add a5, a0, a2
+; CHECK-NEXT: vmv4r.v v24, v16
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v18, v14, 1, v0
+; CHECK-NEXT: add a6, a3, a2
+; CHECK-NEXT: vmv1r.v v25, v22
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v8, v14, 1, v0
+; CHECK-NEXT: vmv1r.v v26, v18
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vmerge.vim v20, v14, 1, v0
+; CHECK-NEXT: vmv1r.v v27, v8
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vim v10, v14, 1, v0
+; CHECK-NEXT: vmv1r.v v28, v20
+; CHECK-NEXT: vmv1r.v v18, v23
+; CHECK-NEXT: add a7, a6, a2
+; CHECK-NEXT: vmv1r.v v29, v10
+; CHECK-NEXT: vmv1r.v v20, v9
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: vmerge.vim v30, v14, 1, v0
+; CHECK-NEXT: vmv1r.v v22, v11
+; CHECK-NEXT: vsetvli t0, zero, e8, m1, ta, ma
+; CHECK-NEXT: vsseg7e8.v v24, (a4)
+; CHECK-NEXT: vmv1r.v v23, v31
+; CHECK-NEXT: vsseg7e8.v v17, (a0)
+; CHECK-NEXT: vl1r.v v8, (a6)
+; CHECK-NEXT: add a6, a7, a2
+; CHECK-NEXT: vl1r.v v10, (a4)
+; CHECK-NEXT: add a4, a6, a2
+; CHECK-NEXT: vl1r.v v12, (a6)
+; CHECK-NEXT: add a6, a4, a2
+; CHECK-NEXT: vl1r.v v14, (a6)
+; CHECK-NEXT: add a6, a5, a2
+; CHECK-NEXT: vl1r.v v16, (a5)
+; CHECK-NEXT: add a5, a6, a2
+; CHECK-NEXT: vl1r.v v18, (a5)
+; CHECK-NEXT: add a5, a5, a2
+; CHECK-NEXT: vl1r.v v9, (a7)
+; CHECK-NEXT: add a7, a5, a2
+; CHECK-NEXT: vl1r.v v20, (a7)
+; CHECK-NEXT: add a7, a7, a2
+; CHECK-NEXT: srli a2, a2, 1
+; CHECK-NEXT: vl1r.v v11, (a3)
+; CHECK-NEXT: add a3, a1, a1
+; CHECK-NEXT: vl1r.v v13, (a4)
+; CHECK-NEXT: add a4, a2, a2
+; CHECK-NEXT: vl1r.v v15, (a0)
+; CHECK-NEXT: vl1r.v v19, (a5)
+; CHECK-NEXT: vl1r.v v17, (a6)
+; CHECK-NEXT: vl1r.v v21, (a7)
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmsne.vi v22, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v10, 0
+; CHECK-NEXT: vmsne.vi v9, v12, 0
+; CHECK-NEXT: vmsne.vi v10, v14, 0
+; CHECK-NEXT: vmsne.vi v11, v18, 0
+; CHECK-NEXT: vmsne.vi v8, v16, 0
+; CHECK-NEXT: vmsne.vi v12, v20, 0
+; CHECK-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v0, v22, a1
+; CHECK-NEXT: vslideup.vx v9, v10, a1
+; CHECK-NEXT: vslideup.vx v8, v11, a1
+; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v0, v9, a2
+; CHECK-NEXT: vslideup.vx v8, v12, a2
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 14
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv112i1_nxv16i1:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 14
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT: vmv.v.i v14, 0
+; ZVBB-NEXT: addi a4, sp, 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 3
+; ZVBB-NEXT: sub a0, a1, a0
+; ZVBB-NEXT: add a0, sp, a0
+; ZVBB-NEXT: addi a0, a0, 16
+; ZVBB-NEXT: csrr a2, vlenb
+; ZVBB-NEXT: vmerge.vim v16, v14, 1, v0
+; ZVBB-NEXT: vmv1r.v v0, v8
+; ZVBB-NEXT: vmerge.vim v22, v14, 1, v0
+; ZVBB-NEXT: add a3, a4, a2
+; ZVBB-NEXT: srli a1, a2, 2
+; ZVBB-NEXT: add a5, a0, a2
+; ZVBB-NEXT: vmv4r.v v24, v16
+; ZVBB-NEXT: vmv1r.v v0, v9
+; ZVBB-NEXT: vmerge.vim v18, v14, 1, v0
+; ZVBB-NEXT: add a6, a3, a2
+; ZVBB-NEXT: vmv1r.v v25, v22
+; ZVBB-NEXT: vmv1r.v v0, v10
+; ZVBB-NEXT: vmerge.vim v8, v14, 1, v0
+; ZVBB-NEXT: vmv1r.v v26, v18
+; ZVBB-NEXT: vmv1r.v v0, v11
+; ZVBB-NEXT: vmerge.vim v20, v14, 1, v0
+; ZVBB-NEXT: vmv1r.v v27, v8
+; ZVBB-NEXT: vmv1r.v v0, v12
+; ZVBB-NEXT: vmerge.vim v10, v14, 1, v0
+; ZVBB-NEXT: vmv1r.v v28, v20
+; ZVBB-NEXT: vmv1r.v v18, v23
+; ZVBB-NEXT: add a7, a6, a2
+; ZVBB-NEXT: vmv1r.v v29, v10
+; ZVBB-NEXT: vmv1r.v v20, v9
+; ZVBB-NEXT: vmv1r.v v0, v13
+; ZVBB-NEXT: vmerge.vim v30, v14, 1, v0
+; ZVBB-NEXT: vmv1r.v v22, v11
+; ZVBB-NEXT: vsetvli t0, zero, e8, m1, ta, ma
+; ZVBB-NEXT: vsseg7e8.v v24, (a4)
+; ZVBB-NEXT: vmv1r.v v23, v31
+; ZVBB-NEXT: vsseg7e8.v v17, (a0)
+; ZVBB-NEXT: vl1r.v v8, (a6)
+; ZVBB-NEXT: add a6, a7, a2
+; ZVBB-NEXT: vl1r.v v10, (a4)
+; ZVBB-NEXT: add a4, a6, a2
+; ZVBB-NEXT: vl1r.v v12, (a6)
+; ZVBB-NEXT: add a6, a4, a2
+; ZVBB-NEXT: vl1r.v v14, (a6)
+; ZVBB-NEXT: add a6, a5, a2
+; ZVBB-NEXT: vl1r.v v16, (a5)
+; ZVBB-NEXT: add a5, a6, a2
+; ZVBB-NEXT: vl1r.v v18, (a5)
+; ZVBB-NEXT: add a5, a5, a2
+; ZVBB-NEXT: vl1r.v v9, (a7)
+; ZVBB-NEXT: add a7, a5, a2
+; ZVBB-NEXT: vl1r.v v20, (a7)
+; ZVBB-NEXT: add a7, a7, a2
+; ZVBB-NEXT: srli a2, a2, 1
+; ZVBB-NEXT: vl1r.v v11, (a3)
+; ZVBB-NEXT: add a3, a1, a1
+; ZVBB-NEXT: vl1r.v v13, (a4)
+; ZVBB-NEXT: add a4, a2, a2
+; ZVBB-NEXT: vl1r.v v15, (a0)
+; ZVBB-NEXT: vl1r.v v19, (a5)
+; ZVBB-NEXT: vl1r.v v17, (a6)
+; ZVBB-NEXT: vl1r.v v21, (a7)
+; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT: vmsne.vi v22, v8, 0
+; ZVBB-NEXT: vmsne.vi v0, v10, 0
+; ZVBB-NEXT: vmsne.vi v9, v12, 0
+; ZVBB-NEXT: vmsne.vi v10, v14, 0
+; ZVBB-NEXT: vmsne.vi v11, v18, 0
+; ZVBB-NEXT: vmsne.vi v8, v16, 0
+; ZVBB-NEXT: vmsne.vi v12, v20, 0
+; ZVBB-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
+; ZVBB-NEXT: vslideup.vx v0, v22, a1
+; ZVBB-NEXT: vslideup.vx v9, v10, a1
+; ZVBB-NEXT: vslideup.vx v8, v11, a1
+; ZVBB-NEXT: vsetvli zero, a4, e8, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v0, v9, a2
+; ZVBB-NEXT: vslideup.vx v8, v12, a2
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 14
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 112 x i1> @llvm.vector.interleave7.nxv112i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f, <vscale x 16 x i1> %g)
+ ret <vscale x 112 x i1> %res
+}
+
+
+define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f, <vscale x 16 x i8> %g) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv112i8_nxv16i8:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; RV32-NEXT: vmv2r.v v26, v20
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv2r.v v24, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: sub a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv2r.v v20, v8
+; RV32-NEXT: vmv1r.v v1, v20
+; RV32-NEXT: vmv1r.v v3, v22
+; RV32-NEXT: vmv1r.v v5, v24
+; RV32-NEXT: vmv1r.v v7, v26
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v2, v10
+; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: slli a5, a2, 2
+; RV32-NEXT: vmv1r.v v4, v14
+; RV32-NEXT: slli a6, a2, 4
+; RV32-NEXT: add a7, a4, a2
+; RV32-NEXT: vmv1r.v v6, v18
+; RV32-NEXT: sub a5, a6, a5
+; RV32-NEXT: vmv1r.v v22, v11
+; RV32-NEXT: add a6, a7, a2
+; RV32-NEXT: vmv1r.v v24, v15
+; RV32-NEXT: vsseg7e8.v v1, (a0)
+; RV32-NEXT: vmv1r.v v26, v19
+; RV32-NEXT: vsseg7e8.v v21, (a1)
+; RV32-NEXT: vl1r.v v10, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1r.v v11, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1r.v v12, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1r.v v13, (a6)
+; RV32-NEXT: add a6, a3, a2
+; RV32-NEXT: vl1r.v v18, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1r.v v19, (a6)
+; RV32-NEXT: vl1r.v v16, (a0)
+; RV32-NEXT: vl1r.v v8, (a4)
+; RV32-NEXT: vl1r.v v17, (a3)
+; RV32-NEXT: vl1r.v v9, (a7)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 14
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1r.v v20, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1r.v v21, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a0, a2
+; RV32-NEXT: vl1r.v v22, (a6)
+; RV32-NEXT: vl1r.v v23, (a1)
+; RV32-NEXT: add a5, a0, a5
+; RV32-NEXT: vs2r.v v12, (a5)
+; RV32-NEXT: vs4r.v v8, (a2)
+; RV32-NEXT: vs8r.v v16, (a0)
+; RV32-NEXT: vl8r.v v16, (a2)
+; RV32-NEXT: vl8r.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_nxv112i8_nxv16i8:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; RV64-NEXT: vmv2r.v v26, v20
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv2r.v v24, v16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv2r.v v20, v8
+; RV64-NEXT: vmv1r.v v1, v20
+; RV64-NEXT: vmv1r.v v3, v22
+; RV64-NEXT: vmv1r.v v5, v24
+; RV64-NEXT: vmv1r.v v7, v26
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v2, v10
+; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: slli a5, a2, 2
+; RV64-NEXT: vmv1r.v v4, v14
+; RV64-NEXT: slli a6, a2, 4
+; RV64-NEXT: add a7, a4, a2
+; RV64-NEXT: vmv1r.v v6, v18
+; RV64-NEXT: sub a5, a6, a5
+; RV64-NEXT: vmv1r.v v22, v11
+; RV64-NEXT: add a6, a7, a2
+; RV64-NEXT: vmv1r.v v24, v15
+; RV64-NEXT: vsseg7e8.v v1, (a0)
+; RV64-NEXT: vmv1r.v v26, v19
+; RV64-NEXT: vsseg7e8.v v21, (a1)
+; RV64-NEXT: vl1r.v v10, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1r.v v11, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1r.v v12, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1r.v v13, (a6)
+; RV64-NEXT: add a6, a3, a2
+; RV64-NEXT: vl1r.v v18, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1r.v v19, (a6)
+; RV64-NEXT: vl1r.v v16, (a0)
+; RV64-NEXT: vl1r.v v8, (a4)
+; RV64-NEXT: vl1r.v v17, (a3)
+; RV64-NEXT: vl1r.v v9, (a7)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 14
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1r.v v20, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1r.v v21, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: vl1r.v v22, (a6)
+; RV64-NEXT: vl1r.v v23, (a1)
+; RV64-NEXT: add a5, a0, a5
+; RV64-NEXT: vs2r.v v12, (a5)
+; RV64-NEXT: vs4r.v v8, (a2)
+; RV64-NEXT: vs8r.v v16, (a0)
+; RV64-NEXT: vl8r.v v16, (a2)
+; RV64-NEXT: vl8r.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv112i8_nxv16i8:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: slli a0, a0, 5
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v26, v20
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv2r.v v24, v16
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: slli a2, a1, 3
+; ZVBB-RV32-NEXT: sub a1, a2, a1
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv2r.v v20, v8
+; ZVBB-RV32-NEXT: vmv1r.v v1, v20
+; ZVBB-RV32-NEXT: vmv1r.v v3, v22
+; ZVBB-RV32-NEXT: vmv1r.v v5, v24
+; ZVBB-RV32-NEXT: vmv1r.v v7, v26
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v2, v10
+; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: slli a5, a2, 2
+; ZVBB-RV32-NEXT: vmv1r.v v4, v14
+; ZVBB-RV32-NEXT: slli a6, a2, 4
+; ZVBB-RV32-NEXT: add a7, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v6, v18
+; ZVBB-RV32-NEXT: sub a5, a6, a5
+; ZVBB-RV32-NEXT: vmv1r.v v22, v11
+; ZVBB-RV32-NEXT: add a6, a7, a2
+; ZVBB-RV32-NEXT: vmv1r.v v24, v15
+; ZVBB-RV32-NEXT: vsseg7e8.v v1, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v26, v19
+; ZVBB-RV32-NEXT: vsseg7e8.v v21, (a1)
+; ZVBB-RV32-NEXT: vl1r.v v10, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1r.v v11, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1r.v v12, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1r.v v13, (a6)
+; ZVBB-RV32-NEXT: add a6, a3, a2
+; ZVBB-RV32-NEXT: vl1r.v v18, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1r.v v19, (a6)
+; ZVBB-RV32-NEXT: vl1r.v v16, (a0)
+; ZVBB-RV32-NEXT: vl1r.v v8, (a4)
+; ZVBB-RV32-NEXT: vl1r.v v17, (a3)
+; ZVBB-RV32-NEXT: vl1r.v v9, (a7)
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a3, 14
+; ZVBB-RV32-NEXT: mul a0, a0, a3
+; ZVBB-RV32-NEXT: add a0, sp, a0
+; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1r.v v20, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1r.v v21, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a0, a2
+; ZVBB-RV32-NEXT: vl1r.v v22, (a6)
+; ZVBB-RV32-NEXT: vl1r.v v23, (a1)
+; ZVBB-RV32-NEXT: add a5, a0, a5
+; ZVBB-RV32-NEXT: vs2r.v v12, (a5)
+; ZVBB-RV32-NEXT: vs4r.v v8, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v16, (a0)
+; ZVBB-RV32-NEXT: vl8r.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8r.v v8, (a0)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
+;
+; ZVBB-RV64-LABEL: vector_interleave_nxv112i8_nxv16i8:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: slli a0, a0, 5
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v26, v20
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv2r.v v24, v16
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: slli a2, a1, 3
+; ZVBB-RV64-NEXT: sub a1, a2, a1
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv2r.v v20, v8
+; ZVBB-RV64-NEXT: vmv1r.v v1, v20
+; ZVBB-RV64-NEXT: vmv1r.v v3, v22
+; ZVBB-RV64-NEXT: vmv1r.v v5, v24
+; ZVBB-RV64-NEXT: vmv1r.v v7, v26
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v2, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: slli a5, a2, 2
+; ZVBB-RV64-NEXT: vmv1r.v v4, v14
+; ZVBB-RV64-NEXT: slli a6, a2, 4
+; ZVBB-RV64-NEXT: add a7, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v6, v18
+; ZVBB-RV64-NEXT: sub a5, a6, a5
+; ZVBB-RV64-NEXT: vmv1r.v v22, v11
+; ZVBB-RV64-NEXT: add a6, a7, a2
+; ZVBB-RV64-NEXT: vmv1r.v v24, v15
+; ZVBB-RV64-NEXT: vsseg7e8.v v1, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v26, v19
+; ZVBB-RV64-NEXT: vsseg7e8.v v21, (a1)
+; ZVBB-RV64-NEXT: vl1r.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1r.v v11, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1r.v v12, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1r.v v13, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1r.v v18, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1r.v v19, (a6)
+; ZVBB-RV64-NEXT: vl1r.v v16, (a0)
+; ZVBB-RV64-NEXT: vl1r.v v8, (a4)
+; ZVBB-RV64-NEXT: vl1r.v v17, (a3)
+; ZVBB-RV64-NEXT: vl1r.v v9, (a7)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 14
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1r.v v20, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1r.v v21, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vl1r.v v22, (a6)
+; ZVBB-RV64-NEXT: vl1r.v v23, (a1)
+; ZVBB-RV64-NEXT: add a5, a0, a5
+; ZVBB-RV64-NEXT: vs2r.v v12, (a5)
+; ZVBB-RV64-NEXT: vs4r.v v8, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v16, (a0)
+; ZVBB-RV64-NEXT: vl8r.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8r.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
+ %res = call <vscale x 112 x i8> @llvm.vector.interleave7.nxv112i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f, <vscale x 16 x i8> %g)
+ ret <vscale x 112 x i8> %res
+}
+
+
+define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i16> %d, <vscale x 8 x i16> %e, <vscale x 8 x i16> %f, <vscale x 8 x i16> %g) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv2r.v v26, v20
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv2r.v v24, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: sub a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv2r.v v20, v8
+; RV32-NEXT: vmv1r.v v1, v20
+; RV32-NEXT: vmv1r.v v3, v22
+; RV32-NEXT: vmv1r.v v5, v24
+; RV32-NEXT: vmv1r.v v7, v26
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v2, v10
+; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: slli a5, a2, 2
+; RV32-NEXT: vmv1r.v v4, v14
+; RV32-NEXT: slli a6, a2, 4
+; RV32-NEXT: add a7, a4, a2
+; RV32-NEXT: vmv1r.v v6, v18
+; RV32-NEXT: sub a5, a6, a5
+; RV32-NEXT: vmv1r.v v22, v11
+; RV32-NEXT: add a6, a7, a2
+; RV32-NEXT: vmv1r.v v24, v15
+; RV32-NEXT: vsseg7e16.v v1, (a0)
+; RV32-NEXT: vmv1r.v v26, v19
+; RV32-NEXT: vsseg7e16.v v21, (a1)
+; RV32-NEXT: vl1re16.v v10, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v11, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v12, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v13, (a6)
+; RV32-NEXT: add a6, a3, a2
+; RV32-NEXT: vl1re16.v v18, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v19, (a6)
+; RV32-NEXT: vl1re16.v v16, (a0)
+; RV32-NEXT: vl1re16.v v8, (a4)
+; RV32-NEXT: vl1re16.v v17, (a3)
+; RV32-NEXT: vl1re16.v v9, (a7)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 14
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v20, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v21, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a0, a2
+; RV32-NEXT: vl1re16.v v22, (a6)
+; RV32-NEXT: vl1re16.v v23, (a1)
+; RV32-NEXT: add a5, a0, a5
+; RV32-NEXT: vs2r.v v12, (a5)
+; RV32-NEXT: vs4r.v v8, (a2)
+; RV32-NEXT: vs8r.v v16, (a0)
+; RV32-NEXT: vl8re16.v v16, (a2)
+; RV32-NEXT: vl8re16.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv2r.v v26, v20
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv2r.v v24, v16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv2r.v v20, v8
+; RV64-NEXT: vmv1r.v v1, v20
+; RV64-NEXT: vmv1r.v v3, v22
+; RV64-NEXT: vmv1r.v v5, v24
+; RV64-NEXT: vmv1r.v v7, v26
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v2, v10
+; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: slli a5, a2, 2
+; RV64-NEXT: vmv1r.v v4, v14
+; RV64-NEXT: slli a6, a2, 4
+; RV64-NEXT: add a7, a4, a2
+; RV64-NEXT: vmv1r.v v6, v18
+; RV64-NEXT: sub a5, a6, a5
+; RV64-NEXT: vmv1r.v v22, v11
+; RV64-NEXT: add a6, a7, a2
+; RV64-NEXT: vmv1r.v v24, v15
+; RV64-NEXT: vsseg7e16.v v1, (a0)
+; RV64-NEXT: vmv1r.v v26, v19
+; RV64-NEXT: vsseg7e16.v v21, (a1)
+; RV64-NEXT: vl1re16.v v10, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v11, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v12, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v13, (a6)
+; RV64-NEXT: add a6, a3, a2
+; RV64-NEXT: vl1re16.v v18, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v19, (a6)
+; RV64-NEXT: vl1re16.v v16, (a0)
+; RV64-NEXT: vl1re16.v v8, (a4)
+; RV64-NEXT: vl1re16.v v17, (a3)
+; RV64-NEXT: vl1re16.v v9, (a7)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 14
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v20, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v21, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: vl1re16.v v22, (a6)
+; RV64-NEXT: vl1re16.v v23, (a1)
+; RV64-NEXT: add a5, a0, a5
+; RV64-NEXT: vs2r.v v12, (a5)
+; RV64-NEXT: vs4r.v v8, (a2)
+; RV64-NEXT: vs8r.v v16, (a0)
+; RV64-NEXT: vl8re16.v v16, (a2)
+; RV64-NEXT: vl8re16.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: slli a0, a0, 5
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v26, v20
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv2r.v v24, v16
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: slli a2, a1, 3
+; ZVBB-RV32-NEXT: sub a1, a2, a1
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv2r.v v20, v8
+; ZVBB-RV32-NEXT: vmv1r.v v1, v20
+; ZVBB-RV32-NEXT: vmv1r.v v3, v22
+; ZVBB-RV32-NEXT: vmv1r.v v5, v24
+; ZVBB-RV32-NEXT: vmv1r.v v7, v26
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v2, v10
+; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: slli a5, a2, 2
+; ZVBB-RV32-NEXT: vmv1r.v v4, v14
+; ZVBB-RV32-NEXT: slli a6, a2, 4
+; ZVBB-RV32-NEXT: add a7, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v6, v18
+; ZVBB-RV32-NEXT: sub a5, a6, a5
+; ZVBB-RV32-NEXT: vmv1r.v v22, v11
+; ZVBB-RV32-NEXT: add a6, a7, a2
+; ZVBB-RV32-NEXT: vmv1r.v v24, v15
+; ZVBB-RV32-NEXT: vsseg7e16.v v1, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v26, v19
+; ZVBB-RV32-NEXT: vsseg7e16.v v21, (a1)
+; ZVBB-RV32-NEXT: vl1re16.v v10, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v11, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v12, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v13, (a6)
+; ZVBB-RV32-NEXT: add a6, a3, a2
+; ZVBB-RV32-NEXT: vl1re16.v v18, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v19, (a6)
+; ZVBB-RV32-NEXT: vl1re16.v v16, (a0)
+; ZVBB-RV32-NEXT: vl1re16.v v8, (a4)
+; ZVBB-RV32-NEXT: vl1re16.v v17, (a3)
+; ZVBB-RV32-NEXT: vl1re16.v v9, (a7)
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a3, 14
+; ZVBB-RV32-NEXT: mul a0, a0, a3
+; ZVBB-RV32-NEXT: add a0, sp, a0
+; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v20, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v21, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a0, a2
+; ZVBB-RV32-NEXT: vl1re16.v v22, (a6)
+; ZVBB-RV32-NEXT: vl1re16.v v23, (a1)
+; ZVBB-RV32-NEXT: add a5, a0, a5
+; ZVBB-RV32-NEXT: vs2r.v v12, (a5)
+; ZVBB-RV32-NEXT: vs4r.v v8, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v16, (a0)
+; ZVBB-RV32-NEXT: vl8re16.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
+;
+; ZVBB-RV64-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: slli a0, a0, 5
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v26, v20
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv2r.v v24, v16
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: slli a2, a1, 3
+; ZVBB-RV64-NEXT: sub a1, a2, a1
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv2r.v v20, v8
+; ZVBB-RV64-NEXT: vmv1r.v v1, v20
+; ZVBB-RV64-NEXT: vmv1r.v v3, v22
+; ZVBB-RV64-NEXT: vmv1r.v v5, v24
+; ZVBB-RV64-NEXT: vmv1r.v v7, v26
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v2, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: slli a5, a2, 2
+; ZVBB-RV64-NEXT: vmv1r.v v4, v14
+; ZVBB-RV64-NEXT: slli a6, a2, 4
+; ZVBB-RV64-NEXT: add a7, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v6, v18
+; ZVBB-RV64-NEXT: sub a5, a6, a5
+; ZVBB-RV64-NEXT: vmv1r.v v22, v11
+; ZVBB-RV64-NEXT: add a6, a7, a2
+; ZVBB-RV64-NEXT: vmv1r.v v24, v15
+; ZVBB-RV64-NEXT: vsseg7e16.v v1, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v26, v19
+; ZVBB-RV64-NEXT: vsseg7e16.v v21, (a1)
+; ZVBB-RV64-NEXT: vl1re16.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v11, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v12, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v13, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1re16.v v18, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v19, (a6)
+; ZVBB-RV64-NEXT: vl1re16.v v16, (a0)
+; ZVBB-RV64-NEXT: vl1re16.v v8, (a4)
+; ZVBB-RV64-NEXT: vl1re16.v v17, (a3)
+; ZVBB-RV64-NEXT: vl1re16.v v9, (a7)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 14
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v20, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v21, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vl1re16.v v22, (a6)
+; ZVBB-RV64-NEXT: vl1re16.v v23, (a1)
+; ZVBB-RV64-NEXT: add a5, a0, a5
+; ZVBB-RV64-NEXT: vs2r.v v12, (a5)
+; ZVBB-RV64-NEXT: vs4r.v v8, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v16, (a0)
+; ZVBB-RV64-NEXT: vl8re16.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
+ %res = call <vscale x 56 x i16> @llvm.vector.interleave7.nxv56i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i16> %d, <vscale x 8 x i16> %e, <vscale x 8 x i16> %f, <vscale x 8 x i16> %g)
+ ret <vscale x 56 x i16> %res
+}
+
+
+define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f, <vscale x 4 x i32> %g) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT: vmv2r.v v26, v20
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv2r.v v24, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: sub a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv2r.v v20, v8
+; RV32-NEXT: vmv1r.v v1, v20
+; RV32-NEXT: vmv1r.v v3, v22
+; RV32-NEXT: vmv1r.v v5, v24
+; RV32-NEXT: vmv1r.v v7, v26
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v2, v10
+; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: slli a5, a2, 2
+; RV32-NEXT: vmv1r.v v4, v14
+; RV32-NEXT: slli a6, a2, 4
+; RV32-NEXT: add a7, a4, a2
+; RV32-NEXT: vmv1r.v v6, v18
+; RV32-NEXT: sub a5, a6, a5
+; RV32-NEXT: vmv1r.v v22, v11
+; RV32-NEXT: add a6, a7, a2
+; RV32-NEXT: vmv1r.v v24, v15
+; RV32-NEXT: vsseg7e32.v v1, (a0)
+; RV32-NEXT: vmv1r.v v26, v19
+; RV32-NEXT: vsseg7e32.v v21, (a1)
+; RV32-NEXT: vl1re32.v v10, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v11, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v12, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v13, (a6)
+; RV32-NEXT: add a6, a3, a2
+; RV32-NEXT: vl1re32.v v18, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v19, (a6)
+; RV32-NEXT: vl1re32.v v16, (a0)
+; RV32-NEXT: vl1re32.v v8, (a4)
+; RV32-NEXT: vl1re32.v v17, (a3)
+; RV32-NEXT: vl1re32.v v9, (a7)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 14
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v20, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v21, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a0, a2
+; RV32-NEXT: vl1re32.v v22, (a6)
+; RV32-NEXT: vl1re32.v v23, (a1)
+; RV32-NEXT: add a5, a0, a5
+; RV32-NEXT: vs2r.v v12, (a5)
+; RV32-NEXT: vs4r.v v8, (a2)
+; RV32-NEXT: vs8r.v v16, (a0)
+; RV32-NEXT: vl8re32.v v16, (a2)
+; RV32-NEXT: vl8re32.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; RV64-NEXT: vmv2r.v v26, v20
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv2r.v v24, v16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv2r.v v20, v8
+; RV64-NEXT: vmv1r.v v1, v20
+; RV64-NEXT: vmv1r.v v3, v22
+; RV64-NEXT: vmv1r.v v5, v24
+; RV64-NEXT: vmv1r.v v7, v26
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v2, v10
+; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: slli a5, a2, 2
+; RV64-NEXT: vmv1r.v v4, v14
+; RV64-NEXT: slli a6, a2, 4
+; RV64-NEXT: add a7, a4, a2
+; RV64-NEXT: vmv1r.v v6, v18
+; RV64-NEXT: sub a5, a6, a5
+; RV64-NEXT: vmv1r.v v22, v11
+; RV64-NEXT: add a6, a7, a2
+; RV64-NEXT: vmv1r.v v24, v15
+; RV64-NEXT: vsseg7e32.v v1, (a0)
+; RV64-NEXT: vmv1r.v v26, v19
+; RV64-NEXT: vsseg7e32.v v21, (a1)
+; RV64-NEXT: vl1re32.v v10, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v11, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v12, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v13, (a6)
+; RV64-NEXT: add a6, a3, a2
+; RV64-NEXT: vl1re32.v v18, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v19, (a6)
+; RV64-NEXT: vl1re32.v v16, (a0)
+; RV64-NEXT: vl1re32.v v8, (a4)
+; RV64-NEXT: vl1re32.v v17, (a3)
+; RV64-NEXT: vl1re32.v v9, (a7)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 14
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v20, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v21, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: vl1re32.v v22, (a6)
+; RV64-NEXT: vl1re32.v v23, (a1)
+; RV64-NEXT: add a5, a0, a5
+; RV64-NEXT: vs2r.v v12, (a5)
+; RV64-NEXT: vs4r.v v8, (a2)
+; RV64-NEXT: vs8r.v v16, (a0)
+; RV64-NEXT: vl8re32.v v16, (a2)
+; RV64-NEXT: vl8re32.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: slli a0, a0, 5
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v26, v20
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv2r.v v24, v16
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: slli a2, a1, 3
+; ZVBB-RV32-NEXT: sub a1, a2, a1
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv2r.v v20, v8
+; ZVBB-RV32-NEXT: vmv1r.v v1, v20
+; ZVBB-RV32-NEXT: vmv1r.v v3, v22
+; ZVBB-RV32-NEXT: vmv1r.v v5, v24
+; ZVBB-RV32-NEXT: vmv1r.v v7, v26
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v2, v10
+; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: slli a5, a2, 2
+; ZVBB-RV32-NEXT: vmv1r.v v4, v14
+; ZVBB-RV32-NEXT: slli a6, a2, 4
+; ZVBB-RV32-NEXT: add a7, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v6, v18
+; ZVBB-RV32-NEXT: sub a5, a6, a5
+; ZVBB-RV32-NEXT: vmv1r.v v22, v11
+; ZVBB-RV32-NEXT: add a6, a7, a2
+; ZVBB-RV32-NEXT: vmv1r.v v24, v15
+; ZVBB-RV32-NEXT: vsseg7e32.v v1, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v26, v19
+; ZVBB-RV32-NEXT: vsseg7e32.v v21, (a1)
+; ZVBB-RV32-NEXT: vl1re32.v v10, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v11, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v12, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v13, (a6)
+; ZVBB-RV32-NEXT: add a6, a3, a2
+; ZVBB-RV32-NEXT: vl1re32.v v18, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v19, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v16, (a0)
+; ZVBB-RV32-NEXT: vl1re32.v v8, (a4)
+; ZVBB-RV32-NEXT: vl1re32.v v17, (a3)
+; ZVBB-RV32-NEXT: vl1re32.v v9, (a7)
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a3, 14
+; ZVBB-RV32-NEXT: mul a0, a0, a3
+; ZVBB-RV32-NEXT: add a0, sp, a0
+; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v20, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v21, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a0, a2
+; ZVBB-RV32-NEXT: vl1re32.v v22, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v23, (a1)
+; ZVBB-RV32-NEXT: add a5, a0, a5
+; ZVBB-RV32-NEXT: vs2r.v v12, (a5)
+; ZVBB-RV32-NEXT: vs4r.v v8, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v16, (a0)
+; ZVBB-RV32-NEXT: vl8re32.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re32.v v8, (a0)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
+;
+; ZVBB-RV64-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: slli a0, a0, 5
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v26, v20
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv2r.v v24, v16
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: slli a2, a1, 3
+; ZVBB-RV64-NEXT: sub a1, a2, a1
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv2r.v v20, v8
+; ZVBB-RV64-NEXT: vmv1r.v v1, v20
+; ZVBB-RV64-NEXT: vmv1r.v v3, v22
+; ZVBB-RV64-NEXT: vmv1r.v v5, v24
+; ZVBB-RV64-NEXT: vmv1r.v v7, v26
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v2, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: slli a5, a2, 2
+; ZVBB-RV64-NEXT: vmv1r.v v4, v14
+; ZVBB-RV64-NEXT: slli a6, a2, 4
+; ZVBB-RV64-NEXT: add a7, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v6, v18
+; ZVBB-RV64-NEXT: sub a5, a6, a5
+; ZVBB-RV64-NEXT: vmv1r.v v22, v11
+; ZVBB-RV64-NEXT: add a6, a7, a2
+; ZVBB-RV64-NEXT: vmv1r.v v24, v15
+; ZVBB-RV64-NEXT: vsseg7e32.v v1, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v26, v19
+; ZVBB-RV64-NEXT: vsseg7e32.v v21, (a1)
+; ZVBB-RV64-NEXT: vl1re32.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v11, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v12, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v13, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1re32.v v18, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v19, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v16, (a0)
+; ZVBB-RV64-NEXT: vl1re32.v v8, (a4)
+; ZVBB-RV64-NEXT: vl1re32.v v17, (a3)
+; ZVBB-RV64-NEXT: vl1re32.v v9, (a7)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 14
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v20, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v21, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vl1re32.v v22, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v23, (a1)
+; ZVBB-RV64-NEXT: add a5, a0, a5
+; ZVBB-RV64-NEXT: vs2r.v v12, (a5)
+; ZVBB-RV64-NEXT: vs4r.v v8, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v16, (a0)
+; ZVBB-RV64-NEXT: vl8re32.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re32.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
+ %res = call <vscale x 28 x i32> @llvm.vector.interleave7.nxv28i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f, <vscale x 4 x i32> %g)
+ ret <vscale x 28 x i32> %res
+}
+
+define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g) nounwind {
+; Test: factor-7 interleave of seven <vscale x 2 x i64> operands into one
+; <vscale x 14 x i64> result via @llvm.vector.interleave7.
+; NOTE(review): the RV32/RV64/ZVBB-RV32/ZVBB-RV64 CHECK bodies below appear
+; autogenerated (update_llvm_test_checks.py style) -- regenerate rather than
+; hand-edit. The lowering spills to an aligned stack region and uses two
+; vsseg7e64 segment stores followed by whole-register reloads.
+;
+; RV32-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT: vmv2r.v v26, v20
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv2r.v v24, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: sub a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv2r.v v20, v8
+; RV32-NEXT: vmv1r.v v1, v20
+; RV32-NEXT: vmv1r.v v3, v22
+; RV32-NEXT: vmv1r.v v5, v24
+; RV32-NEXT: vmv1r.v v7, v26
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v2, v10
+; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: slli a5, a2, 2
+; RV32-NEXT: vmv1r.v v4, v14
+; RV32-NEXT: slli a6, a2, 4
+; RV32-NEXT: add a7, a4, a2
+; RV32-NEXT: vmv1r.v v6, v18
+; RV32-NEXT: sub a5, a6, a5
+; RV32-NEXT: vmv1r.v v22, v11
+; RV32-NEXT: add a6, a7, a2
+; RV32-NEXT: vmv1r.v v24, v15
+; RV32-NEXT: vsseg7e64.v v1, (a0)
+; RV32-NEXT: vmv1r.v v26, v19
+; RV32-NEXT: vsseg7e64.v v21, (a1)
+; RV32-NEXT: vl1re64.v v10, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v11, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v12, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v13, (a6)
+; RV32-NEXT: add a6, a3, a2
+; RV32-NEXT: vl1re64.v v18, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v19, (a6)
+; RV32-NEXT: vl1re64.v v16, (a0)
+; RV32-NEXT: vl1re64.v v8, (a4)
+; RV32-NEXT: vl1re64.v v17, (a3)
+; RV32-NEXT: vl1re64.v v9, (a7)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 14
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v20, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v21, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a0, a2
+; RV32-NEXT: vl1re64.v v22, (a6)
+; RV32-NEXT: vl1re64.v v23, (a1)
+; RV32-NEXT: add a5, a0, a5
+; RV32-NEXT: vs2r.v v12, (a5)
+; RV32-NEXT: vs4r.v v8, (a2)
+; RV32-NEXT: vs8r.v v16, (a0)
+; RV32-NEXT: vl8re64.v v16, (a2)
+; RV32-NEXT: vl8re64.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv2r.v v26, v20
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv2r.v v24, v16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv2r.v v20, v8
+; RV64-NEXT: vmv1r.v v1, v20
+; RV64-NEXT: vmv1r.v v3, v22
+; RV64-NEXT: vmv1r.v v5, v24
+; RV64-NEXT: vmv1r.v v7, v26
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v2, v10
+; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: slli a5, a2, 2
+; RV64-NEXT: vmv1r.v v4, v14
+; RV64-NEXT: slli a6, a2, 4
+; RV64-NEXT: add a7, a4, a2
+; RV64-NEXT: vmv1r.v v6, v18
+; RV64-NEXT: sub a5, a6, a5
+; RV64-NEXT: vmv1r.v v22, v11
+; RV64-NEXT: add a6, a7, a2
+; RV64-NEXT: vmv1r.v v24, v15
+; RV64-NEXT: vsseg7e64.v v1, (a0)
+; RV64-NEXT: vmv1r.v v26, v19
+; RV64-NEXT: vsseg7e64.v v21, (a1)
+; RV64-NEXT: vl1re64.v v10, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v11, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v12, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v13, (a6)
+; RV64-NEXT: add a6, a3, a2
+; RV64-NEXT: vl1re64.v v18, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v19, (a6)
+; RV64-NEXT: vl1re64.v v16, (a0)
+; RV64-NEXT: vl1re64.v v8, (a4)
+; RV64-NEXT: vl1re64.v v17, (a3)
+; RV64-NEXT: vl1re64.v v9, (a7)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 14
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v20, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v21, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: vl1re64.v v22, (a6)
+; RV64-NEXT: vl1re64.v v23, (a1)
+; RV64-NEXT: add a5, a0, a5
+; RV64-NEXT: vs2r.v v12, (a5)
+; RV64-NEXT: vs4r.v v8, (a2)
+; RV64-NEXT: vs8r.v v16, (a0)
+; RV64-NEXT: vl8re64.v v16, (a2)
+; RV64-NEXT: vl8re64.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: slli a0, a0, 5
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v26, v20
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv2r.v v24, v16
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: slli a2, a1, 3
+; ZVBB-RV32-NEXT: sub a1, a2, a1
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv2r.v v20, v8
+; ZVBB-RV32-NEXT: vmv1r.v v1, v20
+; ZVBB-RV32-NEXT: vmv1r.v v3, v22
+; ZVBB-RV32-NEXT: vmv1r.v v5, v24
+; ZVBB-RV32-NEXT: vmv1r.v v7, v26
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v2, v10
+; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: slli a5, a2, 2
+; ZVBB-RV32-NEXT: vmv1r.v v4, v14
+; ZVBB-RV32-NEXT: slli a6, a2, 4
+; ZVBB-RV32-NEXT: add a7, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v6, v18
+; ZVBB-RV32-NEXT: sub a5, a6, a5
+; ZVBB-RV32-NEXT: vmv1r.v v22, v11
+; ZVBB-RV32-NEXT: add a6, a7, a2
+; ZVBB-RV32-NEXT: vmv1r.v v24, v15
+; ZVBB-RV32-NEXT: vsseg7e64.v v1, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v26, v19
+; ZVBB-RV32-NEXT: vsseg7e64.v v21, (a1)
+; ZVBB-RV32-NEXT: vl1re64.v v10, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v11, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v12, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v13, (a6)
+; ZVBB-RV32-NEXT: add a6, a3, a2
+; ZVBB-RV32-NEXT: vl1re64.v v18, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v19, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v16, (a0)
+; ZVBB-RV32-NEXT: vl1re64.v v8, (a4)
+; ZVBB-RV32-NEXT: vl1re64.v v17, (a3)
+; ZVBB-RV32-NEXT: vl1re64.v v9, (a7)
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a3, 14
+; ZVBB-RV32-NEXT: mul a0, a0, a3
+; ZVBB-RV32-NEXT: add a0, sp, a0
+; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v20, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v21, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a0, a2
+; ZVBB-RV32-NEXT: vl1re64.v v22, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v23, (a1)
+; ZVBB-RV32-NEXT: add a5, a0, a5
+; ZVBB-RV32-NEXT: vs2r.v v12, (a5)
+; ZVBB-RV32-NEXT: vs4r.v v8, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v16, (a0)
+; ZVBB-RV32-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re64.v v8, (a0)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
+;
+; ZVBB-RV64-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: slli a0, a0, 5
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v26, v20
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv2r.v v24, v16
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: slli a2, a1, 3
+; ZVBB-RV64-NEXT: sub a1, a2, a1
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv2r.v v20, v8
+; ZVBB-RV64-NEXT: vmv1r.v v1, v20
+; ZVBB-RV64-NEXT: vmv1r.v v3, v22
+; ZVBB-RV64-NEXT: vmv1r.v v5, v24
+; ZVBB-RV64-NEXT: vmv1r.v v7, v26
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v2, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: slli a5, a2, 2
+; ZVBB-RV64-NEXT: vmv1r.v v4, v14
+; ZVBB-RV64-NEXT: slli a6, a2, 4
+; ZVBB-RV64-NEXT: add a7, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v6, v18
+; ZVBB-RV64-NEXT: sub a5, a6, a5
+; ZVBB-RV64-NEXT: vmv1r.v v22, v11
+; ZVBB-RV64-NEXT: add a6, a7, a2
+; ZVBB-RV64-NEXT: vmv1r.v v24, v15
+; ZVBB-RV64-NEXT: vsseg7e64.v v1, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v26, v19
+; ZVBB-RV64-NEXT: vsseg7e64.v v21, (a1)
+; ZVBB-RV64-NEXT: vl1re64.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v11, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v12, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v13, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1re64.v v18, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v19, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v16, (a0)
+; ZVBB-RV64-NEXT: vl1re64.v v8, (a4)
+; ZVBB-RV64-NEXT: vl1re64.v v17, (a3)
+; ZVBB-RV64-NEXT: vl1re64.v v9, (a7)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 14
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v20, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v21, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vl1re64.v v22, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v23, (a1)
+; ZVBB-RV64-NEXT: add a5, a0, a5
+; ZVBB-RV64-NEXT: vs2r.v v12, (a5)
+; ZVBB-RV64-NEXT: vs4r.v v8, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v16, (a0)
+; ZVBB-RV64-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re64.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
+  ; Interleave the seven operands element-wise; the result holds
+  ; a[0], b[0], c[0], d[0], e[0], f[0], g[0], a[1], ... per the
+  ; llvm.vector.interleave7 contract.
+  %res = call <vscale x 14 x i64> @llvm.vector.interleave7.nxv14i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g)
+  ret <vscale x 14 x i64> %res
+}
More information about the llvm-commits
mailing list