[llvm] [IR] Add llvm.vector.[de]interleave{4,6,8} (PR #139893)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Mon May 26 06:31:24 PDT 2025
https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/139893
From 915c27d908e4a6ba43b35c957bf7aacbf408dc91 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 12 May 2025 07:08:16 +0300
Subject: [PATCH 1/5] [IR] Add llvm.vector.(de)interleave4/6/8
---
llvm/docs/LangRef.rst | 10 +-
llvm/include/llvm/IR/Intrinsics.h | 30 +-
llvm/include/llvm/IR/Intrinsics.td | 66 +
.../SelectionDAG/SelectionDAGBuilder.cpp | 18 +
llvm/lib/IR/Intrinsics.cpp | 28 +-
.../RISCV/rvv/vector-deinterleave-fixed.ll | 502 +-
.../CodeGen/RISCV/rvv/vector-deinterleave.ll | 1753 +-
.../RISCV/rvv/vector-interleave-fixed.ll | 787 +-
.../CodeGen/RISCV/rvv/vector-interleave.ll | 13856 +++++++++++-----
9 files changed, 12995 insertions(+), 4055 deletions(-)
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 7296bb84b7d95..c0bc0a10ed537 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -20158,7 +20158,7 @@ Arguments:
The argument to this intrinsic must be a vector.
-'``llvm.vector.deinterleave2/3/5/7``' Intrinsic
+'``llvm.vector.deinterleave2/3/4/5/6/7/8``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
@@ -20176,8 +20176,8 @@ This is an overloaded intrinsic.
Overview:
"""""""""
-The '``llvm.vector.deinterleave2/3/5/7``' intrinsics deinterleave adjacent lanes
-into 2, 3, 5, and 7 separate vectors, respectively, and return them as the
+The '``llvm.vector.deinterleave2/3/4/5/6/7/8``' intrinsics deinterleave adjacent lanes
+into 2 through 8 separate vectors, respectively, and return them as the
result.
This intrinsic works for both fixed and scalable vectors. While this intrinsic
@@ -20199,7 +20199,7 @@ Arguments:
The argument is a vector whose type corresponds to the logical concatenation of
the aggregated result types.
-'``llvm.vector.interleave2/3/5/7``' Intrinsic
+'``llvm.vector.interleave2/3/4/5/6/7/8``' Intrinsic
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Syntax:
@@ -20217,7 +20217,7 @@ This is an overloaded intrinsic.
Overview:
"""""""""
-The '``llvm.vector.interleave2/3/5/7``' intrinsic constructs a vector
+The '``llvm.vector.interleave2/3/4/5/6/7/8``' intrinsics construct a vector
by interleaving all the input vectors.
This intrinsic works for both fixed and scalable vectors. While this intrinsic
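Not part of the patch itself, but a minimal IR sketch of how the new factor-4 pair composes (the function name is hypothetical; the mangled suffixes and aggregate return type follow the same convention as the existing 2/3/5/7 intrinsics, and the 6/8 variants behave the same way with six and eight sub-vectors):

; Round-trip a <8 x i32> through the new factor-4 intrinsics: deinterleave4
; splits the input into four quarter-width vectors, interleave4 rebuilds it.
define <8 x i32> @deinterleave4_roundtrip(<8 x i32> %v) {
  %d  = call {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @llvm.vector.deinterleave4.v8i32(<8 x i32> %v)
  %d0 = extractvalue {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} %d, 0
  %d1 = extractvalue {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} %d, 1
  %d2 = extractvalue {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} %d, 2
  %d3 = extractvalue {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} %d, 3
  %r  = call <8 x i32> @llvm.vector.interleave4.v8i32(<2 x i32> %d0, <2 x i32> %d1, <2 x i32> %d2, <2 x i32> %d3)
  ret <8 x i32> %r
}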
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 6fb1bf9359b9a..b64784909fc25 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -153,8 +153,11 @@ namespace Intrinsic {
TruncArgument,
HalfVecArgument,
OneThirdVecArgument,
+ OneFourthVecArgument,
OneFifthVecArgument,
+ OneSixthVecArgument,
OneSeventhVecArgument,
+ OneEighthVecArgument,
SameVecWidthArgument,
VecOfAnyPtrsToElt,
VecElementArgument,
@@ -167,8 +170,11 @@ namespace Intrinsic {
} Kind;
-// These three have to be contiguous.
+// These six have to be contiguous.
- static_assert(OneFifthVecArgument == OneThirdVecArgument + 1 &&
- OneSeventhVecArgument == OneFifthVecArgument + 1);
+ static_assert(OneFourthVecArgument == OneThirdVecArgument + 1 &&
+ OneFifthVecArgument == OneFourthVecArgument + 1 &&
+ OneSixthVecArgument == OneFifthVecArgument + 1 &&
+ OneSeventhVecArgument == OneSixthVecArgument + 1 &&
+ OneEighthVecArgument == OneSeventhVecArgument + 1);
union {
unsigned Integer_Width;
unsigned Float_Width;
@@ -188,19 +194,23 @@ namespace Intrinsic {
unsigned getArgumentNumber() const {
assert(Kind == Argument || Kind == ExtendArgument ||
Kind == TruncArgument || Kind == HalfVecArgument ||
- Kind == OneThirdVecArgument || Kind == OneFifthVecArgument ||
- Kind == OneSeventhVecArgument || Kind == SameVecWidthArgument ||
- Kind == VecElementArgument || Kind == Subdivide2Argument ||
- Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
+ Kind == OneThirdVecArgument || Kind == OneFourthVecArgument ||
+ Kind == OneFifthVecArgument || Kind == OneSixthVecArgument ||
+ Kind == OneSeventhVecArgument || Kind == OneEighthVecArgument ||
+ Kind == SameVecWidthArgument || Kind == VecElementArgument ||
+ Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
+ Kind == VecOfBitcastsToInt);
return Argument_Info >> 3;
}
ArgKind getArgumentKind() const {
assert(Kind == Argument || Kind == ExtendArgument ||
Kind == TruncArgument || Kind == HalfVecArgument ||
- Kind == OneThirdVecArgument || Kind == OneFifthVecArgument ||
- Kind == OneSeventhVecArgument || Kind == SameVecWidthArgument ||
- Kind == VecElementArgument || Kind == Subdivide2Argument ||
- Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
+ Kind == OneThirdVecArgument || Kind == OneFourthVecArgument ||
+ Kind == OneFifthVecArgument || Kind == OneSixthVecArgument ||
+ Kind == OneSeventhVecArgument || Kind == OneEighthVecArgument ||
+ Kind == SameVecWidthArgument || Kind == VecElementArgument ||
+ Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
+ Kind == VecOfBitcastsToInt);
return (ArgKind)(Argument_Info & 7);
}
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 8d26961eebbf3..3994a543f9dcf 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -340,6 +340,9 @@ def IIT_ONE_FIFTH_VEC_ARG : IIT_Base<63>;
def IIT_ONE_SEVENTH_VEC_ARG : IIT_Base<64>;
def IIT_V2048: IIT_Vec<2048, 65>;
def IIT_V4096: IIT_Vec<4096, 66>;
+def IIT_ONE_FOURTH_VEC_ARG : IIT_Base<67>;
+def IIT_ONE_SIXTH_VEC_ARG : IIT_Base<68>;
+def IIT_ONE_EIGHTH_VEC_ARG : IIT_Base<69>;
}
defvar IIT_all_FixedTypes = !filter(iit, IIT_all,
@@ -483,12 +486,21 @@ class LLVMHalfElementsVectorType<int num>
class LLVMOneThirdElementsVectorType<int num>
: LLVMMatchType<num, IIT_ONE_THIRD_VEC_ARG>;
+class LLVMOneFourthElementsVectorType<int num>
+ : LLVMMatchType<num, IIT_ONE_FOURTH_VEC_ARG>;
+
class LLVMOneFifthElementsVectorType<int num>
: LLVMMatchType<num, IIT_ONE_FIFTH_VEC_ARG>;
+class LLVMOneSixthElementsVectorType<int num>
+ : LLVMMatchType<num, IIT_ONE_SIXTH_VEC_ARG>;
+
class LLVMOneSeventhElementsVectorType<int num>
: LLVMMatchType<num, IIT_ONE_SEVENTH_VEC_ARG>;
+class LLVMOneEighthElementsVectorType<int num>
+ : LLVMMatchType<num, IIT_ONE_EIGHTH_VEC_ARG>;
+
// Match the type of another intrinsic parameter that is expected to be a
// vector type (i.e. <N x iM>) but with each element subdivided to
// form a vector with more elements that are smaller than the original.
@@ -2776,6 +2788,20 @@ def int_vector_deinterleave3 : DefaultAttrsIntrinsic<[LLVMOneThirdElementsVector
[llvm_anyvector_ty],
[IntrNoMem]>;
+def int_vector_interleave4 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMOneFourthElementsVectorType<0>,
+ LLVMOneFourthElementsVectorType<0>,
+ LLVMOneFourthElementsVectorType<0>,
+ LLVMOneFourthElementsVectorType<0>],
+ [IntrNoMem]>;
+
+def int_vector_deinterleave4 : DefaultAttrsIntrinsic<[LLVMOneFourthElementsVectorType<0>,
+ LLVMOneFourthElementsVectorType<0>,
+ LLVMOneFourthElementsVectorType<0>,
+ LLVMOneFourthElementsVectorType<0>],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+
def int_vector_interleave5 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMOneFifthElementsVectorType<0>,
LLVMOneFifthElementsVectorType<0>,
@@ -2792,6 +2818,24 @@ def int_vector_deinterleave5 : DefaultAttrsIntrinsic<[LLVMOneFifthElementsVector
[llvm_anyvector_ty],
[IntrNoMem]>;
+def int_vector_interleave6 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMOneSixthElementsVectorType<0>,
+ LLVMOneSixthElementsVectorType<0>,
+ LLVMOneSixthElementsVectorType<0>,
+ LLVMOneSixthElementsVectorType<0>,
+ LLVMOneSixthElementsVectorType<0>,
+ LLVMOneSixthElementsVectorType<0>],
+ [IntrNoMem]>;
+
+def int_vector_deinterleave6 : DefaultAttrsIntrinsic<[LLVMOneSixthElementsVectorType<0>,
+ LLVMOneSixthElementsVectorType<0>,
+ LLVMOneSixthElementsVectorType<0>,
+ LLVMOneSixthElementsVectorType<0>,
+ LLVMOneSixthElementsVectorType<0>,
+ LLVMOneSixthElementsVectorType<0>],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+
def int_vector_interleave7 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[LLVMOneSeventhElementsVectorType<0>,
LLVMOneSeventhElementsVectorType<0>,
@@ -2812,6 +2856,28 @@ def int_vector_deinterleave7 : DefaultAttrsIntrinsic<[LLVMOneSeventhElementsVect
[llvm_anyvector_ty],
[IntrNoMem]>;
+def int_vector_interleave8 : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMOneEighthElementsVectorType<0>,
+ LLVMOneEighthElementsVectorType<0>,
+ LLVMOneEighthElementsVectorType<0>,
+ LLVMOneEighthElementsVectorType<0>,
+ LLVMOneEighthElementsVectorType<0>,
+ LLVMOneEighthElementsVectorType<0>,
+ LLVMOneEighthElementsVectorType<0>,
+ LLVMOneEighthElementsVectorType<0>],
+ [IntrNoMem]>;
+
+def int_vector_deinterleave8 : DefaultAttrsIntrinsic<[LLVMOneEighthElementsVectorType<0>,
+ LLVMOneEighthElementsVectorType<0>,
+ LLVMOneEighthElementsVectorType<0>,
+ LLVMOneEighthElementsVectorType<0>,
+ LLVMOneEighthElementsVectorType<0>,
+ LLVMOneEighthElementsVectorType<0>,
+ LLVMOneEighthElementsVectorType<0>,
+ LLVMOneEighthElementsVectorType<0>],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+
//===-------------- Intrinsics to perform partial reduction ---------------===//
def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>],
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 9d138d364bad7..10ee75a83a267 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8181,24 +8181,42 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::vector_interleave3:
visitVectorInterleave(I, 3);
return;
+ case Intrinsic::vector_interleave4:
+ visitVectorInterleave(I, 4);
+ return;
case Intrinsic::vector_interleave5:
visitVectorInterleave(I, 5);
return;
+ case Intrinsic::vector_interleave6:
+ visitVectorInterleave(I, 6);
+ return;
case Intrinsic::vector_interleave7:
visitVectorInterleave(I, 7);
return;
+ case Intrinsic::vector_interleave8:
+ visitVectorInterleave(I, 8);
+ return;
case Intrinsic::vector_deinterleave2:
visitVectorDeinterleave(I, 2);
return;
case Intrinsic::vector_deinterleave3:
visitVectorDeinterleave(I, 3);
return;
+ case Intrinsic::vector_deinterleave4:
+ visitVectorDeinterleave(I, 4);
+ return;
case Intrinsic::vector_deinterleave5:
visitVectorDeinterleave(I, 5);
return;
+ case Intrinsic::vector_deinterleave6:
+ visitVectorDeinterleave(I, 6);
+ return;
case Intrinsic::vector_deinterleave7:
visitVectorDeinterleave(I, 7);
return;
+ case Intrinsic::vector_deinterleave8:
+ visitVectorDeinterleave(I, 8);
+ return;
case Intrinsic::experimental_vector_compress:
setValue(&I, DAG.getNode(ISD::VECTOR_COMPRESS, sdl,
getValue(I.getArgOperand(0)).getValueType(),
diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp
index dabb5fe006b3c..28f7523476774 100644
--- a/llvm/lib/IR/Intrinsics.cpp
+++ b/llvm/lib/IR/Intrinsics.cpp
@@ -378,18 +378,36 @@ DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
IITDescriptor::get(IITDescriptor::OneThirdVecArgument, ArgInfo));
return;
}
+ case IIT_ONE_FOURTH_VEC_ARG: {
+ unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(
+ IITDescriptor::get(IITDescriptor::OneFourthVecArgument, ArgInfo));
+ return;
+ }
case IIT_ONE_FIFTH_VEC_ARG: {
unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
OutputTable.push_back(
IITDescriptor::get(IITDescriptor::OneFifthVecArgument, ArgInfo));
return;
}
+ case IIT_ONE_SIXTH_VEC_ARG: {
+ unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(
+ IITDescriptor::get(IITDescriptor::OneSixthVecArgument, ArgInfo));
+ return;
+ }
case IIT_ONE_SEVENTH_VEC_ARG: {
unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
OutputTable.push_back(
IITDescriptor::get(IITDescriptor::OneSeventhVecArgument, ArgInfo));
return;
}
+ case IIT_ONE_EIGHTH_VEC_ARG: {
+ unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(
+ IITDescriptor::get(IITDescriptor::OneEighthVecArgument, ArgInfo));
+ return;
+ }
case IIT_SAME_VEC_WIDTH_ARG: {
unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
OutputTable.push_back(
@@ -584,11 +602,14 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
return VectorType::getHalfElementsVectorType(
cast<VectorType>(Tys[D.getArgumentNumber()]));
case IITDescriptor::OneThirdVecArgument:
+ case IITDescriptor::OneFourthVecArgument:
case IITDescriptor::OneFifthVecArgument:
+ case IITDescriptor::OneSixthVecArgument:
case IITDescriptor::OneSeventhVecArgument:
+ case IITDescriptor::OneEighthVecArgument:
return VectorType::getOneNthElementsVectorType(
cast<VectorType>(Tys[D.getArgumentNumber()]),
- 3 + (D.Kind - IITDescriptor::OneThirdVecArgument) * 2);
+ 3 + (D.Kind - IITDescriptor::OneThirdVecArgument));
case IITDescriptor::SameVecWidthArgument: {
Type *EltTy = DecodeFixedType(Infos, Tys, Context);
Type *Ty = Tys[D.getArgumentNumber()];
@@ -974,15 +995,18 @@ matchIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor> &Infos,
VectorType::getHalfElementsVectorType(
cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
case IITDescriptor::OneThirdVecArgument:
+ case IITDescriptor::OneFourthVecArgument:
case IITDescriptor::OneFifthVecArgument:
+ case IITDescriptor::OneSixthVecArgument:
case IITDescriptor::OneSeventhVecArgument:
+ case IITDescriptor::OneEighthVecArgument:
// If this is a forward reference, defer the check for later.
if (D.getArgumentNumber() >= ArgTys.size())
return IsDeferredCheck || DeferCheck(Ty);
return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
VectorType::getOneNthElementsVectorType(
cast<VectorType>(ArgTys[D.getArgumentNumber()]),
- 3 + (D.Kind - IITDescriptor::OneThirdVecArgument) * 2) != Ty;
+ 3 + (D.Kind - IITDescriptor::OneThirdVecArgument)) != Ty;
case IITDescriptor::SameVecWidthArgument: {
if (D.getArgumentNumber() >= ArgTys.size()) {
// Defer check and subsequent check for the vector element type.
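For reference, since OneThirdVecArgument through OneEighthVecArgument are now contiguous, the element-count divisor falls straight out of the descriptor kind; the old "* 2" scale was only needed while just the odd factors 3, 5, and 7 existed:

  divisor = 3 + (Kind - OneThirdVecArgument)
  OneThird -> 3, OneFourth -> 4, OneFifth -> 5, OneSixth -> 6, OneSeventh -> 7, OneEighth -> 8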
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index f6b5a35aa06d6..a3ad0b26efd4d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -223,6 +223,41 @@ define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x
ret {<2 x i32>, <2 x i32>, <2 x i32>} %res
}
+define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave4_v2i32_v8i32(<8 x i32> %v) {
+; CHECK-LABEL: vector_deinterleave4_v2i32_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 6
+; CHECK-NEXT: vslidedown.vi v12, v8, 4
+; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v8, 2
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v12, v10, a0
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv.v.v v9, v12
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %res = call {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @llvm.vector.deinterleave4.v8i32(<8 x i32> %v)
+ ret {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} %res
+}
define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave5_v2i16_v10i16(<10 x i16> %v) {
; CHECK-LABEL: vector_deinterleave5_v2i16_v10i16:
@@ -265,6 +300,49 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterle
ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res
}
+define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave6_v2i16_v12i16(<12 x i16> %v) {
+; CHECK-LABEL: vector_deinterleave6_v2i16_v12i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v14, v8, 6
+; CHECK-NEXT: vslidedown.vi v15, v8, 4
+; CHECK-NEXT: vslidedown.vi v16, v8, 2
+; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 10
+; CHECK-NEXT: vslidedown.vi v12, v8, 8
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: add a3, a0, a0
+; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v15, v14, a1
+; CHECK-NEXT: vslideup.vx v8, v16, a1
+; CHECK-NEXT: vslideup.vx v12, v10, a1
+; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v15, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v9, v12
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %res = call {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @llvm.vector.deinterleave6.v12i16(<12 x i16> %v)
+ ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res
+}
+
define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave7_v14i8_v2i8(<14 x i8> %v) {
; RV32-LABEL: vector_deinterleave7_v14i8_v2i8:
; RV32: # %bb.0:
@@ -542,6 +620,300 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @v
ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res
}
+define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave8_v16i8_v2i8(<16 x i8> %v) {
+; RV32-LABEL: vector_deinterleave8_v16i8_v2i8:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -48
+; RV32-NEXT: .cfi_def_cfa_offset 48
+; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s2, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s3, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s4, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT: .cfi_offset ra, -4
+; RV32-NEXT: .cfi_offset s0, -8
+; RV32-NEXT: .cfi_offset s1, -12
+; RV32-NEXT: .cfi_offset s2, -16
+; RV32-NEXT: .cfi_offset s3, -20
+; RV32-NEXT: .cfi_offset s4, -24
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 1
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 3 * vlenb
+; RV32-NEXT: addi a0, sp, 16
+; RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: csrr s1, vlenb
+; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 10
+; RV32-NEXT: vslidedown.vi v9, v8, 8
+; RV32-NEXT: srli s0, s1, 3
+; RV32-NEXT: srli s2, s1, 2
+; RV32-NEXT: add s3, s0, s0
+; RV32-NEXT: add s4, s2, s0
+; RV32-NEXT: vsetvli zero, s3, e8, mf2, tu, ma
+; RV32-NEXT: vslideup.vx v9, v10, s0
+; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v10, v8, 12
+; RV32-NEXT: vsetvli zero, s4, e8, mf2, tu, ma
+; RV32-NEXT: vslideup.vx v9, v10, s2
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT: li a1, 3
+; RV32-NEXT: mv a0, s0
+; RV32-NEXT: call __mulsi3
+; RV32-NEXT: add a1, a0, s0
+; RV32-NEXT: addi a2, sp, 16
+; RV32-NEXT: vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; RV32-NEXT: vslidedown.vi v8, v12, 14
+; RV32-NEXT: vslidedown.vi v9, v12, 2
+; RV32-NEXT: vmv1r.v v10, v12
+; RV32-NEXT: vslidedown.vi v11, v12, 4
+; RV32-NEXT: vslidedown.vi v12, v12, 6
+; RV32-NEXT: srli s1, s1, 1
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: add a2, sp, a2
+; RV32-NEXT: addi a2, a2, 16
+; RV32-NEXT: vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v13, v8, a0
+; RV32-NEXT: vsetvli zero, s3, e8, mf2, tu, ma
+; RV32-NEXT: vslideup.vx v10, v9, s0
+; RV32-NEXT: add a2, s1, s1
+; RV32-NEXT: vsetvli zero, s4, e8, mf2, tu, ma
+; RV32-NEXT: vslideup.vx v10, v11, s2
+; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT: vslideup.vx v10, v12, a0
+; RV32-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; RV32-NEXT: vslideup.vx v10, v13, s1
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 1
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 16
+; RV32-NEXT: vs1r.v v10, (a0)
+; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; RV32-NEXT: vlseg8e8.v v8, (a0)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a1, a0, 1
+; RV32-NEXT: add a0, a1, a0
+; RV32-NEXT: add sp, sp, a0
+; RV32-NEXT: .cfi_def_cfa sp, 48
+; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s2, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s3, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s4, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT: .cfi_restore ra
+; RV32-NEXT: .cfi_restore s0
+; RV32-NEXT: .cfi_restore s1
+; RV32-NEXT: .cfi_restore s2
+; RV32-NEXT: .cfi_restore s3
+; RV32-NEXT: .cfi_restore s4
+; RV32-NEXT: addi sp, sp, 48
+; RV32-NEXT: .cfi_def_cfa_offset 0
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_deinterleave8_v16i8_v2i8:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -64
+; RV64-NEXT: .cfi_def_cfa_offset 64
+; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s2, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s3, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s4, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT: .cfi_offset ra, -8
+; RV64-NEXT: .cfi_offset s0, -16
+; RV64-NEXT: .cfi_offset s1, -24
+; RV64-NEXT: .cfi_offset s2, -32
+; RV64-NEXT: .cfi_offset s3, -40
+; RV64-NEXT: .cfi_offset s4, -48
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a1, a0, 1
+; RV64-NEXT: add a0, a1, a0
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 3 * vlenb
+; RV64-NEXT: addi a0, sp, 16
+; RV64-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: csrr s1, vlenb
+; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 10
+; RV64-NEXT: vslidedown.vi v9, v8, 8
+; RV64-NEXT: srli s0, s1, 3
+; RV64-NEXT: srli s2, s1, 2
+; RV64-NEXT: add s3, s0, s0
+; RV64-NEXT: add s4, s2, s0
+; RV64-NEXT: vsetvli zero, s3, e8, mf2, tu, ma
+; RV64-NEXT: vslideup.vx v9, v10, s0
+; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v10, v8, 12
+; RV64-NEXT: vsetvli zero, s4, e8, mf2, tu, ma
+; RV64-NEXT: vslideup.vx v9, v10, s2
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT: li a1, 3
+; RV64-NEXT: mv a0, s0
+; RV64-NEXT: call __muldi3
+; RV64-NEXT: add a1, a0, s0
+; RV64-NEXT: addi a2, sp, 16
+; RV64-NEXT: vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; RV64-NEXT: vslidedown.vi v8, v12, 14
+; RV64-NEXT: vslidedown.vi v9, v12, 2
+; RV64-NEXT: vmv1r.v v10, v12
+; RV64-NEXT: vslidedown.vi v11, v12, 4
+; RV64-NEXT: vslidedown.vi v12, v12, 6
+; RV64-NEXT: srli s1, s1, 1
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: add a2, sp, a2
+; RV64-NEXT: addi a2, a2, 16
+; RV64-NEXT: vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v13, v8, a0
+; RV64-NEXT: vsetvli zero, s3, e8, mf2, tu, ma
+; RV64-NEXT: vslideup.vx v10, v9, s0
+; RV64-NEXT: add a2, s1, s1
+; RV64-NEXT: vsetvli zero, s4, e8, mf2, tu, ma
+; RV64-NEXT: vslideup.vx v10, v11, s2
+; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT: vslideup.vx v10, v12, a0
+; RV64-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; RV64-NEXT: vslideup.vx v10, v13, s1
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 1
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 16
+; RV64-NEXT: vs1r.v v10, (a0)
+; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; RV64-NEXT: vlseg8e8.v v8, (a0)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a1, a0, 1
+; RV64-NEXT: add a0, a1, a0
+; RV64-NEXT: add sp, sp, a0
+; RV64-NEXT: .cfi_def_cfa sp, 64
+; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s2, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s3, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s4, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT: .cfi_restore ra
+; RV64-NEXT: .cfi_restore s0
+; RV64-NEXT: .cfi_restore s1
+; RV64-NEXT: .cfi_restore s2
+; RV64-NEXT: .cfi_restore s3
+; RV64-NEXT: .cfi_restore s4
+; RV64-NEXT: addi sp, sp, 64
+; RV64-NEXT: .cfi_def_cfa_offset 0
+; RV64-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave8_v16i8_v2i8:
+; ZIP: # %bb.0:
+; ZIP-NEXT: addi sp, sp, -64
+; ZIP-NEXT: .cfi_def_cfa_offset 64
+; ZIP-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
+; ZIP-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
+; ZIP-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
+; ZIP-NEXT: sd s2, 32(sp) # 8-byte Folded Spill
+; ZIP-NEXT: sd s3, 24(sp) # 8-byte Folded Spill
+; ZIP-NEXT: sd s4, 16(sp) # 8-byte Folded Spill
+; ZIP-NEXT: .cfi_offset ra, -8
+; ZIP-NEXT: .cfi_offset s0, -16
+; ZIP-NEXT: .cfi_offset s1, -24
+; ZIP-NEXT: .cfi_offset s2, -32
+; ZIP-NEXT: .cfi_offset s3, -40
+; ZIP-NEXT: .cfi_offset s4, -48
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a1, a0, 1
+; ZIP-NEXT: add a0, a1, a0
+; ZIP-NEXT: sub sp, sp, a0
+; ZIP-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 3 * vlenb
+; ZIP-NEXT: addi a0, sp, 16
+; ZIP-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; ZIP-NEXT: csrr s1, vlenb
+; ZIP-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; ZIP-NEXT: vslidedown.vi v10, v8, 10
+; ZIP-NEXT: vslidedown.vi v9, v8, 8
+; ZIP-NEXT: srli s0, s1, 3
+; ZIP-NEXT: srli s2, s1, 2
+; ZIP-NEXT: add s3, s0, s0
+; ZIP-NEXT: add s4, s2, s0
+; ZIP-NEXT: vsetvli zero, s3, e8, mf2, tu, ma
+; ZIP-NEXT: vslideup.vx v9, v10, s0
+; ZIP-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; ZIP-NEXT: vslidedown.vi v10, v8, 12
+; ZIP-NEXT: vsetvli zero, s4, e8, mf2, tu, ma
+; ZIP-NEXT: vslideup.vx v9, v10, s2
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: add a0, sp, a0
+; ZIP-NEXT: addi a0, a0, 16
+; ZIP-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; ZIP-NEXT: li a1, 3
+; ZIP-NEXT: mv a0, s0
+; ZIP-NEXT: call __muldi3
+; ZIP-NEXT: add a1, a0, s0
+; ZIP-NEXT: addi a2, sp, 16
+; ZIP-NEXT: vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; ZIP-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; ZIP-NEXT: vslidedown.vi v8, v12, 14
+; ZIP-NEXT: vslidedown.vi v9, v12, 2
+; ZIP-NEXT: vmv1r.v v10, v12
+; ZIP-NEXT: vslidedown.vi v11, v12, 4
+; ZIP-NEXT: vslidedown.vi v12, v12, 6
+; ZIP-NEXT: srli s1, s1, 1
+; ZIP-NEXT: csrr a2, vlenb
+; ZIP-NEXT: add a2, sp, a2
+; ZIP-NEXT: addi a2, a2, 16
+; ZIP-NEXT: vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; ZIP-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; ZIP-NEXT: vslideup.vx v13, v8, a0
+; ZIP-NEXT: vsetvli zero, s3, e8, mf2, tu, ma
+; ZIP-NEXT: vslideup.vx v10, v9, s0
+; ZIP-NEXT: add a2, s1, s1
+; ZIP-NEXT: vsetvli zero, s4, e8, mf2, tu, ma
+; ZIP-NEXT: vslideup.vx v10, v11, s2
+; ZIP-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; ZIP-NEXT: vslideup.vx v10, v12, a0
+; ZIP-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; ZIP-NEXT: vslideup.vx v10, v13, s1
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a0, a0, 1
+; ZIP-NEXT: add a0, sp, a0
+; ZIP-NEXT: addi a0, a0, 16
+; ZIP-NEXT: vs1r.v v10, (a0)
+; ZIP-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; ZIP-NEXT: vlseg8e8.v v8, (a0)
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a1, a0, 1
+; ZIP-NEXT: add a0, a1, a0
+; ZIP-NEXT: add sp, sp, a0
+; ZIP-NEXT: .cfi_def_cfa sp, 64
+; ZIP-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
+; ZIP-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
+; ZIP-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
+; ZIP-NEXT: ld s2, 32(sp) # 8-byte Folded Reload
+; ZIP-NEXT: ld s3, 24(sp) # 8-byte Folded Reload
+; ZIP-NEXT: ld s4, 16(sp) # 8-byte Folded Reload
+; ZIP-NEXT: .cfi_restore ra
+; ZIP-NEXT: .cfi_restore s0
+; ZIP-NEXT: .cfi_restore s1
+; ZIP-NEXT: .cfi_restore s2
+; ZIP-NEXT: .cfi_restore s3
+; ZIP-NEXT: .cfi_restore s4
+; ZIP-NEXT: addi sp, sp, 64
+; ZIP-NEXT: .cfi_def_cfa_offset 0
+; ZIP-NEXT: ret
+ %res = call {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @llvm.vector.deinterleave8.v16i8(<16 x i8> %v)
+ ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res
+}
; Floats
@@ -695,8 +1067,8 @@ define {<4 x double>, <4 x double>} @vector_deinterleave_v4f64_v8f64(<8 x double
ret {<4 x double>, <4 x double>} %retval
}
-define {<2 x float>, <2 x float>, <2 x float>} @vector_deinterleave3_v632_v2f32(<6 x float> %v) {
-; CHECK-LABEL: vector_deinterleave3_v632_v2f32:
+define {<2 x float>, <2 x float>, <2 x float>} @vector_deinterleave3_v6f32_v2f32(<6 x float> %v) {
+; CHECK-LABEL: vector_deinterleave3_v6f32_v2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
@@ -729,6 +1101,41 @@ define {<2 x float>, <2 x float>, <2 x float>} @vector_deinterleave3_v632_v2f32(
ret {<2 x float>, <2 x float>, <2 x float>} %res
}
+define {<2 x float>, <2 x float>, <2 x float>, <2 x float>} @vector_deinterleave4_v8f32_v2f32(<8 x float> %v) {
+; CHECK-LABEL: vector_deinterleave4_v8f32_v2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 6
+; CHECK-NEXT: vslidedown.vi v12, v8, 4
+; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v8, 2
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v12, v10, a0
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv.v.v v9, v12
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %res = call {<2 x float>, <2 x float>, <2 x float>, <2 x float>} @llvm.vector.deinterleave4.v8f32(<8 x float> %v)
+ ret {<2 x float>, <2 x float>, <2 x float>, <2 x float>} %res
+}
define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_deinterleave5_v10f16_v2f16(<10 x half> %v) {
; CHECK-LABEL: vector_deinterleave5_v10f16_v2f16:
@@ -771,6 +1178,49 @@ define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_dein
ret {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} %res
}
+define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_deinterleave6_v12f16_v2f16(<12 x half> %v) {
+; CHECK-LABEL: vector_deinterleave6_v12f16_v2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v14, v8, 6
+; CHECK-NEXT: vslidedown.vi v15, v8, 4
+; CHECK-NEXT: vslidedown.vi v16, v8, 2
+; CHECK-NEXT: vsetivli zero, 2, e16, m2, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 10
+; CHECK-NEXT: vslidedown.vi v12, v8, 8
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: add a3, a0, a0
+; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v15, v14, a1
+; CHECK-NEXT: vslideup.vx v8, v16, a1
+; CHECK-NEXT: vslideup.vx v12, v10, a1
+; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v15, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v9, v12
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %res = call {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @llvm.vector.deinterleave6.v12f16(<12 x half> %v)
+ ret {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} %res
+}
+
define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @vector_deinterleave7_v7f16_v1f16(<7 x half> %v) {
; CHECK-LABEL: vector_deinterleave7_v7f16_v1f16:
; CHECK: # %bb.0:
@@ -817,3 +1267,51 @@ define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>,
%res = call {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @llvm.vector.deinterleave7.v7f16(<7 x half> %v)
ret {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} %res
}
+
+define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @vector_deinterleave8_v8f16_v1f16(<8 x half> %v) {
+; CHECK-LABEL: vector_deinterleave8_v8f16_v1f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 7
+; CHECK-NEXT: vslidedown.vi v11, v8, 6
+; CHECK-NEXT: vslidedown.vi v12, v8, 5
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: vslidedown.vi v9, v8, 4
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: add a3, a0, a0
+; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v11, v10, a1
+; CHECK-NEXT: vslideup.vx v9, v12, a1
+; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v11, a0
+; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v10, v8, 3
+; CHECK-NEXT: vslidedown.vi v11, v8, 2
+; CHECK-NEXT: vslidedown.vi v12, v8, 1
+; CHECK-NEXT: vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v11, v10, a1
+; CHECK-NEXT: vslideup.vx v8, v12, a1
+; CHECK-NEXT: vsetvli zero, a3, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v11, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+ %res = call {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @llvm.vector.deinterleave8.v8f16(<8 x half> %v)
+ ret {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} %res
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index e316c022727ab..6a08f5a28a295 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -468,6 +468,131 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_dein
ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
}
+define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv64i1(<vscale x 64 x i1> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv64i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv1r.v v8, v0
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vmerge.vim v16, v10, 1, v0
+; CHECK-NEXT: srli a1, a0, 2
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a1
+; CHECK-NEXT: srli a1, a0, 1
+; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v18, v10, 1, v0
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v8, a1
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: sub a0, a0, a1
+; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v20, v10, 1, v0
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v8, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v22, v10, 1, v0
+; CHECK-NEXT: vs8r.v v16, (a0)
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: vmsne.vi v8, v10, 0
+; CHECK-NEXT: vmsne.vi v9, v12, 0
+; CHECK-NEXT: vmsne.vi v10, v14, 0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave4.nxv64i1(<vscale x 64 x i1> %vec)
+ ret {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} %retval
+}
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv64i8(<vscale x 64 x i8> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv64i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT: vlseg4e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %vec)
+ ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv32i16(<vscale x 32 x i16> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv32i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave4.nxv32i16(<vscale x 32 x i16> %vec)
+ ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv16i32(<vscale x 16 x i32> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv16i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> %vec)
+ ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv8i64(<vscale x 8 x i64> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv8i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: vlseg4e64.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> %vec)
+ ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv80i1(<vscale x 80 x i1> %vec) nounwind {
; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv80i1:
; CHECK: # %bb.0:
@@ -700,6 +825,240 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2
ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
}
+define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv96i1(<vscale x 96 x i1> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv96i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vmerge.vim v16, v10, 1, v0
+; CHECK-NEXT: srli a2, a0, 2
+; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a2
+; CHECK-NEXT: srli a3, a0, 1
+; CHECK-NEXT: vsetvli a4, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v18, v10, 1, v0
+; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v9, a3
+; CHECK-NEXT: srli a3, a0, 3
+; CHECK-NEXT: slli a3, a3, 1
+; CHECK-NEXT: sub a0, a0, a3
+; CHECK-NEXT: vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v20, v10, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v26, v10, 1, v0
+; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v9, a0
+; CHECK-NEXT: vs8r.v v16, (a1)
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v24, v10, 1, v0
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v8, a2
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v28, v10, 1, v0
+; CHECK-NEXT: vs8r.v v24, (a0)
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlseg6e8.v v16, (a1)
+; CHECK-NEXT: vlseg6e8.v v10, (a0)
+; CHECK-NEXT: vmv2r.v v8, v16
+; CHECK-NEXT: vmv2r.v v22, v18
+; CHECK-NEXT: vmv2r.v v24, v20
+; CHECK-NEXT: vmv1r.v v9, v10
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: vmv1r.v v10, v17
+; CHECK-NEXT: vmsne.vi v8, v10, 0
+; CHECK-NEXT: vmv1r.v v23, v12
+; CHECK-NEXT: vmsne.vi v9, v22, 0
+; CHECK-NEXT: vmv1r.v v12, v19
+; CHECK-NEXT: vmsne.vi v10, v12, 0
+; CHECK-NEXT: vmv1r.v v25, v14
+; CHECK-NEXT: vmsne.vi v11, v24, 0
+; CHECK-NEXT: vmv1r.v v14, v21
+; CHECK-NEXT: vmsne.vi v12, v14, 0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave6.nxv96i1(<vscale x 96 x i1> %vec)
+ ret {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} %retval
+}
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv96i8(<vscale x 96 x i8> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv96i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv2r.v v24, v14
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv2r.v v26, v16
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg6e8.v v24, (a0)
+; CHECK-NEXT: vlseg6e8.v v18, (a1)
+; CHECK-NEXT: vmv2r.v v8, v24
+; CHECK-NEXT: vmv1r.v v9, v18
+; CHECK-NEXT: vmv1r.v v18, v25
+; CHECK-NEXT: vmv2r.v v12, v26
+; CHECK-NEXT: vmv1r.v v13, v20
+; CHECK-NEXT: vmv1r.v v20, v27
+; CHECK-NEXT: vmv2r.v v16, v28
+; CHECK-NEXT: vmv1r.v v17, v22
+; CHECK-NEXT: vmv1r.v v22, v29
+; CHECK-NEXT: vmv2r.v v10, v18
+; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: vmv2r.v v18, v22
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave6.nxv96i8(<vscale x 96 x i8> %vec)
+ ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv48i16(<vscale x 48 x i16> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv48i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv2r.v v24, v14
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv2r.v v26, v16
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg6e16.v v24, (a0)
+; CHECK-NEXT: vlseg6e16.v v18, (a1)
+; CHECK-NEXT: vmv2r.v v8, v24
+; CHECK-NEXT: vmv1r.v v9, v18
+; CHECK-NEXT: vmv1r.v v18, v25
+; CHECK-NEXT: vmv2r.v v12, v26
+; CHECK-NEXT: vmv1r.v v13, v20
+; CHECK-NEXT: vmv1r.v v20, v27
+; CHECK-NEXT: vmv2r.v v16, v28
+; CHECK-NEXT: vmv1r.v v17, v22
+; CHECK-NEXT: vmv1r.v v22, v29
+; CHECK-NEXT: vmv2r.v v10, v18
+; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: vmv2r.v v18, v22
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave6.nxv48i16(<vscale x 48 x i16> %vec)
+ ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv24i32(<vscale x 24 x i32> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv24i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv2r.v v24, v14
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv2r.v v26, v16
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg6e32.v v24, (a0)
+; CHECK-NEXT: vlseg6e32.v v18, (a1)
+; CHECK-NEXT: vmv2r.v v8, v24
+; CHECK-NEXT: vmv1r.v v9, v18
+; CHECK-NEXT: vmv1r.v v18, v25
+; CHECK-NEXT: vmv2r.v v12, v26
+; CHECK-NEXT: vmv1r.v v13, v20
+; CHECK-NEXT: vmv1r.v v20, v27
+; CHECK-NEXT: vmv2r.v v16, v28
+; CHECK-NEXT: vmv1r.v v17, v22
+; CHECK-NEXT: vmv1r.v v22, v29
+; CHECK-NEXT: vmv2r.v v10, v18
+; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: vmv2r.v v18, v22
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave6.nxv24i32(<vscale x 24 x i32> %vec)
+ ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv12i64(<vscale x 12 x i64> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv12i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv2r.v v24, v14
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv2r.v v26, v16
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg6e64.v v24, (a0)
+; CHECK-NEXT: vlseg6e64.v v18, (a1)
+; CHECK-NEXT: vmv2r.v v8, v24
+; CHECK-NEXT: vmv1r.v v9, v18
+; CHECK-NEXT: vmv1r.v v18, v25
+; CHECK-NEXT: vmv2r.v v12, v26
+; CHECK-NEXT: vmv1r.v v13, v20
+; CHECK-NEXT: vmv1r.v v20, v27
+; CHECK-NEXT: vmv2r.v v16, v28
+; CHECK-NEXT: vmv1r.v v17, v22
+; CHECK-NEXT: vmv1r.v v22, v29
+; CHECK-NEXT: vmv2r.v v10, v18
+; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: vmv2r.v v18, v22
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave6.nxv12i64(<vscale x 12 x i64> %vec)
+ ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv112i1(<vscale x 112 x i1> %vec) nounwind {
; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv112i1:
; CHECK: # %bb.0:
@@ -971,26 +1330,277 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2
ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
}
-; Floats
-
-define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv4bf16(<vscale x 4 x bfloat> %vec) {
-; V-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
-; V: # %bb.0:
-; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; V-NEXT: vnsrl.wi v10, v8, 0
-; V-NEXT: vnsrl.wi v9, v8, 16
-; V-NEXT: vmv1r.v v8, v10
-; V-NEXT: ret
-;
-; ZIP-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
-; ZIP: # %bb.0:
-; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
-; ZIP-NEXT: ri.vunzip2b.vv v9, v8, v11
-; ZIP-NEXT: vmv.v.v v8, v10
-; ZIP-NEXT: ret
-%retval = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave2.nxv4bf16(<vscale x 4 x bfloat> %vec)
-ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %retval
+define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv128i1(<vscale x 128 x i1> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv128i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv1r.v v9, v0
+; CHECK-NEXT: vmv.v.i v10, 0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vmerge.vim v16, v10, 1, v0
+; CHECK-NEXT: srli a1, a0, 2
+; CHECK-NEXT: srli a2, a0, 1
+; CHECK-NEXT: srli a3, a0, 3
+; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v0, a1
+; CHECK-NEXT: slli a3, a3, 1
+; CHECK-NEXT: vsetvli a4, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v18, v10, 1, v0
+; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v9, a2
+; CHECK-NEXT: sub a0, a0, a3
+; CHECK-NEXT: vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v20, v10, 1, v0
+; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v9, a0
+; CHECK-NEXT: vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v22, v10, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v24, v10, 1, v0
+; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v8, a1
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v26, v10, 1, v0
+; CHECK-NEXT: vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v8, a2
+; CHECK-NEXT: vs8r.v v16, (a1)
+; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v28, v10, 1, v0
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v0, v8, a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmerge.vim v30, v10, 1, v0
+; CHECK-NEXT: vs8r.v v24, (a0)
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlseg8e8.v v18, (a1)
+; CHECK-NEXT: vlseg8e8.v v10, (a0)
+; CHECK-NEXT: vmv2r.v v8, v18
+; CHECK-NEXT: vmv2r.v v26, v20
+; CHECK-NEXT: vmv2r.v v28, v22
+; CHECK-NEXT: vmv2r.v v30, v24
+; CHECK-NEXT: vmv1r.v v9, v10
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmsne.vi v0, v8, 0
+; CHECK-NEXT: vmv1r.v v10, v19
+; CHECK-NEXT: vmsne.vi v8, v10, 0
+; CHECK-NEXT: vmv1r.v v27, v12
+; CHECK-NEXT: vmsne.vi v9, v26, 0
+; CHECK-NEXT: vmv1r.v v12, v21
+; CHECK-NEXT: vmsne.vi v10, v12, 0
+; CHECK-NEXT: vmv1r.v v29, v14
+; CHECK-NEXT: vmsne.vi v11, v28, 0
+; CHECK-NEXT: vmv1r.v v14, v23
+; CHECK-NEXT: vmsne.vi v12, v14, 0
+; CHECK-NEXT: vmv1r.v v31, v16
+; CHECK-NEXT: vmsne.vi v13, v30, 0
+; CHECK-NEXT: vmv1r.v v16, v25
+; CHECK-NEXT: vmsne.vi v14, v16, 0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave8.nxv128i1(<vscale x 128 x i1> %vec)
+ ret {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} %retval
+}
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv128i8(<vscale x 128 x i8> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv128i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vs8r.v v16, (a1)
+; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT: vlseg8e8.v v0, (a0)
+; CHECK-NEXT: vlseg8e8.v v22, (a1)
+; CHECK-NEXT: vmv2r.v v8, v0
+; CHECK-NEXT: vmv1r.v v9, v22
+; CHECK-NEXT: vmv1r.v v22, v1
+; CHECK-NEXT: vmv2r.v v12, v2
+; CHECK-NEXT: vmv1r.v v13, v24
+; CHECK-NEXT: vmv1r.v v24, v3
+; CHECK-NEXT: vmv2r.v v16, v4
+; CHECK-NEXT: vmv1r.v v17, v26
+; CHECK-NEXT: vmv1r.v v26, v5
+; CHECK-NEXT: vmv2r.v v20, v6
+; CHECK-NEXT: vmv1r.v v21, v28
+; CHECK-NEXT: vmv1r.v v28, v7
+; CHECK-NEXT: vmv2r.v v10, v22
+; CHECK-NEXT: vmv2r.v v14, v24
+; CHECK-NEXT: vmv2r.v v18, v26
+; CHECK-NEXT: vmv2r.v v22, v28
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave8.nxv128i8(<vscale x 128 x i8> %vec)
+ ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv64i16(<vscale x 64 x i16> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv64i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vs8r.v v16, (a1)
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vlseg8e16.v v0, (a0)
+; CHECK-NEXT: vlseg8e16.v v22, (a1)
+; CHECK-NEXT: vmv2r.v v8, v0
+; CHECK-NEXT: vmv1r.v v9, v22
+; CHECK-NEXT: vmv1r.v v22, v1
+; CHECK-NEXT: vmv2r.v v12, v2
+; CHECK-NEXT: vmv1r.v v13, v24
+; CHECK-NEXT: vmv1r.v v24, v3
+; CHECK-NEXT: vmv2r.v v16, v4
+; CHECK-NEXT: vmv1r.v v17, v26
+; CHECK-NEXT: vmv1r.v v26, v5
+; CHECK-NEXT: vmv2r.v v20, v6
+; CHECK-NEXT: vmv1r.v v21, v28
+; CHECK-NEXT: vmv1r.v v28, v7
+; CHECK-NEXT: vmv2r.v v10, v22
+; CHECK-NEXT: vmv2r.v v14, v24
+; CHECK-NEXT: vmv2r.v v18, v26
+; CHECK-NEXT: vmv2r.v v22, v28
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave8.nxv64i16(<vscale x 64 x i16> %vec)
+ ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv32i32(<vscale x 32 x i32> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv32i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vs8r.v v16, (a1)
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vlseg8e32.v v0, (a0)
+; CHECK-NEXT: vlseg8e32.v v22, (a1)
+; CHECK-NEXT: vmv2r.v v8, v0
+; CHECK-NEXT: vmv1r.v v9, v22
+; CHECK-NEXT: vmv1r.v v22, v1
+; CHECK-NEXT: vmv2r.v v12, v2
+; CHECK-NEXT: vmv1r.v v13, v24
+; CHECK-NEXT: vmv1r.v v24, v3
+; CHECK-NEXT: vmv2r.v v16, v4
+; CHECK-NEXT: vmv1r.v v17, v26
+; CHECK-NEXT: vmv1r.v v26, v5
+; CHECK-NEXT: vmv2r.v v20, v6
+; CHECK-NEXT: vmv1r.v v21, v28
+; CHECK-NEXT: vmv1r.v v28, v7
+; CHECK-NEXT: vmv2r.v v10, v22
+; CHECK-NEXT: vmv2r.v v14, v24
+; CHECK-NEXT: vmv2r.v v18, v26
+; CHECK-NEXT: vmv2r.v v22, v28
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave8.nxv32i32(<vscale x 32 x i32> %vec)
+ ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv16i64(<vscale x 16 x i64> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv16i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vs8r.v v16, (a1)
+; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; CHECK-NEXT: vlseg8e64.v v0, (a0)
+; CHECK-NEXT: vlseg8e64.v v22, (a1)
+; CHECK-NEXT: vmv2r.v v8, v0
+; CHECK-NEXT: vmv1r.v v9, v22
+; CHECK-NEXT: vmv1r.v v22, v1
+; CHECK-NEXT: vmv2r.v v12, v2
+; CHECK-NEXT: vmv1r.v v13, v24
+; CHECK-NEXT: vmv1r.v v24, v3
+; CHECK-NEXT: vmv2r.v v16, v4
+; CHECK-NEXT: vmv1r.v v17, v26
+; CHECK-NEXT: vmv1r.v v26, v5
+; CHECK-NEXT: vmv2r.v v20, v6
+; CHECK-NEXT: vmv1r.v v21, v28
+; CHECK-NEXT: vmv1r.v v28, v7
+; CHECK-NEXT: vmv2r.v v10, v22
+; CHECK-NEXT: vmv2r.v v14, v24
+; CHECK-NEXT: vmv2r.v v18, v26
+; CHECK-NEXT: vmv2r.v v22, v28
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave8.nxv16i64(<vscale x 16 x i64> %vec)
+ ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
+; Floats
+
+define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv4bf16(<vscale x 4 x bfloat> %vec) {
+; V-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; V-NEXT: vnsrl.wi v10, v8, 0
+; V-NEXT: vnsrl.wi v9, v8, 16
+; V-NEXT: vmv1r.v v8, v10
+; V-NEXT: ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT: ri.vunzip2b.vv v9, v8, v11
+; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: ret
+ %retval = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave2.nxv4bf16(<vscale x 4 x bfloat> %vec)
+ ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %retval
}
define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv4f16(<vscale x 4 x half> %vec) {
@@ -1550,35 +2160,48 @@ define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @ve
ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %res
}
-define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv10f16(<vscale x 10 x half> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv10f16:
+define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv8f16(<vscale x 8 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv8f16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vx v11, v9, a0
-; CHECK-NEXT: vslideup.vx v9, v11, a0
-; CHECK-NEXT: vslidedown.vx v11, v8, a0
-; CHECK-NEXT: vslideup.vx v8, v11, a0
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave4.nxv8f16(<vscale x 8 x half> %arg)
+ ret {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} %res
+}
+
+define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv16f16(<vscale x 16 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs4r.v v8, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
- %res = call {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave5.nxv10f16(<vscale x 10 x half> %arg)
- ret {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} %res
+ %res = call {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave4.nxv16f16(<vscale x 16 x half> %arg)
+ ret {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} %res
}
-define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv20f16(<vscale x 20 x half> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv20f16:
+define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv32f16(<vscale x 32 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv32f16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
@@ -1586,86 +2209,59 @@ define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vlseg5e16.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
- %res = call {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave5.nxv20f16(<vscale x 20 x half> %arg)
- ret {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} %res
+ %res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave4.nxv32f16(<vscale x 32 x half> %arg)
+ ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res
}
-define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv40f16(<vscale x 40 x half> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv40f16:
+define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv8bf16(<vscale x 8 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv8bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT: vmv1r.v v26, v15
-; CHECK-NEXT: vmv1r.v v27, v16
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vmv1r.v v24, v13
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 3
-; CHECK-NEXT: add a1, sp, a1
-; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vmv1r.v v25, v14
-; CHECK-NEXT: vs8r.v v8, (a0)
-; CHECK-NEXT: vmv1r.v v28, v17
-; CHECK-NEXT: vs8r.v v24, (a1)
-; CHECK-NEXT: vlseg5e16.v v12, (a0)
-; CHECK-NEXT: vlseg5e16.v v18, (a1)
-; CHECK-NEXT: vmv2r.v v8, v12
-; CHECK-NEXT: vmv1r.v v9, v18
-; CHECK-NEXT: vmv1r.v v18, v13
-; CHECK-NEXT: vmv2r.v v12, v14
-; CHECK-NEXT: vmv1r.v v13, v20
-; CHECK-NEXT: vmv1r.v v20, v15
-; CHECK-NEXT: vmv1r.v v17, v22
-; CHECK-NEXT: vmv2r.v v10, v18
-; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
- %res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave5.nxv40f16(<vscale x 40 x half> %arg)
- ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res
+ %res = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave4.nxv8bf16(<vscale x 8 x bfloat> %arg)
+ ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %res
}
-define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv10bf16(<vscale x 10 x bfloat> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv10bf16:
+define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv16bf16(<vscale x 16 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv16bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a0, a0, 2
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT: vslidedown.vx v11, v9, a0
-; CHECK-NEXT: vslideup.vx v9, v11, a0
-; CHECK-NEXT: vslidedown.vx v11, v8, a0
-; CHECK-NEXT: vslideup.vx v8, v11, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs4r.v v8, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vlseg5e16.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
- %res = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave5.nxv10bf16(<vscale x 10 x bfloat> %arg)
- ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %res
+ %res = call {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @llvm.vector.deinterleave4.nxv16bf16(<vscale x 16 x bfloat> %arg)
+ ret {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} %res
}
-define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv20bf16(<vscale x 20 x bfloat> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv20bf16:
+define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @vector_deinterleave_nxv8bf16_nxv32bf16(<vscale x 32 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv32bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
@@ -1673,7 +2269,241 @@ define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vs
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0)
-; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT: vlseg4e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave4.nxv32bf16(<vscale x 32 x bfloat> %arg)
+ ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
+}
+
+define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @vector_deinterleave_nxv1f32_nxv4f32(<vscale x 4 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f32_nxv4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs2r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @llvm.vector.deinterleave4.nxv4f32(<vscale x 4 x float> %arg)
+ ret {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} %res
+}
+
+define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv8f32(<vscale x 8 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave4.nxv8f32(<vscale x 8 x float> %arg)
+ ret {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} %res
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv16f32(<vscale x 16 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv16f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT: vlseg4e32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> %arg)
+ ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
+}
+
+define {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @vector_deinterleave_nxv1f64_nxv4f64(<vscale x 4 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f64_nxv4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT: vlseg4e64.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @llvm.vector.deinterleave4.nxv4f64(<vscale x 4 x double> %arg)
+ ret {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} %res
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv8f64(<vscale x 8 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv8f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT: vlseg4e64.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave4.nxv8f64(<vscale x 8 x double> %arg)
+ ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %res
+}
+
+define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv10f16(<vscale x 10 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv10f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v11, v9, a0
+; CHECK-NEXT: vslideup.vx v9, v11, a0
+; CHECK-NEXT: vslidedown.vx v11, v8, a0
+; CHECK-NEXT: vslideup.vx v8, v11, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave5.nxv10f16(<vscale x 10 x half> %arg)
+ ret {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} %res
+}
+
+define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv20f16(<vscale x 20 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv20f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave5.nxv20f16(<vscale x 20 x half> %arg)
+ ret {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} %res
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv40f16(<vscale x 40 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv40f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v26, v15
+; CHECK-NEXT: vmv1r.v v27, v16
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv1r.v v25, v14
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv1r.v v28, v17
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg5e16.v v12, (a0)
+; CHECK-NEXT: vlseg5e16.v v18, (a1)
+; CHECK-NEXT: vmv2r.v v8, v12
+; CHECK-NEXT: vmv1r.v v9, v18
+; CHECK-NEXT: vmv1r.v v18, v13
+; CHECK-NEXT: vmv2r.v v12, v14
+; CHECK-NEXT: vmv1r.v v13, v20
+; CHECK-NEXT: vmv1r.v v20, v15
+; CHECK-NEXT: vmv1r.v v17, v22
+; CHECK-NEXT: vmv2r.v v10, v18
+; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave5.nxv40f16(<vscale x 40 x half> %arg)
+ ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res
+}
+
+define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv10bf16(<vscale x 10 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv10bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v11, v9, a0
+; CHECK-NEXT: vslideup.vx v9, v11, a0
+; CHECK-NEXT: vslidedown.vx v11, v8, a0
+; CHECK-NEXT: vslideup.vx v8, v11, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vlseg5e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave5.nxv10bf16(<vscale x 10 x bfloat> %arg)
+ ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %res
+}
+
+define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv20bf16(<vscale x 20 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv20bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
; CHECK-NEXT: vlseg5e16.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
@@ -1720,12 +2550,351 @@ define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vs
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
- %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave5.nxv40bf16(<vscale x 40 x bfloat> %arg)
- ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
+ %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave5.nxv40bf16(<vscale x 40 x bfloat> %arg)
+ ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
+}
+
+define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @vector_deinterleave_nxv1f32_nxv5f32(<vscale x 5 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f32_nxv5f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 3
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v11, v9, a0
+; CHECK-NEXT: vslideup.vx v9, v11, a0
+; CHECK-NEXT: vslidedown.vx v11, v8, a0
+; CHECK-NEXT: vslideup.vx v8, v11, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vlseg5e32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @llvm.vector.deinterleave5.nxv5f32(<vscale x 5 x float> %arg)
+ ret {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} %res
+}
+
+define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv10f32(<vscale x 10 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv10f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vlseg5e32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave5.nxv10f32(<vscale x 10 x float> %arg)
+ ret {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} %res
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv20f32(<vscale x 20 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv20f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v26, v15
+; CHECK-NEXT: vmv1r.v v27, v16
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv1r.v v25, v14
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv1r.v v28, v17
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg5e32.v v12, (a0)
+; CHECK-NEXT: vlseg5e32.v v18, (a1)
+; CHECK-NEXT: vmv2r.v v8, v12
+; CHECK-NEXT: vmv1r.v v9, v18
+; CHECK-NEXT: vmv1r.v v18, v13
+; CHECK-NEXT: vmv2r.v v12, v14
+; CHECK-NEXT: vmv1r.v v13, v20
+; CHECK-NEXT: vmv1r.v v20, v15
+; CHECK-NEXT: vmv1r.v v17, v22
+; CHECK-NEXT: vmv2r.v v10, v18
+; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave5.nxv20f32(<vscale x 20 x float> %arg)
+ ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
+}
+
+define {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @vector_deinterleave_nxv1f64_nxv5f64(<vscale x 5 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f64_nxv5f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT: vlseg5e64.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @llvm.vector.deinterleave5.nxv5f64(<vscale x 5 x double> %arg)
+ ret {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} %res
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv10f64(<vscale x 10 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv10f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv1r.v v26, v15
+; CHECK-NEXT: vmv1r.v v27, v16
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv1r.v v25, v14
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv1r.v v28, v17
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg5e64.v v12, (a0)
+; CHECK-NEXT: vlseg5e64.v v18, (a1)
+; CHECK-NEXT: vmv2r.v v8, v12
+; CHECK-NEXT: vmv1r.v v9, v18
+; CHECK-NEXT: vmv1r.v v18, v13
+; CHECK-NEXT: vmv2r.v v12, v14
+; CHECK-NEXT: vmv1r.v v13, v20
+; CHECK-NEXT: vmv1r.v v20, v15
+; CHECK-NEXT: vmv1r.v v17, v22
+; CHECK-NEXT: vmv2r.v v10, v18
+; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave5.nxv10f64(<vscale x 10 x double> %arg)
+ ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %res
+}
+
+define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv12f16(<vscale x 12 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv12f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v11, v9, a0
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v11, a0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v11, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v11, a0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v11, v10, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v10, v11, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave6.nxv12f16(<vscale x 12 x half> %arg)
+ ret {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} %res
+}
+
+define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv24f16(<vscale x 24 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv24f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave6.nxv24f16(<vscale x 24 x half> %arg)
+ ret {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} %res
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv48f16(<vscale x 48 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv48f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv2r.v v24, v14
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv2r.v v26, v16
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg6e16.v v24, (a0)
+; CHECK-NEXT: vlseg6e16.v v18, (a1)
+; CHECK-NEXT: vmv2r.v v8, v24
+; CHECK-NEXT: vmv1r.v v9, v18
+; CHECK-NEXT: vmv1r.v v18, v25
+; CHECK-NEXT: vmv2r.v v12, v26
+; CHECK-NEXT: vmv1r.v v13, v20
+; CHECK-NEXT: vmv1r.v v20, v27
+; CHECK-NEXT: vmv2r.v v16, v28
+; CHECK-NEXT: vmv1r.v v17, v22
+; CHECK-NEXT: vmv1r.v v22, v29
+; CHECK-NEXT: vmv2r.v v10, v18
+; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: vmv2r.v v18, v22
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave6.nxv48f16(<vscale x 48 x half> %arg)
+ ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res
+}
+
+define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv12bf16(<vscale x 12 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv12bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: srli a0, a0, 2
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v11, v9, a0
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v11, a0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v11, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v11, a0
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v11, v10, a0
+; CHECK-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v10, v11, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave6.nxv12bf16(<vscale x 12 x bfloat> %arg)
+ ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %res
+}
+
+define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv24bf16(<vscale x 24 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv24bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vlseg6e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @llvm.vector.deinterleave6.nxv24bf16(<vscale x 24 x bfloat> %arg)
+ ret {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} %res
+}
+
+define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @vector_deinterleave_nxv8bf16_nxv48bf16(<vscale x 48 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv48bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vmv2r.v v24, v14
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vmv2r.v v26, v16
+; CHECK-NEXT: vs8r.v v24, (a1)
+; CHECK-NEXT: vlseg6e16.v v24, (a0)
+; CHECK-NEXT: vlseg6e16.v v18, (a1)
+; CHECK-NEXT: vmv2r.v v8, v24
+; CHECK-NEXT: vmv1r.v v9, v18
+; CHECK-NEXT: vmv1r.v v18, v25
+; CHECK-NEXT: vmv2r.v v12, v26
+; CHECK-NEXT: vmv1r.v v13, v20
+; CHECK-NEXT: vmv1r.v v20, v27
+; CHECK-NEXT: vmv2r.v v16, v28
+; CHECK-NEXT: vmv1r.v v17, v22
+; CHECK-NEXT: vmv1r.v v22, v29
+; CHECK-NEXT: vmv2r.v v10, v18
+; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: vmv2r.v v18, v22
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave6.nxv48bf16(<vscale x 48 x bfloat> %arg)
+ ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
}
-define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @vector_deinterleave_nxv1f32_nxv5f32(<vscale x 5 x float> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv1f32_nxv5f32:
+define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @vector_deinterleave_nxv1f32_nxv6f32(<vscale x 6 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f32_nxv6f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
@@ -1735,24 +2904,32 @@ define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscal
; CHECK-NEXT: srli a0, a0, 3
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v11, v9, a0
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v9, v11, a0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
; CHECK-NEXT: vslidedown.vx v11, v8, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v11, a0
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vslidedown.vx v11, v10, a0
+; CHECK-NEXT: vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v10, v11, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs4r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vlseg5e32.v v8, (a0)
+; CHECK-NEXT: vlseg6e32.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 2
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
- %res = call {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @llvm.vector.deinterleave5.nxv5f32(<vscale x 5 x float> %arg)
- ret {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} %res
+ %res = call {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @llvm.vector.deinterleave6.nxv6f32(<vscale x 6 x float> %arg)
+ ret {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} %res
}
-define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv10f32(<vscale x 10 x float> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv10f32:
+define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv12f32(<vscale x 12 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv12f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
@@ -1761,58 +2938,59 @@ define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscal
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT: vlseg5e32.v v8, (a0)
+; CHECK-NEXT: vlseg6e32.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
- %res = call {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave5.nxv10f32(<vscale x 10 x float> %arg)
- ret {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} %res
+ %res = call {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave6.nxv12f32(<vscale x 12 x float> %arg)
+ ret {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} %res
}
-define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv20f32(<vscale x 20 x float> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv20f32:
+define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv24f32(<vscale x 24 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv24f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT: vmv1r.v v26, v15
-; CHECK-NEXT: vmv1r.v v27, v16
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v18
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vmv1r.v v25, v14
+; CHECK-NEXT: vmv2r.v v24, v14
; CHECK-NEXT: vs8r.v v8, (a0)
-; CHECK-NEXT: vmv1r.v v28, v17
+; CHECK-NEXT: vmv2r.v v26, v16
; CHECK-NEXT: vs8r.v v24, (a1)
-; CHECK-NEXT: vlseg5e32.v v12, (a0)
-; CHECK-NEXT: vlseg5e32.v v18, (a1)
-; CHECK-NEXT: vmv2r.v v8, v12
+; CHECK-NEXT: vlseg6e32.v v24, (a0)
+; CHECK-NEXT: vlseg6e32.v v18, (a1)
+; CHECK-NEXT: vmv2r.v v8, v24
; CHECK-NEXT: vmv1r.v v9, v18
-; CHECK-NEXT: vmv1r.v v18, v13
-; CHECK-NEXT: vmv2r.v v12, v14
+; CHECK-NEXT: vmv1r.v v18, v25
+; CHECK-NEXT: vmv2r.v v12, v26
; CHECK-NEXT: vmv1r.v v13, v20
-; CHECK-NEXT: vmv1r.v v20, v15
+; CHECK-NEXT: vmv1r.v v20, v27
+; CHECK-NEXT: vmv2r.v v16, v28
; CHECK-NEXT: vmv1r.v v17, v22
+; CHECK-NEXT: vmv1r.v v22, v29
; CHECK-NEXT: vmv2r.v v10, v18
; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: vmv2r.v v18, v22
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
- %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave5.nxv20f32(<vscale x 20 x float> %arg)
- ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
+ %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave6.nxv24f32(<vscale x 24 x float> %arg)
+ ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
}
-define {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @vector_deinterleave_nxv1f64_nxv5f64(<vscale x 5 x double> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv1f64_nxv5f64:
+define {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @vector_deinterleave_nxv1f64_nxv6f64(<vscale x 6 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f64_nxv6f64:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
@@ -1821,54 +2999,55 @@ define {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vs
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-NEXT: vlseg5e64.v v8, (a0)
+; CHECK-NEXT: vlseg6e64.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
- %res = call {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @llvm.vector.deinterleave5.nxv5f64(<vscale x 5 x double> %arg)
- ret {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} %res
+ %res = call {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @llvm.vector.deinterleave6.nxv6f64(<vscale x 6 x double> %arg)
+ ret {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} %res
}
-define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv10f64(<vscale x 10 x double> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv10f64:
+define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv12f64(<vscale x 12 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv12f64:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT: vmv1r.v v26, v15
-; CHECK-NEXT: vmv1r.v v27, v16
; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v18
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
-; CHECK-NEXT: vmv1r.v v25, v14
+; CHECK-NEXT: vmv2r.v v24, v14
; CHECK-NEXT: vs8r.v v8, (a0)
-; CHECK-NEXT: vmv1r.v v28, v17
+; CHECK-NEXT: vmv2r.v v26, v16
; CHECK-NEXT: vs8r.v v24, (a1)
-; CHECK-NEXT: vlseg5e64.v v12, (a0)
-; CHECK-NEXT: vlseg5e64.v v18, (a1)
-; CHECK-NEXT: vmv2r.v v8, v12
+; CHECK-NEXT: vlseg6e64.v v24, (a0)
+; CHECK-NEXT: vlseg6e64.v v18, (a1)
+; CHECK-NEXT: vmv2r.v v8, v24
; CHECK-NEXT: vmv1r.v v9, v18
-; CHECK-NEXT: vmv1r.v v18, v13
-; CHECK-NEXT: vmv2r.v v12, v14
+; CHECK-NEXT: vmv1r.v v18, v25
+; CHECK-NEXT: vmv2r.v v12, v26
; CHECK-NEXT: vmv1r.v v13, v20
-; CHECK-NEXT: vmv1r.v v20, v15
+; CHECK-NEXT: vmv1r.v v20, v27
+; CHECK-NEXT: vmv2r.v v16, v28
; CHECK-NEXT: vmv1r.v v17, v22
+; CHECK-NEXT: vmv1r.v v22, v29
; CHECK-NEXT: vmv2r.v v10, v18
; CHECK-NEXT: vmv2r.v v14, v20
+; CHECK-NEXT: vmv2r.v v18, v22
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
- %res = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave5.nxv10f64(<vscale x 10 x double> %arg)
- ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %res
+ %res = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave6.nxv12f64(<vscale x 12 x double> %arg)
+ ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %res
}
define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv14f16(<vscale x 14 x half> %arg) nounwind {
@@ -2221,3 +3400,311 @@ define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vs
%res = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave7.nxv14f64(<vscale x 14 x double> %arg)
ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %res
}
+
+define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv16f16(<vscale x 16 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv16f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave8.nxv16f16(<vscale x 16 x half> %arg)
+ ret {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} %res
+}
+
+define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv32f16(<vscale x 32 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv32f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave8.nxv32f16(<vscale x 32 x half> %arg)
+ ret {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} %res
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv64f16(<vscale x 64 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv64f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vs8r.v v16, (a1)
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vlseg8e16.v v0, (a0)
+; CHECK-NEXT: vlseg8e16.v v22, (a1)
+; CHECK-NEXT: vmv2r.v v8, v0
+; CHECK-NEXT: vmv1r.v v9, v22
+; CHECK-NEXT: vmv1r.v v22, v1
+; CHECK-NEXT: vmv2r.v v12, v2
+; CHECK-NEXT: vmv1r.v v13, v24
+; CHECK-NEXT: vmv1r.v v24, v3
+; CHECK-NEXT: vmv2r.v v16, v4
+; CHECK-NEXT: vmv1r.v v17, v26
+; CHECK-NEXT: vmv1r.v v26, v5
+; CHECK-NEXT: vmv2r.v v20, v6
+; CHECK-NEXT: vmv1r.v v21, v28
+; CHECK-NEXT: vmv1r.v v28, v7
+; CHECK-NEXT: vmv2r.v v10, v22
+; CHECK-NEXT: vmv2r.v v14, v24
+; CHECK-NEXT: vmv2r.v v18, v26
+; CHECK-NEXT: vmv2r.v v22, v28
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave8.nxv64f16(<vscale x 64 x half> %arg)
+ ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res
+}
+
+define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv16bf16(<vscale x 16 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv16bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave8.nxv16bf16(<vscale x 16 x bfloat> %arg)
+ ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %res
+}
+
+define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv32bf16(<vscale x 32 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv32bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT: vlseg8e16.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @llvm.vector.deinterleave8.nxv32bf16(<vscale x 32 x bfloat> %arg)
+ ret {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} %res
+}
+
+define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @vector_deinterleave_nxv8bf16_nxv64bf16(<vscale x 64 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv64bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vs8r.v v16, (a1)
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vlseg8e16.v v0, (a0)
+; CHECK-NEXT: vlseg8e16.v v22, (a1)
+; CHECK-NEXT: vmv2r.v v8, v0
+; CHECK-NEXT: vmv1r.v v9, v22
+; CHECK-NEXT: vmv1r.v v22, v1
+; CHECK-NEXT: vmv2r.v v12, v2
+; CHECK-NEXT: vmv1r.v v13, v24
+; CHECK-NEXT: vmv1r.v v24, v3
+; CHECK-NEXT: vmv2r.v v16, v4
+; CHECK-NEXT: vmv1r.v v17, v26
+; CHECK-NEXT: vmv1r.v v26, v5
+; CHECK-NEXT: vmv2r.v v20, v6
+; CHECK-NEXT: vmv1r.v v21, v28
+; CHECK-NEXT: vmv1r.v v28, v7
+; CHECK-NEXT: vmv2r.v v10, v22
+; CHECK-NEXT: vmv2r.v v14, v24
+; CHECK-NEXT: vmv2r.v v18, v26
+; CHECK-NEXT: vmv2r.v v22, v28
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave8.nxv64bf16(<vscale x 64 x bfloat> %arg)
+ ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
+}
+
+define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @vector_deinterleave_nxv1f32_nxv8f32(<vscale x 8 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f32_nxv8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs4r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vlseg8e32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @llvm.vector.deinterleave8.nxv8f32(<vscale x 8 x float> %arg)
+ ret {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} %res
+}
+
+define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv16f32(<vscale x 16 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv16f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT: vlseg8e32.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave8.nxv16f32(<vscale x 16 x float> %arg)
+ ret {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} %res
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv32f32(<vscale x 32 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv32f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vs8r.v v16, (a1)
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vlseg8e32.v v0, (a0)
+; CHECK-NEXT: vlseg8e32.v v22, (a1)
+; CHECK-NEXT: vmv2r.v v8, v0
+; CHECK-NEXT: vmv1r.v v9, v22
+; CHECK-NEXT: vmv1r.v v22, v1
+; CHECK-NEXT: vmv2r.v v12, v2
+; CHECK-NEXT: vmv1r.v v13, v24
+; CHECK-NEXT: vmv1r.v v24, v3
+; CHECK-NEXT: vmv2r.v v16, v4
+; CHECK-NEXT: vmv1r.v v17, v26
+; CHECK-NEXT: vmv1r.v v26, v5
+; CHECK-NEXT: vmv2r.v v20, v6
+; CHECK-NEXT: vmv1r.v v21, v28
+; CHECK-NEXT: vmv1r.v v28, v7
+; CHECK-NEXT: vmv2r.v v10, v22
+; CHECK-NEXT: vmv2r.v v14, v24
+; CHECK-NEXT: vmv2r.v v18, v26
+; CHECK-NEXT: vmv2r.v v22, v28
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave8.nxv32f32(<vscale x 32 x float> %arg)
+ ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
+}
+
+define {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @vector_deinterleave_nxv1f64_nxv8f64(<vscale x 8 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f64_nxv8f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT: vlseg8e64.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @llvm.vector.deinterleave8.nxv8f64(<vscale x 8 x double> %arg)
+ ret {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} %res
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv16f64(<vscale x 16 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv16f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v8, (a0)
+; CHECK-NEXT: vs8r.v v16, (a1)
+; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; CHECK-NEXT: vlseg8e64.v v0, (a0)
+; CHECK-NEXT: vlseg8e64.v v22, (a1)
+; CHECK-NEXT: vmv2r.v v8, v0
+; CHECK-NEXT: vmv1r.v v9, v22
+; CHECK-NEXT: vmv1r.v v22, v1
+; CHECK-NEXT: vmv2r.v v12, v2
+; CHECK-NEXT: vmv1r.v v13, v24
+; CHECK-NEXT: vmv1r.v v24, v3
+; CHECK-NEXT: vmv2r.v v16, v4
+; CHECK-NEXT: vmv1r.v v17, v26
+; CHECK-NEXT: vmv1r.v v26, v5
+; CHECK-NEXT: vmv2r.v v20, v6
+; CHECK-NEXT: vmv1r.v v21, v28
+; CHECK-NEXT: vmv1r.v v28, v7
+; CHECK-NEXT: vmv2r.v v10, v22
+; CHECK-NEXT: vmv2r.v v14, v24
+; CHECK-NEXT: vmv2r.v v18, v26
+; CHECK-NEXT: vmv2r.v v22, v28
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+ %res = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave8.nxv16f64(<vscale x 16 x double> %arg)
+ ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %res
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
index 279779dc49667..faf7903c21614 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
@@ -261,6 +261,108 @@ define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2
ret <6 x i32> %res
}
+define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
+; CHECK-LABEL: vector_interleave4_v8i32_v2i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 1
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsseg4e32.v v8, (a0)
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vle32.v v10, (a3)
+; CHECK-NEXT: vle32.v v9, (a2)
+; CHECK-NEXT: vle32.v v11, (a1)
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v11, 2
+; CHECK-NEXT: vslideup.vi v8, v9, 2
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave4_v8i32_v2i32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: .cfi_def_cfa_offset 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a1, a1, 1
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vsseg4e32.v v8, (a0)
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vle32.v v10, (a3)
+; ZVBB-NEXT: vle32.v v9, (a2)
+; ZVBB-NEXT: vle32.v v11, (a1)
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vi v10, v11, 2
+; ZVBB-NEXT: vslideup.vi v8, v9, 2
+; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v10, 4
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: .cfi_def_cfa sp, 16
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: .cfi_def_cfa_offset 0
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave4_v8i32_v2i32:
+; ZIP: # %bb.0:
+; ZIP-NEXT: addi sp, sp, -16
+; ZIP-NEXT: .cfi_def_cfa_offset 16
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a0, a0, 1
+; ZIP-NEXT: sub sp, sp, a0
+; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZIP-NEXT: addi a0, sp, 16
+; ZIP-NEXT: csrr a1, vlenb
+; ZIP-NEXT: srli a1, a1, 1
+; ZIP-NEXT: add a2, a0, a1
+; ZIP-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; ZIP-NEXT: vsseg4e32.v v8, (a0)
+; ZIP-NEXT: add a3, a2, a1
+; ZIP-NEXT: add a1, a3, a1
+; ZIP-NEXT: vle32.v v10, (a3)
+; ZIP-NEXT: vle32.v v9, (a2)
+; ZIP-NEXT: vle32.v v11, (a1)
+; ZIP-NEXT: vle32.v v8, (a0)
+; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; ZIP-NEXT: vslideup.vi v10, v11, 2
+; ZIP-NEXT: vslideup.vi v8, v9, 2
+; ZIP-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZIP-NEXT: vslideup.vi v8, v10, 4
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a0, a0, 1
+; ZIP-NEXT: add sp, sp, a0
+; ZIP-NEXT: .cfi_def_cfa sp, 16
+; ZIP-NEXT: addi sp, sp, 16
+; ZIP-NEXT: .cfi_def_cfa_offset 0
+; ZIP-NEXT: ret
+ %res = call <8 x i32> @llvm.vector.interleave4.v8i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d)
+ ret <8 x i32> %res
+}
define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e) {
; CHECK-LABEL: vector_interleave5_v10i16_v2i16:
@@ -377,6 +479,130 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
ret <10 x i16> %res
}
+define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e, <2 x i16> %f) {
+; CHECK-LABEL: vector_interleave6_v12i16_v2i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vsseg6e16.v v8, (a0)
+; CHECK-NEXT: vle16.v v9, (a2)
+; CHECK-NEXT: add a2, a3, a1
+; CHECK-NEXT: vle16.v v11, (a2)
+; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: vle16.v v12, (a3)
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: vle16.v v10, (a2)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vle16.v v13, (a1)
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v12, v11, 2
+; CHECK-NEXT: vslideup.vi v8, v9, 2
+; CHECK-NEXT: vslideup.vi v10, v13, 2
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v12, 4
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave6_v12i16_v2i16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: .cfi_def_cfa_offset 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e16, mf4, ta, ma
+; ZVBB-NEXT: vsseg6e16.v v8, (a0)
+; ZVBB-NEXT: vle16.v v9, (a2)
+; ZVBB-NEXT: add a2, a3, a1
+; ZVBB-NEXT: vle16.v v11, (a2)
+; ZVBB-NEXT: add a2, a2, a1
+; ZVBB-NEXT: vle16.v v12, (a3)
+; ZVBB-NEXT: add a1, a2, a1
+; ZVBB-NEXT: vle16.v v10, (a2)
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: vle16.v v13, (a1)
+; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVBB-NEXT: vslideup.vi v12, v11, 2
+; ZVBB-NEXT: vslideup.vi v8, v9, 2
+; ZVBB-NEXT: vslideup.vi v10, v13, 2
+; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v12, 4
+; ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v10, 8
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: .cfi_def_cfa sp, 16
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: .cfi_def_cfa_offset 0
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave6_v12i16_v2i16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: addi sp, sp, -16
+; ZIP-NEXT: .cfi_def_cfa_offset 16
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a0, a0, 1
+; ZIP-NEXT: sub sp, sp, a0
+; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZIP-NEXT: addi a0, sp, 16
+; ZIP-NEXT: csrr a1, vlenb
+; ZIP-NEXT: srli a1, a1, 2
+; ZIP-NEXT: add a2, a0, a1
+; ZIP-NEXT: add a3, a2, a1
+; ZIP-NEXT: vsetvli a4, zero, e16, mf4, ta, ma
+; ZIP-NEXT: vsseg6e16.v v8, (a0)
+; ZIP-NEXT: vle16.v v9, (a2)
+; ZIP-NEXT: add a2, a3, a1
+; ZIP-NEXT: vle16.v v11, (a2)
+; ZIP-NEXT: add a2, a2, a1
+; ZIP-NEXT: vle16.v v12, (a3)
+; ZIP-NEXT: add a1, a2, a1
+; ZIP-NEXT: vle16.v v10, (a2)
+; ZIP-NEXT: vle16.v v8, (a0)
+; ZIP-NEXT: vle16.v v13, (a1)
+; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZIP-NEXT: vslideup.vi v12, v11, 2
+; ZIP-NEXT: vslideup.vi v8, v9, 2
+; ZIP-NEXT: vslideup.vi v10, v13, 2
+; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZIP-NEXT: vslideup.vi v8, v12, 4
+; ZIP-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; ZIP-NEXT: vslideup.vi v8, v10, 8
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a0, a0, 1
+; ZIP-NEXT: add sp, sp, a0
+; ZIP-NEXT: .cfi_def_cfa sp, 16
+; ZIP-NEXT: addi sp, sp, 16
+; ZIP-NEXT: .cfi_def_cfa_offset 0
+; ZIP-NEXT: ret
+ %res = call <12 x i16> @llvm.vector.interleave6.v12i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e, <2 x i16> %f)
+ ret <12 x i16> %res
+}
+
define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g) {
; CHECK-LABEL: vector_interleave7_v14i8_v2i8:
; CHECK: # %bb.0:
@@ -507,6 +733,144 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
ret <14 x i8> %res
}
+define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g, <2 x i8> %h) {
+; CHECK-LABEL: vector_interleave8_v16i8_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 3
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: add a4, a3, a1
+; CHECK-NEXT: add a5, a4, a1
+; CHECK-NEXT: add a6, a5, a1
+; CHECK-NEXT: vsetvli a7, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vsseg8e8.v v8, (a0)
+; CHECK-NEXT: vle8.v v9, (a6)
+; CHECK-NEXT: add a6, a6, a1
+; CHECK-NEXT: vle8.v v10, (a5)
+; CHECK-NEXT: vle8.v v11, (a6)
+; CHECK-NEXT: add a1, a6, a1
+; CHECK-NEXT: vle8.v v12, (a2)
+; CHECK-NEXT: vle8.v v8, (a0)
+; CHECK-NEXT: vle8.v v13, (a3)
+; CHECK-NEXT: vle8.v v14, (a4)
+; CHECK-NEXT: vle8.v v15, (a1)
+; CHECK-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vi v10, v9, 2
+; CHECK-NEXT: vslideup.vi v8, v12, 2
+; CHECK-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vi v10, v11, 4
+; CHECK-NEXT: vslideup.vi v8, v13, 4
+; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v15, 6
+; CHECK-NEXT: vslideup.vi v8, v14, 6
+; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave8_v16i8_v2i8:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: .cfi_def_cfa_offset 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a1, a1, 3
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: add a4, a3, a1
+; ZVBB-NEXT: add a5, a4, a1
+; ZVBB-NEXT: add a6, a5, a1
+; ZVBB-NEXT: vsetvli a7, zero, e8, mf8, ta, ma
+; ZVBB-NEXT: vsseg8e8.v v8, (a0)
+; ZVBB-NEXT: vle8.v v9, (a6)
+; ZVBB-NEXT: add a6, a6, a1
+; ZVBB-NEXT: vle8.v v10, (a5)
+; ZVBB-NEXT: vle8.v v11, (a6)
+; ZVBB-NEXT: add a1, a6, a1
+; ZVBB-NEXT: vle8.v v12, (a2)
+; ZVBB-NEXT: vle8.v v8, (a0)
+; ZVBB-NEXT: vle8.v v13, (a3)
+; ZVBB-NEXT: vle8.v v14, (a4)
+; ZVBB-NEXT: vle8.v v15, (a1)
+; ZVBB-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
+; ZVBB-NEXT: vslideup.vi v10, v9, 2
+; ZVBB-NEXT: vslideup.vi v8, v12, 2
+; ZVBB-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
+; ZVBB-NEXT: vslideup.vi v10, v11, 4
+; ZVBB-NEXT: vslideup.vi v8, v13, 4
+; ZVBB-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; ZVBB-NEXT: vslideup.vi v10, v15, 6
+; ZVBB-NEXT: vslideup.vi v8, v14, 6
+; ZVBB-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v10, 8
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: .cfi_def_cfa sp, 16
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: .cfi_def_cfa_offset 0
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave8_v16i8_v2i8:
+; ZIP: # %bb.0:
+; ZIP-NEXT: addi sp, sp, -16
+; ZIP-NEXT: .cfi_def_cfa_offset 16
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: sub sp, sp, a0
+; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
+; ZIP-NEXT: addi a0, sp, 16
+; ZIP-NEXT: csrr a1, vlenb
+; ZIP-NEXT: srli a1, a1, 3
+; ZIP-NEXT: add a2, a0, a1
+; ZIP-NEXT: add a3, a2, a1
+; ZIP-NEXT: add a4, a3, a1
+; ZIP-NEXT: add a5, a4, a1
+; ZIP-NEXT: add a6, a5, a1
+; ZIP-NEXT: vsetvli a7, zero, e8, mf8, ta, ma
+; ZIP-NEXT: vsseg8e8.v v8, (a0)
+; ZIP-NEXT: vle8.v v9, (a6)
+; ZIP-NEXT: add a6, a6, a1
+; ZIP-NEXT: vle8.v v10, (a5)
+; ZIP-NEXT: vle8.v v11, (a6)
+; ZIP-NEXT: add a1, a6, a1
+; ZIP-NEXT: vle8.v v12, (a2)
+; ZIP-NEXT: vle8.v v8, (a0)
+; ZIP-NEXT: vle8.v v13, (a3)
+; ZIP-NEXT: vle8.v v14, (a4)
+; ZIP-NEXT: vle8.v v15, (a1)
+; ZIP-NEXT: vsetivli zero, 4, e8, mf2, tu, ma
+; ZIP-NEXT: vslideup.vi v10, v9, 2
+; ZIP-NEXT: vslideup.vi v8, v12, 2
+; ZIP-NEXT: vsetivli zero, 6, e8, mf2, tu, ma
+; ZIP-NEXT: vslideup.vi v10, v11, 4
+; ZIP-NEXT: vslideup.vi v8, v13, 4
+; ZIP-NEXT: vsetivli zero, 8, e8, mf2, ta, ma
+; ZIP-NEXT: vslideup.vi v10, v15, 6
+; ZIP-NEXT: vslideup.vi v8, v14, 6
+; ZIP-NEXT: vsetivli zero, 16, e8, m1, ta, ma
+; ZIP-NEXT: vslideup.vi v8, v10, 8
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: add sp, sp, a0
+; ZIP-NEXT: .cfi_def_cfa sp, 16
+; ZIP-NEXT: addi sp, sp, 16
+; ZIP-NEXT: .cfi_def_cfa_offset 0
+; ZIP-NEXT: ret
+ %res = call <16 x i8> @llvm.vector.interleave8.v16i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g, <2 x i8> %h)
+ ret <16 x i8> %res
+}
; Floats
@@ -689,19 +1053,113 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double>
; ZVBB-NEXT: vmv.v.v v8, v10
; ZVBB-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_v4f64_v2f64:
+; ZIP-LABEL: vector_interleave_v4f64_v2f64:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetivli zero, 4, e64, m2, ta, ma
+; ZIP-NEXT: vmv1r.v v12, v9
+; ZIP-NEXT: ri.vzip2a.vv v10, v8, v12
+; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: ret
+ %res = call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
+ ret <4 x double> %res
+}
+
+define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; CHECK-LABEL: vector_interleave3_v6f32_v2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 1
+; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsseg3e32.v v8, (a0)
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: vle32.v v9, (a2)
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: vle32.v v10, (a1)
+; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v9, 2
+; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave3_v6f32_v2f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: .cfi_def_cfa_offset 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a1, a1, 1
+; ZVBB-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vsseg3e32.v v8, (a0)
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: vle32.v v9, (a2)
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: add a1, a2, a1
+; ZVBB-NEXT: vle32.v v10, (a1)
+; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v9, 2
+; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v10, 4
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: .cfi_def_cfa sp, 16
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: .cfi_def_cfa_offset 0
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave3_v6f32_v2f32:
; ZIP: # %bb.0:
-; ZIP-NEXT: vsetivli zero, 4, e64, m2, ta, ma
-; ZIP-NEXT: vmv1r.v v12, v9
-; ZIP-NEXT: ri.vzip2a.vv v10, v8, v12
-; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: addi sp, sp, -16
+; ZIP-NEXT: .cfi_def_cfa_offset 16
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a0, a0, 1
+; ZIP-NEXT: sub sp, sp, a0
+; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZIP-NEXT: addi a0, sp, 16
+; ZIP-NEXT: csrr a1, vlenb
+; ZIP-NEXT: srli a1, a1, 1
+; ZIP-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; ZIP-NEXT: vsseg3e32.v v8, (a0)
+; ZIP-NEXT: add a2, a0, a1
+; ZIP-NEXT: vle32.v v9, (a2)
+; ZIP-NEXT: vle32.v v8, (a0)
+; ZIP-NEXT: add a1, a2, a1
+; ZIP-NEXT: vle32.v v10, (a1)
+; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; ZIP-NEXT: vslideup.vi v8, v9, 2
+; ZIP-NEXT: vsetivli zero, 8, e32, m2, ta, ma
+; ZIP-NEXT: vslideup.vi v8, v10, 4
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a0, a0, 1
+; ZIP-NEXT: add sp, sp, a0
+; ZIP-NEXT: .cfi_def_cfa sp, 16
+; ZIP-NEXT: addi sp, sp, 16
+; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
- %res = call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
- ret <4 x double> %res
+ %res = call <6 x float> @llvm.vector.interleave3.v6f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
+ ret <6 x float> %res
}
-define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
-; CHECK-LABEL: vector_interleave3_v632_v2f32:
+define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) {
+; CHECK-LABEL: vector_interleave4_v8f32_v2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
@@ -712,14 +1170,17 @@ define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 1
-; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vsseg3e32.v v8, (a0)
; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsseg4e32.v v8, (a0)
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vle32.v v10, (a3)
; CHECK-NEXT: vle32.v v9, (a2)
+; CHECK-NEXT: vle32.v v11, (a1)
; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: add a1, a2, a1
-; CHECK-NEXT: vle32.v v10, (a1)
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v11, 2
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vslideup.vi v8, v10, 4
@@ -731,7 +1192,7 @@ define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave3_v632_v2f32:
+; ZVBB-LABEL: vector_interleave4_v8f32_v2f32:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: .cfi_def_cfa_offset 16
@@ -742,14 +1203,17 @@ define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: srli a1, a1, 1
-; ZVBB-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
-; ZVBB-NEXT: vsseg3e32.v v8, (a0)
; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vsseg4e32.v v8, (a0)
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vle32.v v10, (a3)
; ZVBB-NEXT: vle32.v v9, (a2)
+; ZVBB-NEXT: vle32.v v11, (a1)
; ZVBB-NEXT: vle32.v v8, (a0)
-; ZVBB-NEXT: add a1, a2, a1
-; ZVBB-NEXT: vle32.v v10, (a1)
; ZVBB-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vi v10, v11, 2
; ZVBB-NEXT: vslideup.vi v8, v9, 2
; ZVBB-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; ZVBB-NEXT: vslideup.vi v8, v10, 4
@@ -761,7 +1225,7 @@ define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b
; ZVBB-NEXT: .cfi_def_cfa_offset 0
; ZVBB-NEXT: ret
;
-; ZIP-LABEL: vector_interleave3_v632_v2f32:
+; ZIP-LABEL: vector_interleave4_v8f32_v2f32:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -16
; ZIP-NEXT: .cfi_def_cfa_offset 16
@@ -772,14 +1236,17 @@ define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b
; ZIP-NEXT: addi a0, sp, 16
; ZIP-NEXT: csrr a1, vlenb
; ZIP-NEXT: srli a1, a1, 1
-; ZIP-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
-; ZIP-NEXT: vsseg3e32.v v8, (a0)
; ZIP-NEXT: add a2, a0, a1
+; ZIP-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; ZIP-NEXT: vsseg4e32.v v8, (a0)
+; ZIP-NEXT: add a3, a2, a1
+; ZIP-NEXT: add a1, a3, a1
+; ZIP-NEXT: vle32.v v10, (a3)
; ZIP-NEXT: vle32.v v9, (a2)
+; ZIP-NEXT: vle32.v v11, (a1)
; ZIP-NEXT: vle32.v v8, (a0)
-; ZIP-NEXT: add a1, a2, a1
-; ZIP-NEXT: vle32.v v10, (a1)
; ZIP-NEXT: vsetivli zero, 4, e32, m1, ta, ma
+; ZIP-NEXT: vslideup.vi v10, v11, 2
; ZIP-NEXT: vslideup.vi v8, v9, 2
; ZIP-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; ZIP-NEXT: vslideup.vi v8, v10, 4
@@ -790,11 +1257,10 @@ define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b
; ZIP-NEXT: addi sp, sp, 16
; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
- %res = call <6 x float> @llvm.vector.interleave3.v6f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
- ret <6 x float> %res
+ %res = call <8 x float> @llvm.vector.interleave4.v8f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d)
+ ret <8 x float> %res
}
-
define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e) {
; CHECK-LABEL: vector_interleave5_v10f16_v2f16:
; CHECK: # %bb.0:
@@ -910,6 +1376,130 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b
ret <10 x half> %res
}
+define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e, <2 x half> %f) {
+; CHECK-LABEL: vector_interleave6_v12f16_v2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vsseg6e16.v v8, (a0)
+; CHECK-NEXT: vle16.v v9, (a2)
+; CHECK-NEXT: add a2, a3, a1
+; CHECK-NEXT: vle16.v v11, (a2)
+; CHECK-NEXT: add a2, a2, a1
+; CHECK-NEXT: vle16.v v12, (a3)
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: vle16.v v10, (a2)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vle16.v v13, (a1)
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v12, v11, 2
+; CHECK-NEXT: vslideup.vi v8, v9, 2
+; CHECK-NEXT: vslideup.vi v10, v13, 2
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v12, 4
+; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 8
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave6_v12f16_v2f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: .cfi_def_cfa_offset 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e16, mf4, ta, ma
+; ZVBB-NEXT: vsseg6e16.v v8, (a0)
+; ZVBB-NEXT: vle16.v v9, (a2)
+; ZVBB-NEXT: add a2, a3, a1
+; ZVBB-NEXT: vle16.v v11, (a2)
+; ZVBB-NEXT: add a2, a2, a1
+; ZVBB-NEXT: vle16.v v12, (a3)
+; ZVBB-NEXT: add a1, a2, a1
+; ZVBB-NEXT: vle16.v v10, (a2)
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: vle16.v v13, (a1)
+; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVBB-NEXT: vslideup.vi v12, v11, 2
+; ZVBB-NEXT: vslideup.vi v8, v9, 2
+; ZVBB-NEXT: vslideup.vi v10, v13, 2
+; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v12, 4
+; ZVBB-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v10, 8
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: .cfi_def_cfa sp, 16
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: .cfi_def_cfa_offset 0
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave6_v12f16_v2f16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: addi sp, sp, -16
+; ZIP-NEXT: .cfi_def_cfa_offset 16
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a0, a0, 1
+; ZIP-NEXT: sub sp, sp, a0
+; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZIP-NEXT: addi a0, sp, 16
+; ZIP-NEXT: csrr a1, vlenb
+; ZIP-NEXT: srli a1, a1, 2
+; ZIP-NEXT: add a2, a0, a1
+; ZIP-NEXT: add a3, a2, a1
+; ZIP-NEXT: vsetvli a4, zero, e16, mf4, ta, ma
+; ZIP-NEXT: vsseg6e16.v v8, (a0)
+; ZIP-NEXT: vle16.v v9, (a2)
+; ZIP-NEXT: add a2, a3, a1
+; ZIP-NEXT: vle16.v v11, (a2)
+; ZIP-NEXT: add a2, a2, a1
+; ZIP-NEXT: vle16.v v12, (a3)
+; ZIP-NEXT: add a1, a2, a1
+; ZIP-NEXT: vle16.v v10, (a2)
+; ZIP-NEXT: vle16.v v8, (a0)
+; ZIP-NEXT: vle16.v v13, (a1)
+; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZIP-NEXT: vslideup.vi v12, v11, 2
+; ZIP-NEXT: vslideup.vi v8, v9, 2
+; ZIP-NEXT: vslideup.vi v10, v13, 2
+; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZIP-NEXT: vslideup.vi v8, v12, 4
+; ZIP-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; ZIP-NEXT: vslideup.vi v8, v10, 8
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a0, a0, 1
+; ZIP-NEXT: add sp, sp, a0
+; ZIP-NEXT: .cfi_def_cfa sp, 16
+; ZIP-NEXT: addi sp, sp, 16
+; ZIP-NEXT: .cfi_def_cfa_offset 0
+; ZIP-NEXT: ret
+ %res = call <12 x half> @llvm.vector.interleave6.v12f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e, <2 x half> %f)
+ ret <12 x half> %res
+}
+
define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g) {
; CHECK-LABEL: vector_interleave7_v7f16_v1f16:
; CHECK: # %bb.0:
@@ -1045,3 +1635,148 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b,
%res = call <7 x half> @llvm.vector.interleave7.v7f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g)
ret <7 x half> %res
}
+
+define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g, <1 x half> %h) {
+; CHECK-LABEL: vector_interleave8_v8f16_v1f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: add a4, a3, a1
+; CHECK-NEXT: add a5, a4, a1
+; CHECK-NEXT: add a6, a5, a1
+; CHECK-NEXT: vsetvli a7, zero, e16, mf4, ta, ma
+; CHECK-NEXT: vsseg8e16.v v8, (a0)
+; CHECK-NEXT: vle16.v v9, (a6)
+; CHECK-NEXT: add a6, a6, a1
+; CHECK-NEXT: vle16.v v10, (a5)
+; CHECK-NEXT: vle16.v v11, (a6)
+; CHECK-NEXT: add a1, a6, a1
+; CHECK-NEXT: vle16.v v12, (a2)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vle16.v v13, (a3)
+; CHECK-NEXT: vle16.v v14, (a4)
+; CHECK-NEXT: vle16.v v15, (a1)
+; CHECK-NEXT: vsetivli zero, 2, e16, mf2, tu, ma
+; CHECK-NEXT: vslideup.vi v10, v9, 1
+; CHECK-NEXT: vslideup.vi v8, v12, 1
+; CHECK-NEXT: vsetivli zero, 3, e16, mf2, tu, ma
+; CHECK-NEXT: vslideup.vi v10, v11, 2
+; CHECK-NEXT: vslideup.vi v8, v13, 2
+; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT: vslideup.vi v10, v15, 3
+; CHECK-NEXT: vslideup.vi v8, v14, 3
+; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vi v8, v10, 4
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave8_v8f16_v1f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: .cfi_def_cfa_offset 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: add a4, a3, a1
+; ZVBB-NEXT: add a5, a4, a1
+; ZVBB-NEXT: add a6, a5, a1
+; ZVBB-NEXT: vsetvli a7, zero, e16, mf4, ta, ma
+; ZVBB-NEXT: vsseg8e16.v v8, (a0)
+; ZVBB-NEXT: vle16.v v9, (a6)
+; ZVBB-NEXT: add a6, a6, a1
+; ZVBB-NEXT: vle16.v v10, (a5)
+; ZVBB-NEXT: vle16.v v11, (a6)
+; ZVBB-NEXT: add a1, a6, a1
+; ZVBB-NEXT: vle16.v v12, (a2)
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: vle16.v v13, (a3)
+; ZVBB-NEXT: vle16.v v14, (a4)
+; ZVBB-NEXT: vle16.v v15, (a1)
+; ZVBB-NEXT: vsetivli zero, 2, e16, mf2, tu, ma
+; ZVBB-NEXT: vslideup.vi v10, v9, 1
+; ZVBB-NEXT: vslideup.vi v8, v12, 1
+; ZVBB-NEXT: vsetivli zero, 3, e16, mf2, tu, ma
+; ZVBB-NEXT: vslideup.vi v10, v11, 2
+; ZVBB-NEXT: vslideup.vi v8, v13, 2
+; ZVBB-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVBB-NEXT: vslideup.vi v10, v15, 3
+; ZVBB-NEXT: vslideup.vi v8, v14, 3
+; ZVBB-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vi v8, v10, 4
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: .cfi_def_cfa sp, 16
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: .cfi_def_cfa_offset 0
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave8_v8f16_v1f16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: addi sp, sp, -16
+; ZIP-NEXT: .cfi_def_cfa_offset 16
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a0, a0, 1
+; ZIP-NEXT: sub sp, sp, a0
+; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZIP-NEXT: addi a0, sp, 16
+; ZIP-NEXT: csrr a1, vlenb
+; ZIP-NEXT: srli a1, a1, 2
+; ZIP-NEXT: add a2, a0, a1
+; ZIP-NEXT: add a3, a2, a1
+; ZIP-NEXT: add a4, a3, a1
+; ZIP-NEXT: add a5, a4, a1
+; ZIP-NEXT: add a6, a5, a1
+; ZIP-NEXT: vsetvli a7, zero, e16, mf4, ta, ma
+; ZIP-NEXT: vsseg8e16.v v8, (a0)
+; ZIP-NEXT: vle16.v v9, (a6)
+; ZIP-NEXT: add a6, a6, a1
+; ZIP-NEXT: vle16.v v10, (a5)
+; ZIP-NEXT: vle16.v v11, (a6)
+; ZIP-NEXT: add a1, a6, a1
+; ZIP-NEXT: vle16.v v12, (a2)
+; ZIP-NEXT: vle16.v v8, (a0)
+; ZIP-NEXT: vle16.v v13, (a3)
+; ZIP-NEXT: vle16.v v14, (a4)
+; ZIP-NEXT: vle16.v v15, (a1)
+; ZIP-NEXT: vsetivli zero, 2, e16, mf2, tu, ma
+; ZIP-NEXT: vslideup.vi v10, v9, 1
+; ZIP-NEXT: vslideup.vi v8, v12, 1
+; ZIP-NEXT: vsetivli zero, 3, e16, mf2, tu, ma
+; ZIP-NEXT: vslideup.vi v10, v11, 2
+; ZIP-NEXT: vslideup.vi v8, v13, 2
+; ZIP-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZIP-NEXT: vslideup.vi v10, v15, 3
+; ZIP-NEXT: vslideup.vi v8, v14, 3
+; ZIP-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZIP-NEXT: vslideup.vi v8, v10, 4
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a0, a0, 1
+; ZIP-NEXT: add sp, sp, a0
+; ZIP-NEXT: .cfi_def_cfa sp, 16
+; ZIP-NEXT: addi sp, sp, 16
+; ZIP-NEXT: .cfi_def_cfa_offset 0
+; ZIP-NEXT: ret
+ %res = call <8 x half> @llvm.vector.interleave8.v8f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g, <1 x half> %h)
+ ret <8 x half> %res
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index 7347000bf5c71..77723609a60c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -786,6 +786,313 @@ define <vscale x 6 x i64> @vector_interleave_nxv6i64_nxv2i64(<vscale x 2 x i64>
ret <vscale x 6 x i64> %res
}
+define <vscale x 64 x i1> @vector_interleave_nxv64i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d) nounwind {
+; CHECK-LABEL: vector_interleave_nxv64i1_nxv16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv1r.v v11, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmv.v.i v12, 0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: vmerge.vim v16, v12, 1, v0
+; CHECK-NEXT: slli a2, a1, 1
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vmerge.vim v14, v12, 1, v0
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v18, v12, 1, v0
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v20, v12, 1, v0
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: vsseg4e8.v v14, (a0)
+; CHECK-NEXT: vl2r.v v8, (a2)
+; CHECK-NEXT: srli a2, a1, 2
+; CHECK-NEXT: srli a1, a1, 1
+; CHECK-NEXT: vl2r.v v10, (a4)
+; CHECK-NEXT: add a4, a2, a2
+; CHECK-NEXT: vl2r.v v12, (a3)
+; CHECK-NEXT: vl2r.v v14, (a0)
+; CHECK-NEXT: vmsne.vi v16, v8, 0
+; CHECK-NEXT: vmsne.vi v8, v10, 0
+; CHECK-NEXT: vmsne.vi v9, v12, 0
+; CHECK-NEXT: vmsne.vi v0, v14, 0
+; CHECK-NEXT: vsetvli zero, a4, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v16, a2
+; CHECK-NEXT: vslideup.vx v0, v9, a2
+; CHECK-NEXT: add a0, a1, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v0, v8, a1
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64i1_nxv16i1:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT: vmv1r.v v11, v0
+; ZVBB-NEXT: vmv1r.v v0, v8
+; ZVBB-NEXT: vmv.v.i v12, 0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: vmerge.vim v16, v12, 1, v0
+; ZVBB-NEXT: slli a2, a1, 1
+; ZVBB-NEXT: vmv1r.v v0, v11
+; ZVBB-NEXT: vmerge.vim v14, v12, 1, v0
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: vmv1r.v v0, v9
+; ZVBB-NEXT: vmerge.vim v18, v12, 1, v0
+; ZVBB-NEXT: add a4, a3, a2
+; ZVBB-NEXT: vmv1r.v v0, v10
+; ZVBB-NEXT: vmerge.vim v20, v12, 1, v0
+; ZVBB-NEXT: add a2, a4, a2
+; ZVBB-NEXT: vsseg4e8.v v14, (a0)
+; ZVBB-NEXT: vl2r.v v8, (a2)
+; ZVBB-NEXT: srli a2, a1, 2
+; ZVBB-NEXT: srli a1, a1, 1
+; ZVBB-NEXT: vl2r.v v10, (a4)
+; ZVBB-NEXT: add a4, a2, a2
+; ZVBB-NEXT: vl2r.v v12, (a3)
+; ZVBB-NEXT: vl2r.v v14, (a0)
+; ZVBB-NEXT: vmsne.vi v16, v8, 0
+; ZVBB-NEXT: vmsne.vi v8, v10, 0
+; ZVBB-NEXT: vmsne.vi v9, v12, 0
+; ZVBB-NEXT: vmsne.vi v0, v14, 0
+; ZVBB-NEXT: vsetvli zero, a4, e8, mf2, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v16, a2
+; ZVBB-NEXT: vslideup.vx v0, v9, a2
+; ZVBB-NEXT: add a0, a1, a1
+; ZVBB-NEXT: vsetvli zero, a0, e8, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v0, v8, a1
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d)
+ ret <vscale x 64 x i1> %res
+}
+
+define <vscale x 64 x i8> @vector_interleave_nxv64i8_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d) nounwind {
+;
+; CHECK-LABEL: vector_interleave_nxv64i8_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT: vsseg4e8.v v8, (a0)
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vl2r.v v12, (a3)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl2r.v v14, (a1)
+; CHECK-NEXT: vl2r.v v8, (a0)
+; CHECK-NEXT: vl2r.v v10, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64i8_nxv16i8:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 1
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: vsetvli a3, zero, e8, m2, ta, ma
+; ZVBB-NEXT: vsseg4e8.v v8, (a0)
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vl2r.v v12, (a3)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl2r.v v14, (a1)
+; ZVBB-NEXT: vl2r.v v8, (a0)
+; ZVBB-NEXT: vl2r.v v10, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d)
+ ret <vscale x 64 x i8> %res
+}
+
+define <vscale x 32 x i8> @vector_interleave_nxv32i8_nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, <vscale x 8 x i8> %c, <vscale x 8 x i8> %d) nounwind {
+; CHECK-LABEL: vector_interleave_nxv32i8_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma
+; CHECK-NEXT: vsseg4e8.v v8, (a0)
+; CHECK-NEXT: vl1r.v v10, (a3)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl1r.v v11, (a1)
+; CHECK-NEXT: vl1r.v v8, (a0)
+; CHECK-NEXT: vl1r.v v9, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv32i8_nxv8i8:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e8, m1, ta, ma
+; ZVBB-NEXT: vsseg4e8.v v8, (a0)
+; ZVBB-NEXT: vl1r.v v10, (a3)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl1r.v v11, (a1)
+; ZVBB-NEXT: vl1r.v v8, (a0)
+; ZVBB-NEXT: vl1r.v v9, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 32 x i8> @llvm.vector.interleave4.nxv32i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, <vscale x 8 x i8> %c, <vscale x 8 x i8> %d)
+ ret <vscale x 32 x i8> %res
+}
+
+define <vscale x 16 x i32> @vector_interleave_nxv16i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) nounwind {
+;
+; CHECK-LABEL: vector_interleave_nxv16i32_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: vsetvli a3, zero, e32, m2, ta, ma
+; CHECK-NEXT: vsseg4e32.v v8, (a0)
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vl2re32.v v12, (a3)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl2re32.v v14, (a1)
+; CHECK-NEXT: vl2re32.v v8, (a0)
+; CHECK-NEXT: vl2re32.v v10, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16i32_nxv4i32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 1
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: vsetvli a3, zero, e32, m2, ta, ma
+; ZVBB-NEXT: vsseg4e32.v v8, (a0)
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vl2re32.v v12, (a3)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl2re32.v v14, (a1)
+; ZVBB-NEXT: vl2re32.v v8, (a0)
+; ZVBB-NEXT: vl2re32.v v10, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv16i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d)
+ ret <vscale x 16 x i32> %res
+}
+
+define <vscale x 8 x i64> @vector_interleave_nxv8i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d) nounwind {
+;
+; CHECK-LABEL: vector_interleave_nxv8i64_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: vsetvli a3, zero, e64, m2, ta, ma
+; CHECK-NEXT: vsseg4e64.v v8, (a0)
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vl2re64.v v12, (a3)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl2re64.v v14, (a1)
+; CHECK-NEXT: vl2re64.v v8, (a0)
+; CHECK-NEXT: vl2re64.v v10, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8i64_nxv2i64:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 1
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: vsetvli a3, zero, e64, m2, ta, ma
+; ZVBB-NEXT: vsseg4e64.v v8, (a0)
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vl2re64.v v12, (a3)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl2re64.v v14, (a1)
+; ZVBB-NEXT: vl2re64.v v8, (a0)
+; ZVBB-NEXT: vl2re64.v v10, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d)
+ ret <vscale x 8 x i64> %res
+}
+
define <vscale x 80 x i1> @vector_interleave_nxv80i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e) nounwind {
; CHECK-LABEL: vector_interleave_nxv80i1_nxv16i1:
; CHECK: # %bb.0:
@@ -2009,1449 +2316,1552 @@ define <vscale x 10 x i64> @vector_interleave_nxv10i64_nxv2i64(<vscale x 2 x i64
ret <vscale x 10 x i64> %res
}
-define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f, <vscale x 16 x i1> %g) nounwind {
-; CHECK-LABEL: vector_interleave_nxv112i1_nxv16i1:
+define <vscale x 96 x i1> @vector_interleave_nxv96i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f) nounwind {
+; CHECK-LABEL: vector_interleave_nxv96i1_nxv16i1:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 14
+; CHECK-NEXT: li a1, 12
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmv.v.i v14, 0
-; CHECK-NEXT: addi a4, sp, 16
+; CHECK-NEXT: vmv.v.i v20, 0
+; CHECK-NEXT: vmerge.vim v14, v20, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v22, v20, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmv1r.v v16, v23
+; CHECK-NEXT: vmerge.vim v8, v20, 1, v0
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 3
-; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
-; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: vmerge.vim v16, v14, 1, v0
-; CHECK-NEXT: vmv1r.v v0, v8
-; CHECK-NEXT: vmerge.vim v22, v14, 1, v0
-; CHECK-NEXT: add a3, a4, a2
-; CHECK-NEXT: srli a1, a2, 2
-; CHECK-NEXT: add a5, a0, a2
-; CHECK-NEXT: vmv4r.v v24, v16
-; CHECK-NEXT: vmv1r.v v0, v9
-; CHECK-NEXT: vmerge.vim v18, v14, 1, v0
-; CHECK-NEXT: add a6, a3, a2
-; CHECK-NEXT: vmv1r.v v25, v22
+; CHECK-NEXT: vmv1r.v v17, v9
; CHECK-NEXT: vmv1r.v v0, v10
-; CHECK-NEXT: vmerge.vim v8, v14, 1, v0
-; CHECK-NEXT: vmv1r.v v26, v18
+; CHECK-NEXT: vmerge.vim v24, v20, 1, v0
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vmv1r.v v18, v25
; CHECK-NEXT: vmv1r.v v0, v11
-; CHECK-NEXT: vmerge.vim v20, v14, 1, v0
-; CHECK-NEXT: vmv1r.v v27, v8
+; CHECK-NEXT: vmerge.vim v26, v20, 1, v0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: vmv1r.v v19, v27
; CHECK-NEXT: vmv1r.v v0, v12
-; CHECK-NEXT: vmerge.vim v10, v14, 1, v0
-; CHECK-NEXT: vmv1r.v v28, v20
-; CHECK-NEXT: vmv1r.v v18, v23
+; CHECK-NEXT: vmerge.vim v10, v20, 1, v0
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: vmv1r.v v20, v11
+; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT: vsseg6e8.v v15, (a0)
+; CHECK-NEXT: vmv1r.v v15, v22
+; CHECK-NEXT: add a4, a5, a2
+; CHECK-NEXT: vmv1r.v v16, v8
+; CHECK-NEXT: srli a1, a2, 2
+; CHECK-NEXT: vmv1r.v v17, v24
+; CHECK-NEXT: add a6, a4, a2
+; CHECK-NEXT: vmv1r.v v18, v26
+; CHECK-NEXT: add a7, a3, a2
+; CHECK-NEXT: vmv1r.v v19, v10
+; CHECK-NEXT: vsseg6e8.v v14, (a5)
+; CHECK-NEXT: vl1r.v v8, (a0)
+; CHECK-NEXT: add a0, a6, a2
+; CHECK-NEXT: vl1r.v v10, (a6)
+; CHECK-NEXT: add a6, a7, a2
+; CHECK-NEXT: vl1r.v v12, (a5)
+; CHECK-NEXT: add a5, a0, a2
+; CHECK-NEXT: vl1r.v v14, (a7)
; CHECK-NEXT: add a7, a6, a2
-; CHECK-NEXT: vmv1r.v v29, v10
-; CHECK-NEXT: vmv1r.v v20, v9
-; CHECK-NEXT: vmv1r.v v0, v13
-; CHECK-NEXT: vmerge.vim v30, v14, 1, v0
-; CHECK-NEXT: vmv1r.v v22, v11
-; CHECK-NEXT: vsetvli t0, zero, e8, m1, ta, ma
-; CHECK-NEXT: vsseg7e8.v v24, (a4)
-; CHECK-NEXT: vmv1r.v v23, v31
-; CHECK-NEXT: vsseg7e8.v v17, (a0)
-; CHECK-NEXT: vl1r.v v8, (a6)
-; CHECK-NEXT: add a6, a7, a2
-; CHECK-NEXT: vl1r.v v10, (a4)
-; CHECK-NEXT: add a4, a6, a2
-; CHECK-NEXT: vl1r.v v12, (a6)
-; CHECK-NEXT: add a6, a4, a2
-; CHECK-NEXT: vl1r.v v14, (a6)
-; CHECK-NEXT: add a6, a5, a2
; CHECK-NEXT: vl1r.v v16, (a5)
-; CHECK-NEXT: add a5, a6, a2
-; CHECK-NEXT: vl1r.v v18, (a5)
; CHECK-NEXT: add a5, a5, a2
-; CHECK-NEXT: vl1r.v v9, (a7)
-; CHECK-NEXT: add a7, a5, a2
-; CHECK-NEXT: vl1r.v v20, (a7)
+; CHECK-NEXT: vl1r.v v18, (a7)
; CHECK-NEXT: add a7, a7, a2
; CHECK-NEXT: srli a2, a2, 1
-; CHECK-NEXT: vl1r.v v11, (a3)
+; CHECK-NEXT: vl1r.v v9, (a3)
; CHECK-NEXT: add a3, a1, a1
+; CHECK-NEXT: vl1r.v v17, (a5)
+; CHECK-NEXT: add a5, a2, a2
+; CHECK-NEXT: vl1r.v v11, (a0)
; CHECK-NEXT: vl1r.v v13, (a4)
-; CHECK-NEXT: add a4, a2, a2
-; CHECK-NEXT: vl1r.v v15, (a0)
-; CHECK-NEXT: vl1r.v v19, (a5)
-; CHECK-NEXT: vl1r.v v17, (a6)
-; CHECK-NEXT: vl1r.v v21, (a7)
+; CHECK-NEXT: vl1r.v v19, (a7)
+; CHECK-NEXT: vl1r.v v15, (a6)
; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT: vmsne.vi v22, v8, 0
-; CHECK-NEXT: vmsne.vi v0, v10, 0
-; CHECK-NEXT: vmsne.vi v9, v12, 0
-; CHECK-NEXT: vmsne.vi v10, v14, 0
-; CHECK-NEXT: vmsne.vi v11, v18, 0
-; CHECK-NEXT: vmsne.vi v8, v16, 0
-; CHECK-NEXT: vmsne.vi v12, v20, 0
+; CHECK-NEXT: vmsne.vi v20, v8, 0
+; CHECK-NEXT: vmsne.vi v9, v16, 0
+; CHECK-NEXT: vmsne.vi v16, v10, 0
+; CHECK-NEXT: vmsne.vi v0, v12, 0
+; CHECK-NEXT: vmsne.vi v10, v18, 0
+; CHECK-NEXT: vmsne.vi v8, v14, 0
; CHECK-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
-; CHECK-NEXT: vslideup.vx v0, v22, a1
-; CHECK-NEXT: vslideup.vx v9, v10, a1
-; CHECK-NEXT: vslideup.vx v8, v11, a1
-; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v20, a1
+; CHECK-NEXT: vslideup.vx v0, v16, a1
+; CHECK-NEXT: vsetvli zero, a5, e8, m1, ta, ma
; CHECK-NEXT: vslideup.vx v0, v9, a2
-; CHECK-NEXT: vslideup.vx v8, v12, a2
+; CHECK-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a1
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 14
+; CHECK-NEXT: li a1, 12
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv112i1_nxv16i1:
+; ZVBB-LABEL: vector_interleave_nxv96i1_nxv16i1:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: li a1, 14
+; ZVBB-NEXT: li a1, 12
; ZVBB-NEXT: mul a0, a0, a1
; ZVBB-NEXT: sub sp, sp, a0
; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; ZVBB-NEXT: vmv.v.i v14, 0
-; ZVBB-NEXT: addi a4, sp, 16
+; ZVBB-NEXT: vmv.v.i v20, 0
+; ZVBB-NEXT: vmerge.vim v14, v20, 1, v0
+; ZVBB-NEXT: vmv1r.v v0, v8
+; ZVBB-NEXT: vmerge.vim v22, v20, 1, v0
+; ZVBB-NEXT: vmv1r.v v0, v9
+; ZVBB-NEXT: vmv1r.v v16, v23
+; ZVBB-NEXT: vmerge.vim v8, v20, 1, v0
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 3
-; ZVBB-NEXT: sub a0, a1, a0
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
; ZVBB-NEXT: add a0, sp, a0
; ZVBB-NEXT: addi a0, a0, 16
-; ZVBB-NEXT: csrr a2, vlenb
-; ZVBB-NEXT: vmerge.vim v16, v14, 1, v0
-; ZVBB-NEXT: vmv1r.v v0, v8
-; ZVBB-NEXT: vmerge.vim v22, v14, 1, v0
-; ZVBB-NEXT: add a3, a4, a2
-; ZVBB-NEXT: srli a1, a2, 2
-; ZVBB-NEXT: add a5, a0, a2
-; ZVBB-NEXT: vmv4r.v v24, v16
-; ZVBB-NEXT: vmv1r.v v0, v9
-; ZVBB-NEXT: vmerge.vim v18, v14, 1, v0
-; ZVBB-NEXT: add a6, a3, a2
-; ZVBB-NEXT: vmv1r.v v25, v22
+; ZVBB-NEXT: vmv1r.v v17, v9
; ZVBB-NEXT: vmv1r.v v0, v10
-; ZVBB-NEXT: vmerge.vim v8, v14, 1, v0
-; ZVBB-NEXT: vmv1r.v v26, v18
+; ZVBB-NEXT: vmerge.vim v24, v20, 1, v0
+; ZVBB-NEXT: addi a5, sp, 16
+; ZVBB-NEXT: vmv1r.v v18, v25
; ZVBB-NEXT: vmv1r.v v0, v11
-; ZVBB-NEXT: vmerge.vim v20, v14, 1, v0
-; ZVBB-NEXT: vmv1r.v v27, v8
+; ZVBB-NEXT: vmerge.vim v26, v20, 1, v0
+; ZVBB-NEXT: csrr a2, vlenb
+; ZVBB-NEXT: vmv1r.v v19, v27
; ZVBB-NEXT: vmv1r.v v0, v12
-; ZVBB-NEXT: vmerge.vim v10, v14, 1, v0
-; ZVBB-NEXT: vmv1r.v v28, v20
-; ZVBB-NEXT: vmv1r.v v18, v23
-; ZVBB-NEXT: add a7, a6, a2
-; ZVBB-NEXT: vmv1r.v v29, v10
-; ZVBB-NEXT: vmv1r.v v20, v9
-; ZVBB-NEXT: vmv1r.v v0, v13
-; ZVBB-NEXT: vmerge.vim v30, v14, 1, v0
-; ZVBB-NEXT: vmv1r.v v22, v11
-; ZVBB-NEXT: vsetvli t0, zero, e8, m1, ta, ma
-; ZVBB-NEXT: vsseg7e8.v v24, (a4)
-; ZVBB-NEXT: vmv1r.v v23, v31
-; ZVBB-NEXT: vsseg7e8.v v17, (a0)
-; ZVBB-NEXT: vl1r.v v8, (a6)
-; ZVBB-NEXT: add a6, a7, a2
-; ZVBB-NEXT: vl1r.v v10, (a4)
-; ZVBB-NEXT: add a4, a6, a2
-; ZVBB-NEXT: vl1r.v v12, (a6)
+; ZVBB-NEXT: vmerge.vim v10, v20, 1, v0
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: vmv1r.v v20, v11
+; ZVBB-NEXT: vsetvli a1, zero, e8, m1, ta, ma
+; ZVBB-NEXT: vsseg6e8.v v15, (a0)
+; ZVBB-NEXT: vmv1r.v v15, v22
+; ZVBB-NEXT: add a4, a5, a2
+; ZVBB-NEXT: vmv1r.v v16, v8
+; ZVBB-NEXT: srli a1, a2, 2
+; ZVBB-NEXT: vmv1r.v v17, v24
; ZVBB-NEXT: add a6, a4, a2
-; ZVBB-NEXT: vl1r.v v14, (a6)
-; ZVBB-NEXT: add a6, a5, a2
+; ZVBB-NEXT: vmv1r.v v18, v26
+; ZVBB-NEXT: add a7, a3, a2
+; ZVBB-NEXT: vmv1r.v v19, v10
+; ZVBB-NEXT: vsseg6e8.v v14, (a5)
+; ZVBB-NEXT: vl1r.v v8, (a0)
+; ZVBB-NEXT: add a0, a6, a2
+; ZVBB-NEXT: vl1r.v v10, (a6)
+; ZVBB-NEXT: add a6, a7, a2
+; ZVBB-NEXT: vl1r.v v12, (a5)
+; ZVBB-NEXT: add a5, a0, a2
+; ZVBB-NEXT: vl1r.v v14, (a7)
+; ZVBB-NEXT: add a7, a6, a2
; ZVBB-NEXT: vl1r.v v16, (a5)
-; ZVBB-NEXT: add a5, a6, a2
-; ZVBB-NEXT: vl1r.v v18, (a5)
; ZVBB-NEXT: add a5, a5, a2
-; ZVBB-NEXT: vl1r.v v9, (a7)
-; ZVBB-NEXT: add a7, a5, a2
-; ZVBB-NEXT: vl1r.v v20, (a7)
+; ZVBB-NEXT: vl1r.v v18, (a7)
; ZVBB-NEXT: add a7, a7, a2
; ZVBB-NEXT: srli a2, a2, 1
-; ZVBB-NEXT: vl1r.v v11, (a3)
+; ZVBB-NEXT: vl1r.v v9, (a3)
; ZVBB-NEXT: add a3, a1, a1
+; ZVBB-NEXT: vl1r.v v17, (a5)
+; ZVBB-NEXT: add a5, a2, a2
+; ZVBB-NEXT: vl1r.v v11, (a0)
; ZVBB-NEXT: vl1r.v v13, (a4)
-; ZVBB-NEXT: add a4, a2, a2
-; ZVBB-NEXT: vl1r.v v15, (a0)
-; ZVBB-NEXT: vl1r.v v19, (a5)
-; ZVBB-NEXT: vl1r.v v17, (a6)
-; ZVBB-NEXT: vl1r.v v21, (a7)
+; ZVBB-NEXT: vl1r.v v19, (a7)
+; ZVBB-NEXT: vl1r.v v15, (a6)
; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
-; ZVBB-NEXT: vmsne.vi v22, v8, 0
-; ZVBB-NEXT: vmsne.vi v0, v10, 0
-; ZVBB-NEXT: vmsne.vi v9, v12, 0
-; ZVBB-NEXT: vmsne.vi v10, v14, 0
-; ZVBB-NEXT: vmsne.vi v11, v18, 0
-; ZVBB-NEXT: vmsne.vi v8, v16, 0
-; ZVBB-NEXT: vmsne.vi v12, v20, 0
+; ZVBB-NEXT: vmsne.vi v20, v8, 0
+; ZVBB-NEXT: vmsne.vi v9, v16, 0
+; ZVBB-NEXT: vmsne.vi v16, v10, 0
+; ZVBB-NEXT: vmsne.vi v0, v12, 0
+; ZVBB-NEXT: vmsne.vi v10, v18, 0
+; ZVBB-NEXT: vmsne.vi v8, v14, 0
; ZVBB-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
-; ZVBB-NEXT: vslideup.vx v0, v22, a1
-; ZVBB-NEXT: vslideup.vx v9, v10, a1
-; ZVBB-NEXT: vslideup.vx v8, v11, a1
-; ZVBB-NEXT: vsetvli zero, a4, e8, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v9, v20, a1
+; ZVBB-NEXT: vslideup.vx v0, v16, a1
+; ZVBB-NEXT: vsetvli zero, a5, e8, m1, ta, ma
; ZVBB-NEXT: vslideup.vx v0, v9, a2
-; ZVBB-NEXT: vslideup.vx v8, v12, a2
+; ZVBB-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v10, a1
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: li a1, 14
+; ZVBB-NEXT: li a1, 12
; ZVBB-NEXT: mul a0, a0, a1
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 112 x i1> @llvm.vector.interleave7.nxv112i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f, <vscale x 16 x i1> %g)
- ret <vscale x 112 x i1> %res
+ %res = call <vscale x 96 x i1> @llvm.vector.interleave6.nxv96i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f)
+ ret <vscale x 96 x i1> %res
}
-
-define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f, <vscale x 16 x i8> %g) nounwind {
+define <vscale x 96 x i8> @vector_interleave_nxv96i8_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f) nounwind {
;
-; RV32-LABEL: vector_interleave_nxv112i8_nxv16i8:
+; RV32-LABEL: vector_interleave_nxv96i8_nxv16i8:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -80
; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
; RV32-NEXT: addi s0, sp, 80
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 28
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: sub sp, sp, a0
; RV32-NEXT: andi sp, sp, -64
; RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; RV32-NEXT: vmv2r.v v26, v20
-; RV32-NEXT: addi a0, sp, 64
-; RV32-NEXT: vmv2r.v v24, v16
+; RV32-NEXT: vmv2r.v v20, v14
+; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: vmv2r.v v24, v10
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a2, a1, 3
-; RV32-NEXT: sub a1, a2, a1
+; RV32-NEXT: li a0, 6
+; RV32-NEXT: mul a1, a1, a0
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 64
-; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: vmv1r.v v10, v25
+; RV32-NEXT: vmv1r.v v11, v23
+; RV32-NEXT: vmv1r.v v12, v21
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv1r.v v13, v17
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: vmv2r.v v20, v8
-; RV32-NEXT: vmv1r.v v1, v20
-; RV32-NEXT: vmv1r.v v3, v22
-; RV32-NEXT: vmv1r.v v5, v24
-; RV32-NEXT: vmv1r.v v7, v26
+; RV32-NEXT: vmv1r.v v14, v19
+; RV32-NEXT: vsseg6e8.v v9, (a1)
+; RV32-NEXT: vmv1r.v v9, v24
+; RV32-NEXT: add a5, a1, a2
+; RV32-NEXT: vmv1r.v v10, v22
; RV32-NEXT: add a3, a0, a2
-; RV32-NEXT: vmv1r.v v2, v10
-; RV32-NEXT: add a4, a1, a2
-; RV32-NEXT: slli a5, a2, 2
-; RV32-NEXT: vmv1r.v v4, v14
-; RV32-NEXT: slli a6, a2, 4
-; RV32-NEXT: add a7, a4, a2
-; RV32-NEXT: vmv1r.v v6, v18
-; RV32-NEXT: sub a5, a6, a5
-; RV32-NEXT: vmv1r.v v22, v11
-; RV32-NEXT: add a6, a7, a2
-; RV32-NEXT: vmv1r.v v24, v15
-; RV32-NEXT: vsseg7e8.v v1, (a0)
-; RV32-NEXT: vmv1r.v v26, v19
-; RV32-NEXT: vsseg7e8.v v21, (a1)
-; RV32-NEXT: vl1r.v v18, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1r.v v19, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1r.v v20, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1r.v v21, (a6)
-; RV32-NEXT: add a6, a3, a2
-; RV32-NEXT: vl1r.v v10, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1r.v v11, (a6)
-; RV32-NEXT: vl1r.v v8, (a0)
-; RV32-NEXT: vl1r.v v16, (a4)
-; RV32-NEXT: vl1r.v v9, (a3)
-; RV32-NEXT: vl1r.v v17, (a7)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a3, 14
-; RV32-NEXT: mul a0, a0, a3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 64
-; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vmv1r.v v11, v20
+; RV32-NEXT: add a4, a3, a2
+; RV32-NEXT: vmv1r.v v12, v16
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vmv1r.v v13, v18
+; RV32-NEXT: vsseg6e8.v v8, (a0)
+; RV32-NEXT: vl1r.v v14, (a1)
+; RV32-NEXT: add a1, a6, a2
+; RV32-NEXT: vl1r.v v15, (a5)
+; RV32-NEXT: add a5, a1, a2
+; RV32-NEXT: vl1r.v v18, (a5)
+; RV32-NEXT: add a5, a5, a2
+; RV32-NEXT: vl1r.v v19, (a5)
+; RV32-NEXT: add a5, a4, a2
+; RV32-NEXT: vl1r.v v16, (a6)
+; RV32-NEXT: add a6, a5, a2
; RV32-NEXT: vl1r.v v12, (a6)
; RV32-NEXT: add a6, a6, a2
; RV32-NEXT: vl1r.v v13, (a6)
-; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: li a7, 12
+; RV32-NEXT: mul a6, a6, a7
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 64
+; RV32-NEXT: vl1r.v v17, (a1)
+; RV32-NEXT: vl1r.v v10, (a4)
+; RV32-NEXT: vl1r.v v11, (a5)
+; RV32-NEXT: vl1r.v v8, (a0)
+; RV32-NEXT: vl1r.v v9, (a3)
; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a0, a2
-; RV32-NEXT: vl1r.v v14, (a6)
-; RV32-NEXT: vl1r.v v15, (a1)
-; RV32-NEXT: add a5, a0, a5
-; RV32-NEXT: vs2r.v v20, (a5)
+; RV32-NEXT: add a2, a6, a2
; RV32-NEXT: vs4r.v v16, (a2)
-; RV32-NEXT: vs8r.v v8, (a0)
+; RV32-NEXT: vs8r.v v8, (a6)
; RV32-NEXT: vl8r.v v16, (a2)
-; RV32-NEXT: vl8r.v v8, (a0)
+; RV32-NEXT: vl8r.v v8, (a6)
; RV32-NEXT: addi sp, s0, -80
; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 80
; RV32-NEXT: ret
;
-; RV64-LABEL: vector_interleave_nxv112i8_nxv16i8:
+; RV64-LABEL: vector_interleave_nxv96i8_nxv16i8:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -80
; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; RV64-NEXT: addi s0, sp, 80
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: li a1, 28
+; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: sub sp, sp, a0
; RV64-NEXT: andi sp, sp, -64
; RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; RV64-NEXT: vmv2r.v v26, v20
-; RV64-NEXT: addi a0, sp, 64
-; RV64-NEXT: vmv2r.v v24, v16
+; RV64-NEXT: vmv2r.v v20, v14
+; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: vmv2r.v v24, v10
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a2, a1, 3
-; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: li a0, 6
+; RV64-NEXT: mul a1, a1, a0
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 64
-; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: vmv1r.v v10, v25
+; RV64-NEXT: vmv1r.v v11, v23
+; RV64-NEXT: vmv1r.v v12, v21
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv1r.v v13, v17
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: vmv2r.v v20, v8
-; RV64-NEXT: vmv1r.v v1, v20
-; RV64-NEXT: vmv1r.v v3, v22
-; RV64-NEXT: vmv1r.v v5, v24
-; RV64-NEXT: vmv1r.v v7, v26
+; RV64-NEXT: vmv1r.v v14, v19
+; RV64-NEXT: vsseg6e8.v v9, (a1)
+; RV64-NEXT: vmv1r.v v9, v24
+; RV64-NEXT: add a5, a1, a2
+; RV64-NEXT: vmv1r.v v10, v22
; RV64-NEXT: add a3, a0, a2
-; RV64-NEXT: vmv1r.v v2, v10
-; RV64-NEXT: add a4, a1, a2
-; RV64-NEXT: slli a5, a2, 2
-; RV64-NEXT: vmv1r.v v4, v14
-; RV64-NEXT: slli a6, a2, 4
-; RV64-NEXT: add a7, a4, a2
-; RV64-NEXT: vmv1r.v v6, v18
-; RV64-NEXT: sub a5, a6, a5
-; RV64-NEXT: vmv1r.v v22, v11
-; RV64-NEXT: add a6, a7, a2
-; RV64-NEXT: vmv1r.v v24, v15
-; RV64-NEXT: vsseg7e8.v v1, (a0)
-; RV64-NEXT: vmv1r.v v26, v19
-; RV64-NEXT: vsseg7e8.v v21, (a1)
-; RV64-NEXT: vl1r.v v18, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1r.v v19, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1r.v v20, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1r.v v21, (a6)
-; RV64-NEXT: add a6, a3, a2
-; RV64-NEXT: vl1r.v v10, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1r.v v11, (a6)
-; RV64-NEXT: vl1r.v v8, (a0)
-; RV64-NEXT: vl1r.v v16, (a4)
-; RV64-NEXT: vl1r.v v9, (a3)
-; RV64-NEXT: vl1r.v v17, (a7)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: li a3, 14
-; RV64-NEXT: mul a0, a0, a3
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 64
-; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vmv1r.v v11, v20
+; RV64-NEXT: add a4, a3, a2
+; RV64-NEXT: vmv1r.v v12, v16
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vmv1r.v v13, v18
+; RV64-NEXT: vsseg6e8.v v8, (a0)
+; RV64-NEXT: vl1r.v v14, (a1)
+; RV64-NEXT: add a1, a6, a2
+; RV64-NEXT: vl1r.v v15, (a5)
+; RV64-NEXT: add a5, a1, a2
+; RV64-NEXT: vl1r.v v18, (a5)
+; RV64-NEXT: add a5, a5, a2
+; RV64-NEXT: vl1r.v v19, (a5)
+; RV64-NEXT: add a5, a4, a2
+; RV64-NEXT: vl1r.v v16, (a6)
+; RV64-NEXT: add a6, a5, a2
; RV64-NEXT: vl1r.v v12, (a6)
; RV64-NEXT: add a6, a6, a2
; RV64-NEXT: vl1r.v v13, (a6)
-; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: csrr a6, vlenb
+; RV64-NEXT: li a7, 12
+; RV64-NEXT: mul a6, a6, a7
+; RV64-NEXT: add a6, sp, a6
+; RV64-NEXT: addi a6, a6, 64
+; RV64-NEXT: vl1r.v v17, (a1)
+; RV64-NEXT: vl1r.v v10, (a4)
+; RV64-NEXT: vl1r.v v11, (a5)
+; RV64-NEXT: vl1r.v v8, (a0)
+; RV64-NEXT: vl1r.v v9, (a3)
; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a0, a2
-; RV64-NEXT: vl1r.v v14, (a6)
-; RV64-NEXT: vl1r.v v15, (a1)
-; RV64-NEXT: add a5, a0, a5
-; RV64-NEXT: vs2r.v v20, (a5)
+; RV64-NEXT: add a2, a6, a2
; RV64-NEXT: vs4r.v v16, (a2)
-; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vs8r.v v8, (a6)
; RV64-NEXT: vl8r.v v16, (a2)
-; RV64-NEXT: vl8r.v v8, (a0)
+; RV64-NEXT: vl8r.v v8, (a6)
; RV64-NEXT: addi sp, s0, -80
; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: ret
;
-; ZVBB-RV32-LABEL: vector_interleave_nxv112i8_nxv16i8:
+; ZVBB-RV32-LABEL: vector_interleave_nxv96i8_nxv16i8:
; ZVBB-RV32: # %bb.0:
; ZVBB-RV32-NEXT: addi sp, sp, -80
; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
; ZVBB-RV32-NEXT: addi s0, sp, 80
; ZVBB-RV32-NEXT: csrr a0, vlenb
-; ZVBB-RV32-NEXT: slli a0, a0, 5
+; ZVBB-RV32-NEXT: li a1, 28
+; ZVBB-RV32-NEXT: mul a0, a0, a1
; ZVBB-RV32-NEXT: sub sp, sp, a0
; ZVBB-RV32-NEXT: andi sp, sp, -64
; ZVBB-RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; ZVBB-RV32-NEXT: vmv2r.v v26, v20
-; ZVBB-RV32-NEXT: addi a0, sp, 64
-; ZVBB-RV32-NEXT: vmv2r.v v24, v16
+; ZVBB-RV32-NEXT: vmv2r.v v20, v14
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: vmv2r.v v24, v10
; ZVBB-RV32-NEXT: csrr a1, vlenb
-; ZVBB-RV32-NEXT: slli a2, a1, 3
-; ZVBB-RV32-NEXT: sub a1, a2, a1
+; ZVBB-RV32-NEXT: li a0, 6
+; ZVBB-RV32-NEXT: mul a1, a1, a0
; ZVBB-RV32-NEXT: add a1, sp, a1
; ZVBB-RV32-NEXT: addi a1, a1, 64
-; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: vmv1r.v v10, v25
+; ZVBB-RV32-NEXT: vmv1r.v v11, v23
+; ZVBB-RV32-NEXT: vmv1r.v v12, v21
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv1r.v v13, v17
; ZVBB-RV32-NEXT: csrr a2, vlenb
-; ZVBB-RV32-NEXT: vmv2r.v v20, v8
-; ZVBB-RV32-NEXT: vmv1r.v v1, v20
-; ZVBB-RV32-NEXT: vmv1r.v v3, v22
-; ZVBB-RV32-NEXT: vmv1r.v v5, v24
-; ZVBB-RV32-NEXT: vmv1r.v v7, v26
+; ZVBB-RV32-NEXT: vmv1r.v v14, v19
+; ZVBB-RV32-NEXT: vsseg6e8.v v9, (a1)
+; ZVBB-RV32-NEXT: vmv1r.v v9, v24
+; ZVBB-RV32-NEXT: add a5, a1, a2
+; ZVBB-RV32-NEXT: vmv1r.v v10, v22
; ZVBB-RV32-NEXT: add a3, a0, a2
-; ZVBB-RV32-NEXT: vmv1r.v v2, v10
-; ZVBB-RV32-NEXT: add a4, a1, a2
-; ZVBB-RV32-NEXT: slli a5, a2, 2
-; ZVBB-RV32-NEXT: vmv1r.v v4, v14
-; ZVBB-RV32-NEXT: slli a6, a2, 4
-; ZVBB-RV32-NEXT: add a7, a4, a2
-; ZVBB-RV32-NEXT: vmv1r.v v6, v18
-; ZVBB-RV32-NEXT: sub a5, a6, a5
-; ZVBB-RV32-NEXT: vmv1r.v v22, v11
-; ZVBB-RV32-NEXT: add a6, a7, a2
-; ZVBB-RV32-NEXT: vmv1r.v v24, v15
-; ZVBB-RV32-NEXT: vsseg7e8.v v1, (a0)
-; ZVBB-RV32-NEXT: vmv1r.v v26, v19
-; ZVBB-RV32-NEXT: vsseg7e8.v v21, (a1)
-; ZVBB-RV32-NEXT: vl1r.v v18, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1r.v v19, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1r.v v20, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1r.v v21, (a6)
-; ZVBB-RV32-NEXT: add a6, a3, a2
-; ZVBB-RV32-NEXT: vl1r.v v10, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1r.v v11, (a6)
-; ZVBB-RV32-NEXT: vl1r.v v8, (a0)
-; ZVBB-RV32-NEXT: vl1r.v v16, (a4)
-; ZVBB-RV32-NEXT: vl1r.v v9, (a3)
-; ZVBB-RV32-NEXT: vl1r.v v17, (a7)
-; ZVBB-RV32-NEXT: csrr a0, vlenb
-; ZVBB-RV32-NEXT: li a3, 14
-; ZVBB-RV32-NEXT: mul a0, a0, a3
-; ZVBB-RV32-NEXT: add a0, sp, a0
-; ZVBB-RV32-NEXT: addi a0, a0, 64
-; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vmv1r.v v11, v20
+; ZVBB-RV32-NEXT: add a4, a3, a2
+; ZVBB-RV32-NEXT: vmv1r.v v12, v16
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vmv1r.v v13, v18
+; ZVBB-RV32-NEXT: vsseg6e8.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1r.v v14, (a1)
+; ZVBB-RV32-NEXT: add a1, a6, a2
+; ZVBB-RV32-NEXT: vl1r.v v15, (a5)
+; ZVBB-RV32-NEXT: add a5, a1, a2
+; ZVBB-RV32-NEXT: vl1r.v v18, (a5)
+; ZVBB-RV32-NEXT: add a5, a5, a2
+; ZVBB-RV32-NEXT: vl1r.v v19, (a5)
+; ZVBB-RV32-NEXT: add a5, a4, a2
+; ZVBB-RV32-NEXT: vl1r.v v16, (a6)
+; ZVBB-RV32-NEXT: add a6, a5, a2
; ZVBB-RV32-NEXT: vl1r.v v12, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
; ZVBB-RV32-NEXT: vl1r.v v13, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: csrr a6, vlenb
+; ZVBB-RV32-NEXT: li a7, 12
+; ZVBB-RV32-NEXT: mul a6, a6, a7
+; ZVBB-RV32-NEXT: add a6, sp, a6
+; ZVBB-RV32-NEXT: addi a6, a6, 64
+; ZVBB-RV32-NEXT: vl1r.v v17, (a1)
+; ZVBB-RV32-NEXT: vl1r.v v10, (a4)
+; ZVBB-RV32-NEXT: vl1r.v v11, (a5)
+; ZVBB-RV32-NEXT: vl1r.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1r.v v9, (a3)
; ZVBB-RV32-NEXT: slli a2, a2, 3
-; ZVBB-RV32-NEXT: add a2, a0, a2
-; ZVBB-RV32-NEXT: vl1r.v v14, (a6)
-; ZVBB-RV32-NEXT: vl1r.v v15, (a1)
-; ZVBB-RV32-NEXT: add a5, a0, a5
-; ZVBB-RV32-NEXT: vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT: add a2, a6, a2
; ZVBB-RV32-NEXT: vs4r.v v16, (a2)
-; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a6)
; ZVBB-RV32-NEXT: vl8r.v v16, (a2)
-; ZVBB-RV32-NEXT: vl8r.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8r.v v8, (a6)
; ZVBB-RV32-NEXT: addi sp, s0, -80
; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: addi sp, sp, 80
; ZVBB-RV32-NEXT: ret
;
-; ZVBB-RV64-LABEL: vector_interleave_nxv112i8_nxv16i8:
+; ZVBB-RV64-LABEL: vector_interleave_nxv96i8_nxv16i8:
; ZVBB-RV64: # %bb.0:
; ZVBB-RV64-NEXT: addi sp, sp, -80
; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; ZVBB-RV64-NEXT: addi s0, sp, 80
; ZVBB-RV64-NEXT: csrr a0, vlenb
-; ZVBB-RV64-NEXT: slli a0, a0, 5
+; ZVBB-RV64-NEXT: li a1, 28
+; ZVBB-RV64-NEXT: mul a0, a0, a1
; ZVBB-RV64-NEXT: sub sp, sp, a0
; ZVBB-RV64-NEXT: andi sp, sp, -64
; ZVBB-RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; ZVBB-RV64-NEXT: vmv2r.v v26, v20
-; ZVBB-RV64-NEXT: addi a0, sp, 64
-; ZVBB-RV64-NEXT: vmv2r.v v24, v16
+; ZVBB-RV64-NEXT: vmv2r.v v20, v14
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: vmv2r.v v24, v10
; ZVBB-RV64-NEXT: csrr a1, vlenb
-; ZVBB-RV64-NEXT: slli a2, a1, 3
-; ZVBB-RV64-NEXT: sub a1, a2, a1
+; ZVBB-RV64-NEXT: li a0, 6
+; ZVBB-RV64-NEXT: mul a1, a1, a0
; ZVBB-RV64-NEXT: add a1, sp, a1
; ZVBB-RV64-NEXT: addi a1, a1, 64
-; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: vmv1r.v v10, v25
+; ZVBB-RV64-NEXT: vmv1r.v v11, v23
+; ZVBB-RV64-NEXT: vmv1r.v v12, v21
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv1r.v v13, v17
; ZVBB-RV64-NEXT: csrr a2, vlenb
-; ZVBB-RV64-NEXT: vmv2r.v v20, v8
-; ZVBB-RV64-NEXT: vmv1r.v v1, v20
-; ZVBB-RV64-NEXT: vmv1r.v v3, v22
-; ZVBB-RV64-NEXT: vmv1r.v v5, v24
-; ZVBB-RV64-NEXT: vmv1r.v v7, v26
+; ZVBB-RV64-NEXT: vmv1r.v v14, v19
+; ZVBB-RV64-NEXT: vsseg6e8.v v9, (a1)
+; ZVBB-RV64-NEXT: vmv1r.v v9, v24
+; ZVBB-RV64-NEXT: add a5, a1, a2
+; ZVBB-RV64-NEXT: vmv1r.v v10, v22
; ZVBB-RV64-NEXT: add a3, a0, a2
-; ZVBB-RV64-NEXT: vmv1r.v v2, v10
-; ZVBB-RV64-NEXT: add a4, a1, a2
-; ZVBB-RV64-NEXT: slli a5, a2, 2
-; ZVBB-RV64-NEXT: vmv1r.v v4, v14
-; ZVBB-RV64-NEXT: slli a6, a2, 4
-; ZVBB-RV64-NEXT: add a7, a4, a2
-; ZVBB-RV64-NEXT: vmv1r.v v6, v18
-; ZVBB-RV64-NEXT: sub a5, a6, a5
-; ZVBB-RV64-NEXT: vmv1r.v v22, v11
-; ZVBB-RV64-NEXT: add a6, a7, a2
-; ZVBB-RV64-NEXT: vmv1r.v v24, v15
-; ZVBB-RV64-NEXT: vsseg7e8.v v1, (a0)
-; ZVBB-RV64-NEXT: vmv1r.v v26, v19
-; ZVBB-RV64-NEXT: vsseg7e8.v v21, (a1)
-; ZVBB-RV64-NEXT: vl1r.v v18, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1r.v v19, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1r.v v20, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1r.v v21, (a6)
-; ZVBB-RV64-NEXT: add a6, a3, a2
-; ZVBB-RV64-NEXT: vl1r.v v10, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1r.v v11, (a6)
-; ZVBB-RV64-NEXT: vl1r.v v8, (a0)
-; ZVBB-RV64-NEXT: vl1r.v v16, (a4)
-; ZVBB-RV64-NEXT: vl1r.v v9, (a3)
-; ZVBB-RV64-NEXT: vl1r.v v17, (a7)
-; ZVBB-RV64-NEXT: csrr a0, vlenb
-; ZVBB-RV64-NEXT: li a3, 14
-; ZVBB-RV64-NEXT: mul a0, a0, a3
-; ZVBB-RV64-NEXT: add a0, sp, a0
-; ZVBB-RV64-NEXT: addi a0, a0, 64
-; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vmv1r.v v11, v20
+; ZVBB-RV64-NEXT: add a4, a3, a2
+; ZVBB-RV64-NEXT: vmv1r.v v12, v16
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vmv1r.v v13, v18
+; ZVBB-RV64-NEXT: vsseg6e8.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1r.v v14, (a1)
+; ZVBB-RV64-NEXT: add a1, a6, a2
+; ZVBB-RV64-NEXT: vl1r.v v15, (a5)
+; ZVBB-RV64-NEXT: add a5, a1, a2
+; ZVBB-RV64-NEXT: vl1r.v v18, (a5)
+; ZVBB-RV64-NEXT: add a5, a5, a2
+; ZVBB-RV64-NEXT: vl1r.v v19, (a5)
+; ZVBB-RV64-NEXT: add a5, a4, a2
+; ZVBB-RV64-NEXT: vl1r.v v16, (a6)
+; ZVBB-RV64-NEXT: add a6, a5, a2
; ZVBB-RV64-NEXT: vl1r.v v12, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
; ZVBB-RV64-NEXT: vl1r.v v13, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: csrr a6, vlenb
+; ZVBB-RV64-NEXT: li a7, 12
+; ZVBB-RV64-NEXT: mul a6, a6, a7
+; ZVBB-RV64-NEXT: add a6, sp, a6
+; ZVBB-RV64-NEXT: addi a6, a6, 64
+; ZVBB-RV64-NEXT: vl1r.v v17, (a1)
+; ZVBB-RV64-NEXT: vl1r.v v10, (a4)
+; ZVBB-RV64-NEXT: vl1r.v v11, (a5)
+; ZVBB-RV64-NEXT: vl1r.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1r.v v9, (a3)
; ZVBB-RV64-NEXT: slli a2, a2, 3
-; ZVBB-RV64-NEXT: add a2, a0, a2
-; ZVBB-RV64-NEXT: vl1r.v v14, (a6)
-; ZVBB-RV64-NEXT: vl1r.v v15, (a1)
-; ZVBB-RV64-NEXT: add a5, a0, a5
-; ZVBB-RV64-NEXT: vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT: add a2, a6, a2
; ZVBB-RV64-NEXT: vs4r.v v16, (a2)
-; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a6)
; ZVBB-RV64-NEXT: vl8r.v v16, (a2)
-; ZVBB-RV64-NEXT: vl8r.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8r.v v8, (a6)
; ZVBB-RV64-NEXT: addi sp, s0, -80
; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: addi sp, sp, 80
; ZVBB-RV64-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv112i8_nxv16i8:
+; ZIP-LABEL: vector_interleave_nxv96i8_nxv16i8:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -80
; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; ZIP-NEXT: addi s0, sp, 80
; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: slli a0, a0, 5
+; ZIP-NEXT: li a1, 28
+; ZIP-NEXT: mul a0, a0, a1
; ZIP-NEXT: sub sp, sp, a0
; ZIP-NEXT: andi sp, sp, -64
; ZIP-NEXT: vsetvli a0, zero, e8, m1, ta, ma
-; ZIP-NEXT: vmv2r.v v26, v20
-; ZIP-NEXT: addi a0, sp, 64
-; ZIP-NEXT: vmv2r.v v24, v16
+; ZIP-NEXT: vmv2r.v v20, v14
+; ZIP-NEXT: vmv2r.v v22, v12
+; ZIP-NEXT: vmv2r.v v24, v10
; ZIP-NEXT: csrr a1, vlenb
-; ZIP-NEXT: slli a2, a1, 3
-; ZIP-NEXT: sub a1, a2, a1
+; ZIP-NEXT: li a0, 6
+; ZIP-NEXT: mul a1, a1, a0
; ZIP-NEXT: add a1, sp, a1
; ZIP-NEXT: addi a1, a1, 64
-; ZIP-NEXT: vmv2r.v v22, v12
+; ZIP-NEXT: vmv1r.v v10, v25
+; ZIP-NEXT: vmv1r.v v11, v23
+; ZIP-NEXT: vmv1r.v v12, v21
+; ZIP-NEXT: addi a0, sp, 64
+; ZIP-NEXT: vmv1r.v v13, v17
; ZIP-NEXT: csrr a2, vlenb
-; ZIP-NEXT: vmv2r.v v20, v8
-; ZIP-NEXT: vmv1r.v v1, v20
-; ZIP-NEXT: vmv1r.v v3, v22
-; ZIP-NEXT: vmv1r.v v5, v24
-; ZIP-NEXT: vmv1r.v v7, v26
+; ZIP-NEXT: vmv1r.v v14, v19
+; ZIP-NEXT: vsseg6e8.v v9, (a1)
+; ZIP-NEXT: vmv1r.v v9, v24
+; ZIP-NEXT: add a5, a1, a2
+; ZIP-NEXT: vmv1r.v v10, v22
; ZIP-NEXT: add a3, a0, a2
-; ZIP-NEXT: vmv1r.v v2, v10
-; ZIP-NEXT: add a4, a1, a2
-; ZIP-NEXT: slli a5, a2, 2
-; ZIP-NEXT: vmv1r.v v4, v14
-; ZIP-NEXT: slli a6, a2, 4
-; ZIP-NEXT: add a7, a4, a2
-; ZIP-NEXT: vmv1r.v v6, v18
-; ZIP-NEXT: sub a5, a6, a5
-; ZIP-NEXT: vmv1r.v v22, v11
-; ZIP-NEXT: add a6, a7, a2
-; ZIP-NEXT: vmv1r.v v24, v15
-; ZIP-NEXT: vsseg7e8.v v1, (a0)
-; ZIP-NEXT: vmv1r.v v26, v19
-; ZIP-NEXT: vsseg7e8.v v21, (a1)
-; ZIP-NEXT: vl1r.v v18, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1r.v v19, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1r.v v20, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1r.v v21, (a6)
-; ZIP-NEXT: add a6, a3, a2
-; ZIP-NEXT: vl1r.v v10, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1r.v v11, (a6)
-; ZIP-NEXT: vl1r.v v8, (a0)
-; ZIP-NEXT: vl1r.v v16, (a4)
-; ZIP-NEXT: vl1r.v v9, (a3)
-; ZIP-NEXT: vl1r.v v17, (a7)
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: li a3, 14
-; ZIP-NEXT: mul a0, a0, a3
-; ZIP-NEXT: add a0, sp, a0
-; ZIP-NEXT: addi a0, a0, 64
-; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vmv1r.v v11, v20
+; ZIP-NEXT: add a4, a3, a2
+; ZIP-NEXT: vmv1r.v v12, v16
+; ZIP-NEXT: add a6, a5, a2
+; ZIP-NEXT: vmv1r.v v13, v18
+; ZIP-NEXT: vsseg6e8.v v8, (a0)
+; ZIP-NEXT: vl1r.v v14, (a1)
+; ZIP-NEXT: add a1, a6, a2
+; ZIP-NEXT: vl1r.v v15, (a5)
+; ZIP-NEXT: add a5, a1, a2
+; ZIP-NEXT: vl1r.v v18, (a5)
+; ZIP-NEXT: add a5, a5, a2
+; ZIP-NEXT: vl1r.v v19, (a5)
+; ZIP-NEXT: add a5, a4, a2
+; ZIP-NEXT: vl1r.v v16, (a6)
+; ZIP-NEXT: add a6, a5, a2
; ZIP-NEXT: vl1r.v v12, (a6)
; ZIP-NEXT: add a6, a6, a2
; ZIP-NEXT: vl1r.v v13, (a6)
-; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: csrr a6, vlenb
+; ZIP-NEXT: li a7, 12
+; ZIP-NEXT: mul a6, a6, a7
+; ZIP-NEXT: add a6, sp, a6
+; ZIP-NEXT: addi a6, a6, 64
+; ZIP-NEXT: vl1r.v v17, (a1)
+; ZIP-NEXT: vl1r.v v10, (a4)
+; ZIP-NEXT: vl1r.v v11, (a5)
+; ZIP-NEXT: vl1r.v v8, (a0)
+; ZIP-NEXT: vl1r.v v9, (a3)
; ZIP-NEXT: slli a2, a2, 3
-; ZIP-NEXT: add a2, a0, a2
-; ZIP-NEXT: vl1r.v v14, (a6)
-; ZIP-NEXT: vl1r.v v15, (a1)
-; ZIP-NEXT: add a5, a0, a5
-; ZIP-NEXT: vs2r.v v20, (a5)
+; ZIP-NEXT: add a2, a6, a2
; ZIP-NEXT: vs4r.v v16, (a2)
-; ZIP-NEXT: vs8r.v v8, (a0)
+; ZIP-NEXT: vs8r.v v8, (a6)
; ZIP-NEXT: vl8r.v v16, (a2)
-; ZIP-NEXT: vl8r.v v8, (a0)
+; ZIP-NEXT: vl8r.v v8, (a6)
; ZIP-NEXT: addi sp, s0, -80
; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZIP-NEXT: addi sp, sp, 80
; ZIP-NEXT: ret
- %res = call <vscale x 112 x i8> @llvm.vector.interleave7.nxv112i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f, <vscale x 16 x i8> %g)
- ret <vscale x 112 x i8> %res
+ %res = call <vscale x 96 x i8> @llvm.vector.interleave6.nxv96i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f)
+ ret <vscale x 96 x i8> %res
}
-
-define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i16> %d, <vscale x 8 x i16> %e, <vscale x 8 x i16> %f, <vscale x 8 x i16> %g) nounwind {
+define <vscale x 48 x i8> @vector_interleave_nxv48i8_nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, <vscale x 8 x i8> %c, <vscale x 8 x i8> %d, <vscale x 8 x i8> %e, <vscale x 8 x i8> %f) nounwind {
+; CHECK-LABEL: vector_interleave_nxv48i8_nxv8i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e8, m1, ta, ma
+; CHECK-NEXT: vsseg6e8.v v8, (a0)
+; CHECK-NEXT: vl1r.v v10, (a3)
+; CHECK-NEXT: add a3, a3, a1
+; CHECK-NEXT: vl1r.v v11, (a3)
+; CHECK-NEXT: add a3, a3, a1
+; CHECK-NEXT: vl1r.v v8, (a0)
+; CHECK-NEXT: vl1r.v v9, (a2)
+; CHECK-NEXT: vl1r.v v12, (a3)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl1r.v v13, (a1)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; RV32-LABEL: vector_interleave_nxv56i16_nxv8i16:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -80
-; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
-; RV32-NEXT: addi s0, sp, 80
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
-; RV32-NEXT: sub sp, sp, a0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; RV32-NEXT: vmv2r.v v26, v20
-; RV32-NEXT: addi a0, sp, 64
-; RV32-NEXT: vmv2r.v v24, v16
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a2, a1, 3
-; RV32-NEXT: sub a1, a2, a1
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 64
-; RV32-NEXT: vmv2r.v v22, v12
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: vmv2r.v v20, v8
-; RV32-NEXT: vmv1r.v v1, v20
-; RV32-NEXT: vmv1r.v v3, v22
-; RV32-NEXT: vmv1r.v v5, v24
-; RV32-NEXT: vmv1r.v v7, v26
-; RV32-NEXT: add a3, a0, a2
-; RV32-NEXT: vmv1r.v v2, v10
-; RV32-NEXT: add a4, a1, a2
-; RV32-NEXT: slli a5, a2, 2
-; RV32-NEXT: vmv1r.v v4, v14
-; RV32-NEXT: slli a6, a2, 4
-; RV32-NEXT: add a7, a4, a2
-; RV32-NEXT: vmv1r.v v6, v18
-; RV32-NEXT: sub a5, a6, a5
-; RV32-NEXT: vmv1r.v v22, v11
-; RV32-NEXT: add a6, a7, a2
-; RV32-NEXT: vmv1r.v v24, v15
-; RV32-NEXT: vsseg7e16.v v1, (a0)
-; RV32-NEXT: vmv1r.v v26, v19
-; RV32-NEXT: vsseg7e16.v v21, (a1)
-; RV32-NEXT: vl1re16.v v18, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v19, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v20, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v21, (a6)
-; RV32-NEXT: add a6, a3, a2
-; RV32-NEXT: vl1re16.v v10, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v11, (a6)
-; RV32-NEXT: vl1re16.v v8, (a0)
-; RV32-NEXT: vl1re16.v v16, (a4)
-; RV32-NEXT: vl1re16.v v9, (a3)
-; RV32-NEXT: vl1re16.v v17, (a7)
+; ZVBB-LABEL: vector_interleave_nxv48i8_nxv8i8:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e8, m1, ta, ma
+; ZVBB-NEXT: vsseg6e8.v v8, (a0)
+; ZVBB-NEXT: vl1r.v v10, (a3)
+; ZVBB-NEXT: add a3, a3, a1
+; ZVBB-NEXT: vl1r.v v11, (a3)
+; ZVBB-NEXT: add a3, a3, a1
+; ZVBB-NEXT: vl1r.v v8, (a0)
+; ZVBB-NEXT: vl1r.v v9, (a2)
+; ZVBB-NEXT: vl1r.v v12, (a3)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl1r.v v13, (a1)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 48 x i8> @llvm.vector.interleave6.nxv48i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, <vscale x 8 x i8> %c, <vscale x 8 x i8> %d, <vscale x 8 x i8> %e, <vscale x 8 x i8> %f)
+ ret <vscale x 48 x i8> %res
+}
+
+define <vscale x 24 x i32> @vector_interleave_nxv24i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv24i32_nxv4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a3, 14
-; RV32-NEXT: mul a0, a0, a3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 64
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v12, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v13, (a6)
+; RV32-NEXT: li a1, 28
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT: vmv2r.v v20, v14
+; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: vmv2r.v v24, v10
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a0, 6
+; RV32-NEXT: mul a1, a1, a0
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: vmv1r.v v10, v25
+; RV32-NEXT: vmv1r.v v11, v23
+; RV32-NEXT: vmv1r.v v12, v21
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv1r.v v13, v17
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv1r.v v14, v19
+; RV32-NEXT: vsseg6e32.v v9, (a1)
+; RV32-NEXT: vmv1r.v v9, v24
+; RV32-NEXT: add a5, a1, a2
+; RV32-NEXT: vmv1r.v v10, v22
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v11, v20
+; RV32-NEXT: add a4, a3, a2
+; RV32-NEXT: vmv1r.v v12, v16
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vmv1r.v v13, v18
+; RV32-NEXT: vsseg6e32.v v8, (a0)
+; RV32-NEXT: vl1re32.v v14, (a1)
+; RV32-NEXT: add a1, a6, a2
+; RV32-NEXT: vl1re32.v v15, (a5)
+; RV32-NEXT: add a5, a1, a2
+; RV32-NEXT: vl1re32.v v18, (a5)
+; RV32-NEXT: add a5, a5, a2
+; RV32-NEXT: vl1re32.v v19, (a5)
+; RV32-NEXT: add a5, a4, a2
+; RV32-NEXT: vl1re32.v v16, (a6)
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vl1re32.v v12, (a6)
; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v13, (a6)
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: li a7, 12
+; RV32-NEXT: mul a6, a6, a7
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 64
+; RV32-NEXT: vl1re32.v v17, (a1)
+; RV32-NEXT: vl1re32.v v10, (a4)
+; RV32-NEXT: vl1re32.v v11, (a5)
+; RV32-NEXT: vl1re32.v v8, (a0)
+; RV32-NEXT: vl1re32.v v9, (a3)
; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a0, a2
-; RV32-NEXT: vl1re16.v v14, (a6)
-; RV32-NEXT: vl1re16.v v15, (a1)
-; RV32-NEXT: add a5, a0, a5
-; RV32-NEXT: vs2r.v v20, (a5)
+; RV32-NEXT: add a2, a6, a2
; RV32-NEXT: vs4r.v v16, (a2)
-; RV32-NEXT: vs8r.v v8, (a0)
-; RV32-NEXT: vl8re16.v v16, (a2)
-; RV32-NEXT: vl8re16.v v8, (a0)
+; RV32-NEXT: vs8r.v v8, (a6)
+; RV32-NEXT: vl8re32.v v16, (a2)
+; RV32-NEXT: vl8re32.v v8, (a6)
; RV32-NEXT: addi sp, s0, -80
; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 80
; RV32-NEXT: ret
;
-; RV64-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; RV64-LABEL: vector_interleave_nxv24i32_nxv4i32:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -80
; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; RV64-NEXT: addi s0, sp, 80
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: li a1, 28
+; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: sub sp, sp, a0
; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; RV64-NEXT: vmv2r.v v26, v20
-; RV64-NEXT: addi a0, sp, 64
-; RV64-NEXT: vmv2r.v v24, v16
+; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; RV64-NEXT: vmv2r.v v20, v14
+; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: vmv2r.v v24, v10
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a2, a1, 3
-; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: li a0, 6
+; RV64-NEXT: mul a1, a1, a0
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 64
-; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: vmv1r.v v10, v25
+; RV64-NEXT: vmv1r.v v11, v23
+; RV64-NEXT: vmv1r.v v12, v21
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv1r.v v13, v17
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: vmv2r.v v20, v8
-; RV64-NEXT: vmv1r.v v1, v20
-; RV64-NEXT: vmv1r.v v3, v22
-; RV64-NEXT: vmv1r.v v5, v24
-; RV64-NEXT: vmv1r.v v7, v26
+; RV64-NEXT: vmv1r.v v14, v19
+; RV64-NEXT: vsseg6e32.v v9, (a1)
+; RV64-NEXT: vmv1r.v v9, v24
+; RV64-NEXT: add a5, a1, a2
+; RV64-NEXT: vmv1r.v v10, v22
; RV64-NEXT: add a3, a0, a2
-; RV64-NEXT: vmv1r.v v2, v10
-; RV64-NEXT: add a4, a1, a2
-; RV64-NEXT: slli a5, a2, 2
-; RV64-NEXT: vmv1r.v v4, v14
-; RV64-NEXT: slli a6, a2, 4
-; RV64-NEXT: add a7, a4, a2
-; RV64-NEXT: vmv1r.v v6, v18
-; RV64-NEXT: sub a5, a6, a5
-; RV64-NEXT: vmv1r.v v22, v11
-; RV64-NEXT: add a6, a7, a2
-; RV64-NEXT: vmv1r.v v24, v15
-; RV64-NEXT: vsseg7e16.v v1, (a0)
-; RV64-NEXT: vmv1r.v v26, v19
-; RV64-NEXT: vsseg7e16.v v21, (a1)
-; RV64-NEXT: vl1re16.v v18, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v19, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v20, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v21, (a6)
-; RV64-NEXT: add a6, a3, a2
-; RV64-NEXT: vl1re16.v v10, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v11, (a6)
-; RV64-NEXT: vl1re16.v v8, (a0)
-; RV64-NEXT: vl1re16.v v16, (a4)
-; RV64-NEXT: vl1re16.v v9, (a3)
-; RV64-NEXT: vl1re16.v v17, (a7)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: li a3, 14
-; RV64-NEXT: mul a0, a0, a3
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 64
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v12, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v13, (a6)
+; RV64-NEXT: vmv1r.v v11, v20
+; RV64-NEXT: add a4, a3, a2
+; RV64-NEXT: vmv1r.v v12, v16
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vmv1r.v v13, v18
+; RV64-NEXT: vsseg6e32.v v8, (a0)
+; RV64-NEXT: vl1re32.v v14, (a1)
+; RV64-NEXT: add a1, a6, a2
+; RV64-NEXT: vl1re32.v v15, (a5)
+; RV64-NEXT: add a5, a1, a2
+; RV64-NEXT: vl1re32.v v18, (a5)
+; RV64-NEXT: add a5, a5, a2
+; RV64-NEXT: vl1re32.v v19, (a5)
+; RV64-NEXT: add a5, a4, a2
+; RV64-NEXT: vl1re32.v v16, (a6)
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vl1re32.v v12, (a6)
; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v13, (a6)
+; RV64-NEXT: csrr a6, vlenb
+; RV64-NEXT: li a7, 12
+; RV64-NEXT: mul a6, a6, a7
+; RV64-NEXT: add a6, sp, a6
+; RV64-NEXT: addi a6, a6, 64
+; RV64-NEXT: vl1re32.v v17, (a1)
+; RV64-NEXT: vl1re32.v v10, (a4)
+; RV64-NEXT: vl1re32.v v11, (a5)
+; RV64-NEXT: vl1re32.v v8, (a0)
+; RV64-NEXT: vl1re32.v v9, (a3)
; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a0, a2
-; RV64-NEXT: vl1re16.v v14, (a6)
-; RV64-NEXT: vl1re16.v v15, (a1)
-; RV64-NEXT: add a5, a0, a5
-; RV64-NEXT: vs2r.v v20, (a5)
+; RV64-NEXT: add a2, a6, a2
; RV64-NEXT: vs4r.v v16, (a2)
-; RV64-NEXT: vs8r.v v8, (a0)
-; RV64-NEXT: vl8re16.v v16, (a2)
-; RV64-NEXT: vl8re16.v v8, (a0)
+; RV64-NEXT: vs8r.v v8, (a6)
+; RV64-NEXT: vl8re32.v v16, (a2)
+; RV64-NEXT: vl8re32.v v8, (a6)
; RV64-NEXT: addi sp, s0, -80
; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: ret
;
-; ZVBB-RV32-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; ZVBB-RV32-LABEL: vector_interleave_nxv24i32_nxv4i32:
; ZVBB-RV32: # %bb.0:
; ZVBB-RV32-NEXT: addi sp, sp, -80
; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
; ZVBB-RV32-NEXT: addi s0, sp, 80
; ZVBB-RV32-NEXT: csrr a0, vlenb
-; ZVBB-RV32-NEXT: slli a0, a0, 5
+; ZVBB-RV32-NEXT: li a1, 28
+; ZVBB-RV32-NEXT: mul a0, a0, a1
; ZVBB-RV32-NEXT: sub sp, sp, a0
; ZVBB-RV32-NEXT: andi sp, sp, -64
-; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-RV32-NEXT: vmv2r.v v26, v20
-; ZVBB-RV32-NEXT: addi a0, sp, 64
-; ZVBB-RV32-NEXT: vmv2r.v v24, v16
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v20, v14
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: vmv2r.v v24, v10
; ZVBB-RV32-NEXT: csrr a1, vlenb
-; ZVBB-RV32-NEXT: slli a2, a1, 3
-; ZVBB-RV32-NEXT: sub a1, a2, a1
+; ZVBB-RV32-NEXT: li a0, 6
+; ZVBB-RV32-NEXT: mul a1, a1, a0
; ZVBB-RV32-NEXT: add a1, sp, a1
; ZVBB-RV32-NEXT: addi a1, a1, 64
-; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: vmv1r.v v10, v25
+; ZVBB-RV32-NEXT: vmv1r.v v11, v23
+; ZVBB-RV32-NEXT: vmv1r.v v12, v21
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv1r.v v13, v17
; ZVBB-RV32-NEXT: csrr a2, vlenb
-; ZVBB-RV32-NEXT: vmv2r.v v20, v8
-; ZVBB-RV32-NEXT: vmv1r.v v1, v20
-; ZVBB-RV32-NEXT: vmv1r.v v3, v22
-; ZVBB-RV32-NEXT: vmv1r.v v5, v24
-; ZVBB-RV32-NEXT: vmv1r.v v7, v26
+; ZVBB-RV32-NEXT: vmv1r.v v14, v19
+; ZVBB-RV32-NEXT: vsseg6e32.v v9, (a1)
+; ZVBB-RV32-NEXT: vmv1r.v v9, v24
+; ZVBB-RV32-NEXT: add a5, a1, a2
+; ZVBB-RV32-NEXT: vmv1r.v v10, v22
; ZVBB-RV32-NEXT: add a3, a0, a2
-; ZVBB-RV32-NEXT: vmv1r.v v2, v10
-; ZVBB-RV32-NEXT: add a4, a1, a2
-; ZVBB-RV32-NEXT: slli a5, a2, 2
-; ZVBB-RV32-NEXT: vmv1r.v v4, v14
-; ZVBB-RV32-NEXT: slli a6, a2, 4
-; ZVBB-RV32-NEXT: add a7, a4, a2
-; ZVBB-RV32-NEXT: vmv1r.v v6, v18
-; ZVBB-RV32-NEXT: sub a5, a6, a5
-; ZVBB-RV32-NEXT: vmv1r.v v22, v11
-; ZVBB-RV32-NEXT: add a6, a7, a2
-; ZVBB-RV32-NEXT: vmv1r.v v24, v15
-; ZVBB-RV32-NEXT: vsseg7e16.v v1, (a0)
-; ZVBB-RV32-NEXT: vmv1r.v v26, v19
-; ZVBB-RV32-NEXT: vsseg7e16.v v21, (a1)
-; ZVBB-RV32-NEXT: vl1re16.v v18, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v19, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v20, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v21, (a6)
-; ZVBB-RV32-NEXT: add a6, a3, a2
-; ZVBB-RV32-NEXT: vl1re16.v v10, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v11, (a6)
-; ZVBB-RV32-NEXT: vl1re16.v v8, (a0)
-; ZVBB-RV32-NEXT: vl1re16.v v16, (a4)
-; ZVBB-RV32-NEXT: vl1re16.v v9, (a3)
-; ZVBB-RV32-NEXT: vl1re16.v v17, (a7)
-; ZVBB-RV32-NEXT: csrr a0, vlenb
-; ZVBB-RV32-NEXT: li a3, 14
-; ZVBB-RV32-NEXT: mul a0, a0, a3
-; ZVBB-RV32-NEXT: add a0, sp, a0
-; ZVBB-RV32-NEXT: addi a0, a0, 64
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v12, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v13, (a6)
+; ZVBB-RV32-NEXT: vmv1r.v v11, v20
+; ZVBB-RV32-NEXT: add a4, a3, a2
+; ZVBB-RV32-NEXT: vmv1r.v v12, v16
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vmv1r.v v13, v18
+; ZVBB-RV32-NEXT: vsseg6e32.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re32.v v14, (a1)
+; ZVBB-RV32-NEXT: add a1, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v15, (a5)
+; ZVBB-RV32-NEXT: add a5, a1, a2
+; ZVBB-RV32-NEXT: vl1re32.v v18, (a5)
+; ZVBB-RV32-NEXT: add a5, a5, a2
+; ZVBB-RV32-NEXT: vl1re32.v v19, (a5)
+; ZVBB-RV32-NEXT: add a5, a4, a2
+; ZVBB-RV32-NEXT: vl1re32.v v16, (a6)
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vl1re32.v v12, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v13, (a6)
+; ZVBB-RV32-NEXT: csrr a6, vlenb
+; ZVBB-RV32-NEXT: li a7, 12
+; ZVBB-RV32-NEXT: mul a6, a6, a7
+; ZVBB-RV32-NEXT: add a6, sp, a6
+; ZVBB-RV32-NEXT: addi a6, a6, 64
+; ZVBB-RV32-NEXT: vl1re32.v v17, (a1)
+; ZVBB-RV32-NEXT: vl1re32.v v10, (a4)
+; ZVBB-RV32-NEXT: vl1re32.v v11, (a5)
+; ZVBB-RV32-NEXT: vl1re32.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re32.v v9, (a3)
; ZVBB-RV32-NEXT: slli a2, a2, 3
-; ZVBB-RV32-NEXT: add a2, a0, a2
-; ZVBB-RV32-NEXT: vl1re16.v v14, (a6)
-; ZVBB-RV32-NEXT: vl1re16.v v15, (a1)
-; ZVBB-RV32-NEXT: add a5, a0, a5
-; ZVBB-RV32-NEXT: vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT: add a2, a6, a2
; ZVBB-RV32-NEXT: vs4r.v v16, (a2)
-; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT: vl8re16.v v16, (a2)
-; ZVBB-RV32-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a6)
+; ZVBB-RV32-NEXT: vl8re32.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re32.v v8, (a6)
; ZVBB-RV32-NEXT: addi sp, s0, -80
; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: addi sp, sp, 80
; ZVBB-RV32-NEXT: ret
;
-; ZVBB-RV64-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; ZVBB-RV64-LABEL: vector_interleave_nxv24i32_nxv4i32:
; ZVBB-RV64: # %bb.0:
; ZVBB-RV64-NEXT: addi sp, sp, -80
; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; ZVBB-RV64-NEXT: addi s0, sp, 80
; ZVBB-RV64-NEXT: csrr a0, vlenb
-; ZVBB-RV64-NEXT: slli a0, a0, 5
+; ZVBB-RV64-NEXT: li a1, 28
+; ZVBB-RV64-NEXT: mul a0, a0, a1
; ZVBB-RV64-NEXT: sub sp, sp, a0
; ZVBB-RV64-NEXT: andi sp, sp, -64
-; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-RV64-NEXT: vmv2r.v v26, v20
-; ZVBB-RV64-NEXT: addi a0, sp, 64
-; ZVBB-RV64-NEXT: vmv2r.v v24, v16
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v20, v14
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: vmv2r.v v24, v10
; ZVBB-RV64-NEXT: csrr a1, vlenb
-; ZVBB-RV64-NEXT: slli a2, a1, 3
-; ZVBB-RV64-NEXT: sub a1, a2, a1
+; ZVBB-RV64-NEXT: li a0, 6
+; ZVBB-RV64-NEXT: mul a1, a1, a0
; ZVBB-RV64-NEXT: add a1, sp, a1
; ZVBB-RV64-NEXT: addi a1, a1, 64
-; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: vmv1r.v v10, v25
+; ZVBB-RV64-NEXT: vmv1r.v v11, v23
+; ZVBB-RV64-NEXT: vmv1r.v v12, v21
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv1r.v v13, v17
; ZVBB-RV64-NEXT: csrr a2, vlenb
-; ZVBB-RV64-NEXT: vmv2r.v v20, v8
-; ZVBB-RV64-NEXT: vmv1r.v v1, v20
-; ZVBB-RV64-NEXT: vmv1r.v v3, v22
-; ZVBB-RV64-NEXT: vmv1r.v v5, v24
-; ZVBB-RV64-NEXT: vmv1r.v v7, v26
+; ZVBB-RV64-NEXT: vmv1r.v v14, v19
+; ZVBB-RV64-NEXT: vsseg6e32.v v9, (a1)
+; ZVBB-RV64-NEXT: vmv1r.v v9, v24
+; ZVBB-RV64-NEXT: add a5, a1, a2
+; ZVBB-RV64-NEXT: vmv1r.v v10, v22
; ZVBB-RV64-NEXT: add a3, a0, a2
-; ZVBB-RV64-NEXT: vmv1r.v v2, v10
-; ZVBB-RV64-NEXT: add a4, a1, a2
-; ZVBB-RV64-NEXT: slli a5, a2, 2
-; ZVBB-RV64-NEXT: vmv1r.v v4, v14
-; ZVBB-RV64-NEXT: slli a6, a2, 4
-; ZVBB-RV64-NEXT: add a7, a4, a2
-; ZVBB-RV64-NEXT: vmv1r.v v6, v18
-; ZVBB-RV64-NEXT: sub a5, a6, a5
-; ZVBB-RV64-NEXT: vmv1r.v v22, v11
-; ZVBB-RV64-NEXT: add a6, a7, a2
-; ZVBB-RV64-NEXT: vmv1r.v v24, v15
-; ZVBB-RV64-NEXT: vsseg7e16.v v1, (a0)
-; ZVBB-RV64-NEXT: vmv1r.v v26, v19
-; ZVBB-RV64-NEXT: vsseg7e16.v v21, (a1)
-; ZVBB-RV64-NEXT: vl1re16.v v18, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v19, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v20, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v21, (a6)
-; ZVBB-RV64-NEXT: add a6, a3, a2
-; ZVBB-RV64-NEXT: vl1re16.v v10, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v11, (a6)
-; ZVBB-RV64-NEXT: vl1re16.v v8, (a0)
-; ZVBB-RV64-NEXT: vl1re16.v v16, (a4)
-; ZVBB-RV64-NEXT: vl1re16.v v9, (a3)
-; ZVBB-RV64-NEXT: vl1re16.v v17, (a7)
-; ZVBB-RV64-NEXT: csrr a0, vlenb
-; ZVBB-RV64-NEXT: li a3, 14
-; ZVBB-RV64-NEXT: mul a0, a0, a3
-; ZVBB-RV64-NEXT: add a0, sp, a0
-; ZVBB-RV64-NEXT: addi a0, a0, 64
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v12, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v13, (a6)
+; ZVBB-RV64-NEXT: vmv1r.v v11, v20
+; ZVBB-RV64-NEXT: add a4, a3, a2
+; ZVBB-RV64-NEXT: vmv1r.v v12, v16
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vmv1r.v v13, v18
+; ZVBB-RV64-NEXT: vsseg6e32.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re32.v v14, (a1)
+; ZVBB-RV64-NEXT: add a1, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v15, (a5)
+; ZVBB-RV64-NEXT: add a5, a1, a2
+; ZVBB-RV64-NEXT: vl1re32.v v18, (a5)
+; ZVBB-RV64-NEXT: add a5, a5, a2
+; ZVBB-RV64-NEXT: vl1re32.v v19, (a5)
+; ZVBB-RV64-NEXT: add a5, a4, a2
+; ZVBB-RV64-NEXT: vl1re32.v v16, (a6)
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vl1re32.v v12, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v13, (a6)
+; ZVBB-RV64-NEXT: csrr a6, vlenb
+; ZVBB-RV64-NEXT: li a7, 12
+; ZVBB-RV64-NEXT: mul a6, a6, a7
+; ZVBB-RV64-NEXT: add a6, sp, a6
+; ZVBB-RV64-NEXT: addi a6, a6, 64
+; ZVBB-RV64-NEXT: vl1re32.v v17, (a1)
+; ZVBB-RV64-NEXT: vl1re32.v v10, (a4)
+; ZVBB-RV64-NEXT: vl1re32.v v11, (a5)
+; ZVBB-RV64-NEXT: vl1re32.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re32.v v9, (a3)
; ZVBB-RV64-NEXT: slli a2, a2, 3
-; ZVBB-RV64-NEXT: add a2, a0, a2
-; ZVBB-RV64-NEXT: vl1re16.v v14, (a6)
-; ZVBB-RV64-NEXT: vl1re16.v v15, (a1)
-; ZVBB-RV64-NEXT: add a5, a0, a5
-; ZVBB-RV64-NEXT: vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT: add a2, a6, a2
; ZVBB-RV64-NEXT: vs4r.v v16, (a2)
-; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT: vl8re16.v v16, (a2)
-; ZVBB-RV64-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a6)
+; ZVBB-RV64-NEXT: vl8re32.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re32.v v8, (a6)
; ZVBB-RV64-NEXT: addi sp, s0, -80
; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: addi sp, sp, 80
; ZVBB-RV64-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; ZIP-LABEL: vector_interleave_nxv24i32_nxv4i32:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -80
; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; ZIP-NEXT: addi s0, sp, 80
; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: slli a0, a0, 5
+; ZIP-NEXT: li a1, 28
+; ZIP-NEXT: mul a0, a0, a1
; ZIP-NEXT: sub sp, sp, a0
; ZIP-NEXT: andi sp, sp, -64
-; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZIP-NEXT: vmv2r.v v26, v20
-; ZIP-NEXT: addi a0, sp, 64
-; ZIP-NEXT: vmv2r.v v24, v16
+; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZIP-NEXT: vmv2r.v v20, v14
+; ZIP-NEXT: vmv2r.v v22, v12
+; ZIP-NEXT: vmv2r.v v24, v10
; ZIP-NEXT: csrr a1, vlenb
-; ZIP-NEXT: slli a2, a1, 3
-; ZIP-NEXT: sub a1, a2, a1
+; ZIP-NEXT: li a0, 6
+; ZIP-NEXT: mul a1, a1, a0
; ZIP-NEXT: add a1, sp, a1
; ZIP-NEXT: addi a1, a1, 64
-; ZIP-NEXT: vmv2r.v v22, v12
+; ZIP-NEXT: vmv1r.v v10, v25
+; ZIP-NEXT: vmv1r.v v11, v23
+; ZIP-NEXT: vmv1r.v v12, v21
+; ZIP-NEXT: addi a0, sp, 64
+; ZIP-NEXT: vmv1r.v v13, v17
; ZIP-NEXT: csrr a2, vlenb
-; ZIP-NEXT: vmv2r.v v20, v8
-; ZIP-NEXT: vmv1r.v v1, v20
-; ZIP-NEXT: vmv1r.v v3, v22
-; ZIP-NEXT: vmv1r.v v5, v24
-; ZIP-NEXT: vmv1r.v v7, v26
+; ZIP-NEXT: vmv1r.v v14, v19
+; ZIP-NEXT: vsseg6e32.v v9, (a1)
+; ZIP-NEXT: vmv1r.v v9, v24
+; ZIP-NEXT: add a5, a1, a2
+; ZIP-NEXT: vmv1r.v v10, v22
; ZIP-NEXT: add a3, a0, a2
-; ZIP-NEXT: vmv1r.v v2, v10
-; ZIP-NEXT: add a4, a1, a2
-; ZIP-NEXT: slli a5, a2, 2
-; ZIP-NEXT: vmv1r.v v4, v14
-; ZIP-NEXT: slli a6, a2, 4
-; ZIP-NEXT: add a7, a4, a2
-; ZIP-NEXT: vmv1r.v v6, v18
-; ZIP-NEXT: sub a5, a6, a5
-; ZIP-NEXT: vmv1r.v v22, v11
-; ZIP-NEXT: add a6, a7, a2
-; ZIP-NEXT: vmv1r.v v24, v15
-; ZIP-NEXT: vsseg7e16.v v1, (a0)
-; ZIP-NEXT: vmv1r.v v26, v19
-; ZIP-NEXT: vsseg7e16.v v21, (a1)
-; ZIP-NEXT: vl1re16.v v18, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v19, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v20, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v21, (a6)
-; ZIP-NEXT: add a6, a3, a2
-; ZIP-NEXT: vl1re16.v v10, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v11, (a6)
-; ZIP-NEXT: vl1re16.v v8, (a0)
-; ZIP-NEXT: vl1re16.v v16, (a4)
-; ZIP-NEXT: vl1re16.v v9, (a3)
-; ZIP-NEXT: vl1re16.v v17, (a7)
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: li a3, 14
-; ZIP-NEXT: mul a0, a0, a3
-; ZIP-NEXT: add a0, sp, a0
-; ZIP-NEXT: addi a0, a0, 64
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v12, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v13, (a6)
+; ZIP-NEXT: vmv1r.v v11, v20
+; ZIP-NEXT: add a4, a3, a2
+; ZIP-NEXT: vmv1r.v v12, v16
+; ZIP-NEXT: add a6, a5, a2
+; ZIP-NEXT: vmv1r.v v13, v18
+; ZIP-NEXT: vsseg6e32.v v8, (a0)
+; ZIP-NEXT: vl1re32.v v14, (a1)
+; ZIP-NEXT: add a1, a6, a2
+; ZIP-NEXT: vl1re32.v v15, (a5)
+; ZIP-NEXT: add a5, a1, a2
+; ZIP-NEXT: vl1re32.v v18, (a5)
+; ZIP-NEXT: add a5, a5, a2
+; ZIP-NEXT: vl1re32.v v19, (a5)
+; ZIP-NEXT: add a5, a4, a2
+; ZIP-NEXT: vl1re32.v v16, (a6)
+; ZIP-NEXT: add a6, a5, a2
+; ZIP-NEXT: vl1re32.v v12, (a6)
; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re32.v v13, (a6)
+; ZIP-NEXT: csrr a6, vlenb
+; ZIP-NEXT: li a7, 12
+; ZIP-NEXT: mul a6, a6, a7
+; ZIP-NEXT: add a6, sp, a6
+; ZIP-NEXT: addi a6, a6, 64
+; ZIP-NEXT: vl1re32.v v17, (a1)
+; ZIP-NEXT: vl1re32.v v10, (a4)
+; ZIP-NEXT: vl1re32.v v11, (a5)
+; ZIP-NEXT: vl1re32.v v8, (a0)
+; ZIP-NEXT: vl1re32.v v9, (a3)
; ZIP-NEXT: slli a2, a2, 3
-; ZIP-NEXT: add a2, a0, a2
-; ZIP-NEXT: vl1re16.v v14, (a6)
-; ZIP-NEXT: vl1re16.v v15, (a1)
-; ZIP-NEXT: add a5, a0, a5
-; ZIP-NEXT: vs2r.v v20, (a5)
+; ZIP-NEXT: add a2, a6, a2
; ZIP-NEXT: vs4r.v v16, (a2)
-; ZIP-NEXT: vs8r.v v8, (a0)
-; ZIP-NEXT: vl8re16.v v16, (a2)
-; ZIP-NEXT: vl8re16.v v8, (a0)
+; ZIP-NEXT: vs8r.v v8, (a6)
+; ZIP-NEXT: vl8re32.v v16, (a2)
+; ZIP-NEXT: vl8re32.v v8, (a6)
; ZIP-NEXT: addi sp, s0, -80
; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZIP-NEXT: addi sp, sp, 80
; ZIP-NEXT: ret
- %res = call <vscale x 56 x i16> @llvm.vector.interleave7.nxv56i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i16> %d, <vscale x 8 x i16> %e, <vscale x 8 x i16> %f, <vscale x 8 x i16> %g)
- ret <vscale x 56 x i16> %res
+ %res = call <vscale x 24 x i32> @llvm.vector.interleave6.nxv24i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f)
+ ret <vscale x 24 x i32> %res
}
-
-define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f, <vscale x 4 x i32> %g) nounwind {
+define <vscale x 12 x i64> @vector_interleave_nxv12i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f) nounwind {
;
-; RV32-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; RV32-LABEL: vector_interleave_nxv12i64_nxv2i64:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -80
; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
; RV32-NEXT: addi s0, sp, 80
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: li a1, 28
+; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: sub sp, sp, a0
; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv2r.v v26, v20
-; RV32-NEXT: addi a0, sp, 64
-; RV32-NEXT: vmv2r.v v24, v16
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT: vmv2r.v v20, v14
+; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: vmv2r.v v24, v10
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a2, a1, 3
-; RV32-NEXT: sub a1, a2, a1
+; RV32-NEXT: li a0, 6
+; RV32-NEXT: mul a1, a1, a0
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 64
-; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: vmv1r.v v10, v25
+; RV32-NEXT: vmv1r.v v11, v23
+; RV32-NEXT: vmv1r.v v12, v21
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv1r.v v13, v17
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: vmv2r.v v20, v8
-; RV32-NEXT: vmv1r.v v1, v20
-; RV32-NEXT: vmv1r.v v3, v22
-; RV32-NEXT: vmv1r.v v5, v24
-; RV32-NEXT: vmv1r.v v7, v26
+; RV32-NEXT: vmv1r.v v14, v19
+; RV32-NEXT: vsseg6e64.v v9, (a1)
+; RV32-NEXT: vmv1r.v v9, v24
+; RV32-NEXT: add a5, a1, a2
+; RV32-NEXT: vmv1r.v v10, v22
; RV32-NEXT: add a3, a0, a2
-; RV32-NEXT: vmv1r.v v2, v10
-; RV32-NEXT: add a4, a1, a2
-; RV32-NEXT: slli a5, a2, 2
-; RV32-NEXT: vmv1r.v v4, v14
-; RV32-NEXT: slli a6, a2, 4
-; RV32-NEXT: add a7, a4, a2
-; RV32-NEXT: vmv1r.v v6, v18
-; RV32-NEXT: sub a5, a6, a5
-; RV32-NEXT: vmv1r.v v22, v11
-; RV32-NEXT: add a6, a7, a2
-; RV32-NEXT: vmv1r.v v24, v15
-; RV32-NEXT: vsseg7e32.v v1, (a0)
-; RV32-NEXT: vmv1r.v v26, v19
-; RV32-NEXT: vsseg7e32.v v21, (a1)
-; RV32-NEXT: vl1re32.v v18, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re32.v v19, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re32.v v20, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re32.v v21, (a6)
-; RV32-NEXT: add a6, a3, a2
-; RV32-NEXT: vl1re32.v v10, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re32.v v11, (a6)
-; RV32-NEXT: vl1re32.v v8, (a0)
-; RV32-NEXT: vl1re32.v v16, (a4)
-; RV32-NEXT: vl1re32.v v9, (a3)
-; RV32-NEXT: vl1re32.v v17, (a7)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a3, 14
-; RV32-NEXT: mul a0, a0, a3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 64
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re32.v v12, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re32.v v13, (a6)
+; RV32-NEXT: vmv1r.v v11, v20
+; RV32-NEXT: add a4, a3, a2
+; RV32-NEXT: vmv1r.v v12, v16
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vmv1r.v v13, v18
+; RV32-NEXT: vsseg6e64.v v8, (a0)
+; RV32-NEXT: vl1re64.v v14, (a1)
+; RV32-NEXT: add a1, a6, a2
+; RV32-NEXT: vl1re64.v v15, (a5)
+; RV32-NEXT: add a5, a1, a2
+; RV32-NEXT: vl1re64.v v18, (a5)
+; RV32-NEXT: add a5, a5, a2
+; RV32-NEXT: vl1re64.v v19, (a5)
+; RV32-NEXT: add a5, a4, a2
+; RV32-NEXT: vl1re64.v v16, (a6)
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vl1re64.v v12, (a6)
; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v13, (a6)
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: li a7, 12
+; RV32-NEXT: mul a6, a6, a7
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 64
+; RV32-NEXT: vl1re64.v v17, (a1)
+; RV32-NEXT: vl1re64.v v10, (a4)
+; RV32-NEXT: vl1re64.v v11, (a5)
+; RV32-NEXT: vl1re64.v v8, (a0)
+; RV32-NEXT: vl1re64.v v9, (a3)
; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a0, a2
-; RV32-NEXT: vl1re32.v v14, (a6)
-; RV32-NEXT: vl1re32.v v15, (a1)
-; RV32-NEXT: add a5, a0, a5
-; RV32-NEXT: vs2r.v v20, (a5)
+; RV32-NEXT: add a2, a6, a2
; RV32-NEXT: vs4r.v v16, (a2)
-; RV32-NEXT: vs8r.v v8, (a0)
-; RV32-NEXT: vl8re32.v v16, (a2)
-; RV32-NEXT: vl8re32.v v8, (a0)
+; RV32-NEXT: vs8r.v v8, (a6)
+; RV32-NEXT: vl8re64.v v16, (a2)
+; RV32-NEXT: vl8re64.v v8, (a6)
; RV32-NEXT: addi sp, s0, -80
; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 80
; RV32-NEXT: ret
;
-; RV64-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; RV64-LABEL: vector_interleave_nxv12i64_nxv2i64:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -80
; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; RV64-NEXT: addi s0, sp, 80
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: li a1, 28
+; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: sub sp, sp, a0
; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; RV64-NEXT: vmv2r.v v26, v20
-; RV64-NEXT: addi a0, sp, 64
-; RV64-NEXT: vmv2r.v v24, v16
+; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv2r.v v20, v14
+; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: vmv2r.v v24, v10
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a2, a1, 3
-; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: li a0, 6
+; RV64-NEXT: mul a1, a1, a0
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 64
-; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: vmv1r.v v10, v25
+; RV64-NEXT: vmv1r.v v11, v23
+; RV64-NEXT: vmv1r.v v12, v21
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv1r.v v13, v17
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: vmv2r.v v20, v8
-; RV64-NEXT: vmv1r.v v1, v20
-; RV64-NEXT: vmv1r.v v3, v22
-; RV64-NEXT: vmv1r.v v5, v24
-; RV64-NEXT: vmv1r.v v7, v26
+; RV64-NEXT: vmv1r.v v14, v19
+; RV64-NEXT: vsseg6e64.v v9, (a1)
+; RV64-NEXT: vmv1r.v v9, v24
+; RV64-NEXT: add a5, a1, a2
+; RV64-NEXT: vmv1r.v v10, v22
; RV64-NEXT: add a3, a0, a2
-; RV64-NEXT: vmv1r.v v2, v10
-; RV64-NEXT: add a4, a1, a2
-; RV64-NEXT: slli a5, a2, 2
-; RV64-NEXT: vmv1r.v v4, v14
-; RV64-NEXT: slli a6, a2, 4
-; RV64-NEXT: add a7, a4, a2
-; RV64-NEXT: vmv1r.v v6, v18
-; RV64-NEXT: sub a5, a6, a5
-; RV64-NEXT: vmv1r.v v22, v11
-; RV64-NEXT: add a6, a7, a2
-; RV64-NEXT: vmv1r.v v24, v15
-; RV64-NEXT: vsseg7e32.v v1, (a0)
-; RV64-NEXT: vmv1r.v v26, v19
-; RV64-NEXT: vsseg7e32.v v21, (a1)
-; RV64-NEXT: vl1re32.v v18, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re32.v v19, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re32.v v20, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re32.v v21, (a6)
-; RV64-NEXT: add a6, a3, a2
-; RV64-NEXT: vl1re32.v v10, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re32.v v11, (a6)
-; RV64-NEXT: vl1re32.v v8, (a0)
-; RV64-NEXT: vl1re32.v v16, (a4)
-; RV64-NEXT: vl1re32.v v9, (a3)
-; RV64-NEXT: vl1re32.v v17, (a7)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: li a3, 14
-; RV64-NEXT: mul a0, a0, a3
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 64
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re32.v v12, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re32.v v13, (a6)
+; RV64-NEXT: vmv1r.v v11, v20
+; RV64-NEXT: add a4, a3, a2
+; RV64-NEXT: vmv1r.v v12, v16
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vmv1r.v v13, v18
+; RV64-NEXT: vsseg6e64.v v8, (a0)
+; RV64-NEXT: vl1re64.v v14, (a1)
+; RV64-NEXT: add a1, a6, a2
+; RV64-NEXT: vl1re64.v v15, (a5)
+; RV64-NEXT: add a5, a1, a2
+; RV64-NEXT: vl1re64.v v18, (a5)
+; RV64-NEXT: add a5, a5, a2
+; RV64-NEXT: vl1re64.v v19, (a5)
+; RV64-NEXT: add a5, a4, a2
+; RV64-NEXT: vl1re64.v v16, (a6)
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vl1re64.v v12, (a6)
; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v13, (a6)
+; RV64-NEXT: csrr a6, vlenb
+; RV64-NEXT: li a7, 12
+; RV64-NEXT: mul a6, a6, a7
+; RV64-NEXT: add a6, sp, a6
+; RV64-NEXT: addi a6, a6, 64
+; RV64-NEXT: vl1re64.v v17, (a1)
+; RV64-NEXT: vl1re64.v v10, (a4)
+; RV64-NEXT: vl1re64.v v11, (a5)
+; RV64-NEXT: vl1re64.v v8, (a0)
+; RV64-NEXT: vl1re64.v v9, (a3)
; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a0, a2
-; RV64-NEXT: vl1re32.v v14, (a6)
-; RV64-NEXT: vl1re32.v v15, (a1)
-; RV64-NEXT: add a5, a0, a5
-; RV64-NEXT: vs2r.v v20, (a5)
+; RV64-NEXT: add a2, a6, a2
; RV64-NEXT: vs4r.v v16, (a2)
-; RV64-NEXT: vs8r.v v8, (a0)
-; RV64-NEXT: vl8re32.v v16, (a2)
-; RV64-NEXT: vl8re32.v v8, (a0)
+; RV64-NEXT: vs8r.v v8, (a6)
+; RV64-NEXT: vl8re64.v v16, (a2)
+; RV64-NEXT: vl8re64.v v8, (a6)
; RV64-NEXT: addi sp, s0, -80
; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: ret
;
-; ZVBB-RV32-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; ZVBB-RV32-LABEL: vector_interleave_nxv12i64_nxv2i64:
; ZVBB-RV32: # %bb.0:
; ZVBB-RV32-NEXT: addi sp, sp, -80
; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
; ZVBB-RV32-NEXT: addi s0, sp, 80
; ZVBB-RV32-NEXT: csrr a0, vlenb
-; ZVBB-RV32-NEXT: slli a0, a0, 5
+; ZVBB-RV32-NEXT: li a1, 28
+; ZVBB-RV32-NEXT: mul a0, a0, a1
; ZVBB-RV32-NEXT: sub sp, sp, a0
; ZVBB-RV32-NEXT: andi sp, sp, -64
-; ZVBB-RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; ZVBB-RV32-NEXT: vmv2r.v v26, v20
-; ZVBB-RV32-NEXT: addi a0, sp, 64
-; ZVBB-RV32-NEXT: vmv2r.v v24, v16
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v20, v14
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: vmv2r.v v24, v10
; ZVBB-RV32-NEXT: csrr a1, vlenb
-; ZVBB-RV32-NEXT: slli a2, a1, 3
-; ZVBB-RV32-NEXT: sub a1, a2, a1
+; ZVBB-RV32-NEXT: li a0, 6
+; ZVBB-RV32-NEXT: mul a1, a1, a0
; ZVBB-RV32-NEXT: add a1, sp, a1
; ZVBB-RV32-NEXT: addi a1, a1, 64
-; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: vmv1r.v v10, v25
+; ZVBB-RV32-NEXT: vmv1r.v v11, v23
+; ZVBB-RV32-NEXT: vmv1r.v v12, v21
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv1r.v v13, v17
; ZVBB-RV32-NEXT: csrr a2, vlenb
-; ZVBB-RV32-NEXT: vmv2r.v v20, v8
-; ZVBB-RV32-NEXT: vmv1r.v v1, v20
-; ZVBB-RV32-NEXT: vmv1r.v v3, v22
-; ZVBB-RV32-NEXT: vmv1r.v v5, v24
-; ZVBB-RV32-NEXT: vmv1r.v v7, v26
+; ZVBB-RV32-NEXT: vmv1r.v v14, v19
+; ZVBB-RV32-NEXT: vsseg6e64.v v9, (a1)
+; ZVBB-RV32-NEXT: vmv1r.v v9, v24
+; ZVBB-RV32-NEXT: add a5, a1, a2
+; ZVBB-RV32-NEXT: vmv1r.v v10, v22
; ZVBB-RV32-NEXT: add a3, a0, a2
-; ZVBB-RV32-NEXT: vmv1r.v v2, v10
-; ZVBB-RV32-NEXT: add a4, a1, a2
-; ZVBB-RV32-NEXT: slli a5, a2, 2
-; ZVBB-RV32-NEXT: vmv1r.v v4, v14
-; ZVBB-RV32-NEXT: slli a6, a2, 4
-; ZVBB-RV32-NEXT: add a7, a4, a2
-; ZVBB-RV32-NEXT: vmv1r.v v6, v18
-; ZVBB-RV32-NEXT: sub a5, a6, a5
-; ZVBB-RV32-NEXT: vmv1r.v v22, v11
-; ZVBB-RV32-NEXT: add a6, a7, a2
-; ZVBB-RV32-NEXT: vmv1r.v v24, v15
-; ZVBB-RV32-NEXT: vsseg7e32.v v1, (a0)
-; ZVBB-RV32-NEXT: vmv1r.v v26, v19
-; ZVBB-RV32-NEXT: vsseg7e32.v v21, (a1)
-; ZVBB-RV32-NEXT: vl1re32.v v18, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re32.v v19, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re32.v v20, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re32.v v21, (a6)
-; ZVBB-RV32-NEXT: add a6, a3, a2
-; ZVBB-RV32-NEXT: vl1re32.v v10, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re32.v v11, (a6)
-; ZVBB-RV32-NEXT: vl1re32.v v8, (a0)
-; ZVBB-RV32-NEXT: vl1re32.v v16, (a4)
-; ZVBB-RV32-NEXT: vl1re32.v v9, (a3)
-; ZVBB-RV32-NEXT: vl1re32.v v17, (a7)
-; ZVBB-RV32-NEXT: csrr a0, vlenb
-; ZVBB-RV32-NEXT: li a3, 14
-; ZVBB-RV32-NEXT: mul a0, a0, a3
-; ZVBB-RV32-NEXT: add a0, sp, a0
-; ZVBB-RV32-NEXT: addi a0, a0, 64
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re32.v v12, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re32.v v13, (a6)
+; ZVBB-RV32-NEXT: vmv1r.v v11, v20
+; ZVBB-RV32-NEXT: add a4, a3, a2
+; ZVBB-RV32-NEXT: vmv1r.v v12, v16
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vmv1r.v v13, v18
+; ZVBB-RV32-NEXT: vsseg6e64.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re64.v v14, (a1)
+; ZVBB-RV32-NEXT: add a1, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v15, (a5)
+; ZVBB-RV32-NEXT: add a5, a1, a2
+; ZVBB-RV32-NEXT: vl1re64.v v18, (a5)
+; ZVBB-RV32-NEXT: add a5, a5, a2
+; ZVBB-RV32-NEXT: vl1re64.v v19, (a5)
+; ZVBB-RV32-NEXT: add a5, a4, a2
+; ZVBB-RV32-NEXT: vl1re64.v v16, (a6)
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vl1re64.v v12, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v13, (a6)
+; ZVBB-RV32-NEXT: csrr a6, vlenb
+; ZVBB-RV32-NEXT: li a7, 12
+; ZVBB-RV32-NEXT: mul a6, a6, a7
+; ZVBB-RV32-NEXT: add a6, sp, a6
+; ZVBB-RV32-NEXT: addi a6, a6, 64
+; ZVBB-RV32-NEXT: vl1re64.v v17, (a1)
+; ZVBB-RV32-NEXT: vl1re64.v v10, (a4)
+; ZVBB-RV32-NEXT: vl1re64.v v11, (a5)
+; ZVBB-RV32-NEXT: vl1re64.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re64.v v9, (a3)
; ZVBB-RV32-NEXT: slli a2, a2, 3
-; ZVBB-RV32-NEXT: add a2, a0, a2
-; ZVBB-RV32-NEXT: vl1re32.v v14, (a6)
-; ZVBB-RV32-NEXT: vl1re32.v v15, (a1)
-; ZVBB-RV32-NEXT: add a5, a0, a5
-; ZVBB-RV32-NEXT: vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT: add a2, a6, a2
; ZVBB-RV32-NEXT: vs4r.v v16, (a2)
-; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT: vl8re32.v v16, (a2)
-; ZVBB-RV32-NEXT: vl8re32.v v8, (a0)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a6)
+; ZVBB-RV32-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re64.v v8, (a6)
; ZVBB-RV32-NEXT: addi sp, s0, -80
; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: addi sp, sp, 80
; ZVBB-RV32-NEXT: ret
;
-; ZVBB-RV64-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; ZVBB-RV64-LABEL: vector_interleave_nxv12i64_nxv2i64:
; ZVBB-RV64: # %bb.0:
; ZVBB-RV64-NEXT: addi sp, sp, -80
; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; ZVBB-RV64-NEXT: addi s0, sp, 80
; ZVBB-RV64-NEXT: csrr a0, vlenb
-; ZVBB-RV64-NEXT: slli a0, a0, 5
+; ZVBB-RV64-NEXT: li a1, 28
+; ZVBB-RV64-NEXT: mul a0, a0, a1
; ZVBB-RV64-NEXT: sub sp, sp, a0
; ZVBB-RV64-NEXT: andi sp, sp, -64
-; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; ZVBB-RV64-NEXT: vmv2r.v v26, v20
-; ZVBB-RV64-NEXT: addi a0, sp, 64
-; ZVBB-RV64-NEXT: vmv2r.v v24, v16
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v20, v14
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: vmv2r.v v24, v10
; ZVBB-RV64-NEXT: csrr a1, vlenb
-; ZVBB-RV64-NEXT: slli a2, a1, 3
-; ZVBB-RV64-NEXT: sub a1, a2, a1
+; ZVBB-RV64-NEXT: li a0, 6
+; ZVBB-RV64-NEXT: mul a1, a1, a0
; ZVBB-RV64-NEXT: add a1, sp, a1
; ZVBB-RV64-NEXT: addi a1, a1, 64
-; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: vmv1r.v v10, v25
+; ZVBB-RV64-NEXT: vmv1r.v v11, v23
+; ZVBB-RV64-NEXT: vmv1r.v v12, v21
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv1r.v v13, v17
; ZVBB-RV64-NEXT: csrr a2, vlenb
-; ZVBB-RV64-NEXT: vmv2r.v v20, v8
-; ZVBB-RV64-NEXT: vmv1r.v v1, v20
-; ZVBB-RV64-NEXT: vmv1r.v v3, v22
-; ZVBB-RV64-NEXT: vmv1r.v v5, v24
-; ZVBB-RV64-NEXT: vmv1r.v v7, v26
+; ZVBB-RV64-NEXT: vmv1r.v v14, v19
+; ZVBB-RV64-NEXT: vsseg6e64.v v9, (a1)
+; ZVBB-RV64-NEXT: vmv1r.v v9, v24
+; ZVBB-RV64-NEXT: add a5, a1, a2
+; ZVBB-RV64-NEXT: vmv1r.v v10, v22
; ZVBB-RV64-NEXT: add a3, a0, a2
-; ZVBB-RV64-NEXT: vmv1r.v v2, v10
-; ZVBB-RV64-NEXT: add a4, a1, a2
-; ZVBB-RV64-NEXT: slli a5, a2, 2
-; ZVBB-RV64-NEXT: vmv1r.v v4, v14
-; ZVBB-RV64-NEXT: slli a6, a2, 4
-; ZVBB-RV64-NEXT: add a7, a4, a2
-; ZVBB-RV64-NEXT: vmv1r.v v6, v18
-; ZVBB-RV64-NEXT: sub a5, a6, a5
-; ZVBB-RV64-NEXT: vmv1r.v v22, v11
-; ZVBB-RV64-NEXT: add a6, a7, a2
-; ZVBB-RV64-NEXT: vmv1r.v v24, v15
-; ZVBB-RV64-NEXT: vsseg7e32.v v1, (a0)
-; ZVBB-RV64-NEXT: vmv1r.v v26, v19
-; ZVBB-RV64-NEXT: vsseg7e32.v v21, (a1)
-; ZVBB-RV64-NEXT: vl1re32.v v18, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re32.v v19, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re32.v v20, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re32.v v21, (a6)
-; ZVBB-RV64-NEXT: add a6, a3, a2
-; ZVBB-RV64-NEXT: vl1re32.v v10, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re32.v v11, (a6)
-; ZVBB-RV64-NEXT: vl1re32.v v8, (a0)
-; ZVBB-RV64-NEXT: vl1re32.v v16, (a4)
-; ZVBB-RV64-NEXT: vl1re32.v v9, (a3)
-; ZVBB-RV64-NEXT: vl1re32.v v17, (a7)
-; ZVBB-RV64-NEXT: csrr a0, vlenb
-; ZVBB-RV64-NEXT: li a3, 14
-; ZVBB-RV64-NEXT: mul a0, a0, a3
-; ZVBB-RV64-NEXT: add a0, sp, a0
-; ZVBB-RV64-NEXT: addi a0, a0, 64
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re32.v v12, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re32.v v13, (a6)
+; ZVBB-RV64-NEXT: vmv1r.v v11, v20
+; ZVBB-RV64-NEXT: add a4, a3, a2
+; ZVBB-RV64-NEXT: vmv1r.v v12, v16
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vmv1r.v v13, v18
+; ZVBB-RV64-NEXT: vsseg6e64.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re64.v v14, (a1)
+; ZVBB-RV64-NEXT: add a1, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v15, (a5)
+; ZVBB-RV64-NEXT: add a5, a1, a2
+; ZVBB-RV64-NEXT: vl1re64.v v18, (a5)
+; ZVBB-RV64-NEXT: add a5, a5, a2
+; ZVBB-RV64-NEXT: vl1re64.v v19, (a5)
+; ZVBB-RV64-NEXT: add a5, a4, a2
+; ZVBB-RV64-NEXT: vl1re64.v v16, (a6)
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vl1re64.v v12, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v13, (a6)
+; ZVBB-RV64-NEXT: csrr a6, vlenb
+; ZVBB-RV64-NEXT: li a7, 12
+; ZVBB-RV64-NEXT: mul a6, a6, a7
+; ZVBB-RV64-NEXT: add a6, sp, a6
+; ZVBB-RV64-NEXT: addi a6, a6, 64
+; ZVBB-RV64-NEXT: vl1re64.v v17, (a1)
+; ZVBB-RV64-NEXT: vl1re64.v v10, (a4)
+; ZVBB-RV64-NEXT: vl1re64.v v11, (a5)
+; ZVBB-RV64-NEXT: vl1re64.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re64.v v9, (a3)
; ZVBB-RV64-NEXT: slli a2, a2, 3
-; ZVBB-RV64-NEXT: add a2, a0, a2
-; ZVBB-RV64-NEXT: vl1re32.v v14, (a6)
-; ZVBB-RV64-NEXT: vl1re32.v v15, (a1)
-; ZVBB-RV64-NEXT: add a5, a0, a5
-; ZVBB-RV64-NEXT: vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT: add a2, a6, a2
; ZVBB-RV64-NEXT: vs4r.v v16, (a2)
-; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT: vl8re32.v v16, (a2)
-; ZVBB-RV64-NEXT: vl8re32.v v8, (a0)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a6)
+; ZVBB-RV64-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re64.v v8, (a6)
; ZVBB-RV64-NEXT: addi sp, s0, -80
; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: addi sp, sp, 80
; ZVBB-RV64-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; ZIP-LABEL: vector_interleave_nxv12i64_nxv2i64:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -80
; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; ZIP-NEXT: addi s0, sp, 80
; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: slli a0, a0, 5
+; ZIP-NEXT: li a1, 28
+; ZIP-NEXT: mul a0, a0, a1
; ZIP-NEXT: sub sp, sp, a0
; ZIP-NEXT: andi sp, sp, -64
-; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; ZIP-NEXT: vmv2r.v v26, v20
-; ZIP-NEXT: addi a0, sp, 64
-; ZIP-NEXT: vmv2r.v v24, v16
+; ZIP-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZIP-NEXT: vmv2r.v v20, v14
+; ZIP-NEXT: vmv2r.v v22, v12
+; ZIP-NEXT: vmv2r.v v24, v10
; ZIP-NEXT: csrr a1, vlenb
-; ZIP-NEXT: slli a2, a1, 3
-; ZIP-NEXT: sub a1, a2, a1
+; ZIP-NEXT: li a0, 6
+; ZIP-NEXT: mul a1, a1, a0
; ZIP-NEXT: add a1, sp, a1
; ZIP-NEXT: addi a1, a1, 64
-; ZIP-NEXT: vmv2r.v v22, v12
+; ZIP-NEXT: vmv1r.v v10, v25
+; ZIP-NEXT: vmv1r.v v11, v23
+; ZIP-NEXT: vmv1r.v v12, v21
+; ZIP-NEXT: addi a0, sp, 64
+; ZIP-NEXT: vmv1r.v v13, v17
; ZIP-NEXT: csrr a2, vlenb
-; ZIP-NEXT: vmv2r.v v20, v8
-; ZIP-NEXT: vmv1r.v v1, v20
-; ZIP-NEXT: vmv1r.v v3, v22
-; ZIP-NEXT: vmv1r.v v5, v24
-; ZIP-NEXT: vmv1r.v v7, v26
+; ZIP-NEXT: vmv1r.v v14, v19
+; ZIP-NEXT: vsseg6e64.v v9, (a1)
+; ZIP-NEXT: vmv1r.v v9, v24
+; ZIP-NEXT: add a5, a1, a2
+; ZIP-NEXT: vmv1r.v v10, v22
; ZIP-NEXT: add a3, a0, a2
-; ZIP-NEXT: vmv1r.v v2, v10
-; ZIP-NEXT: add a4, a1, a2
-; ZIP-NEXT: slli a5, a2, 2
-; ZIP-NEXT: vmv1r.v v4, v14
-; ZIP-NEXT: slli a6, a2, 4
-; ZIP-NEXT: add a7, a4, a2
-; ZIP-NEXT: vmv1r.v v6, v18
-; ZIP-NEXT: sub a5, a6, a5
-; ZIP-NEXT: vmv1r.v v22, v11
-; ZIP-NEXT: add a6, a7, a2
-; ZIP-NEXT: vmv1r.v v24, v15
-; ZIP-NEXT: vsseg7e32.v v1, (a0)
-; ZIP-NEXT: vmv1r.v v26, v19
-; ZIP-NEXT: vsseg7e32.v v21, (a1)
-; ZIP-NEXT: vl1re32.v v18, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re32.v v19, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re32.v v20, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re32.v v21, (a6)
-; ZIP-NEXT: add a6, a3, a2
-; ZIP-NEXT: vl1re32.v v10, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re32.v v11, (a6)
-; ZIP-NEXT: vl1re32.v v8, (a0)
-; ZIP-NEXT: vl1re32.v v16, (a4)
-; ZIP-NEXT: vl1re32.v v9, (a3)
-; ZIP-NEXT: vl1re32.v v17, (a7)
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: li a3, 14
-; ZIP-NEXT: mul a0, a0, a3
-; ZIP-NEXT: add a0, sp, a0
-; ZIP-NEXT: addi a0, a0, 64
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re32.v v12, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re32.v v13, (a6)
+; ZIP-NEXT: vmv1r.v v11, v20
+; ZIP-NEXT: add a4, a3, a2
+; ZIP-NEXT: vmv1r.v v12, v16
+; ZIP-NEXT: add a6, a5, a2
+; ZIP-NEXT: vmv1r.v v13, v18
+; ZIP-NEXT: vsseg6e64.v v8, (a0)
+; ZIP-NEXT: vl1re64.v v14, (a1)
+; ZIP-NEXT: add a1, a6, a2
+; ZIP-NEXT: vl1re64.v v15, (a5)
+; ZIP-NEXT: add a5, a1, a2
+; ZIP-NEXT: vl1re64.v v18, (a5)
+; ZIP-NEXT: add a5, a5, a2
+; ZIP-NEXT: vl1re64.v v19, (a5)
+; ZIP-NEXT: add a5, a4, a2
+; ZIP-NEXT: vl1re64.v v16, (a6)
+; ZIP-NEXT: add a6, a5, a2
+; ZIP-NEXT: vl1re64.v v12, (a6)
; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re64.v v13, (a6)
+; ZIP-NEXT: csrr a6, vlenb
+; ZIP-NEXT: li a7, 12
+; ZIP-NEXT: mul a6, a6, a7
+; ZIP-NEXT: add a6, sp, a6
+; ZIP-NEXT: addi a6, a6, 64
+; ZIP-NEXT: vl1re64.v v17, (a1)
+; ZIP-NEXT: vl1re64.v v10, (a4)
+; ZIP-NEXT: vl1re64.v v11, (a5)
+; ZIP-NEXT: vl1re64.v v8, (a0)
+; ZIP-NEXT: vl1re64.v v9, (a3)
; ZIP-NEXT: slli a2, a2, 3
-; ZIP-NEXT: add a2, a0, a2
-; ZIP-NEXT: vl1re32.v v14, (a6)
-; ZIP-NEXT: vl1re32.v v15, (a1)
-; ZIP-NEXT: add a5, a0, a5
-; ZIP-NEXT: vs2r.v v20, (a5)
+; ZIP-NEXT: add a2, a6, a2
; ZIP-NEXT: vs4r.v v16, (a2)
-; ZIP-NEXT: vs8r.v v8, (a0)
-; ZIP-NEXT: vl8re32.v v16, (a2)
-; ZIP-NEXT: vl8re32.v v8, (a0)
+; ZIP-NEXT: vs8r.v v8, (a6)
+; ZIP-NEXT: vl8re64.v v16, (a2)
+; ZIP-NEXT: vl8re64.v v8, (a6)
; ZIP-NEXT: addi sp, s0, -80
; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZIP-NEXT: addi sp, sp, 80
; ZIP-NEXT: ret
- %res = call <vscale x 28 x i32> @llvm.vector.interleave7.nxv28i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f, <vscale x 4 x i32> %g)
- ret <vscale x 28 x i32> %res
+ %res = call <vscale x 12 x i64> @llvm.vector.interleave6.nxv12i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f)
+ ret <vscale x 12 x i64> %res
}
-define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g) nounwind {
+define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f, <vscale x 16 x i1> %g) nounwind {
+; CHECK-LABEL: vector_interleave_nxv112i1_nxv16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 14
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v14, 0
+; CHECK-NEXT: addi a4, sp, 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 3
+; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: add a0, sp, a0
+; CHECK-NEXT: addi a0, a0, 16
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: vmerge.vim v16, v14, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v22, v14, 1, v0
+; CHECK-NEXT: add a3, a4, a2
+; CHECK-NEXT: srli a1, a2, 2
+; CHECK-NEXT: add a5, a0, a2
+; CHECK-NEXT: vmv4r.v v24, v16
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v18, v14, 1, v0
+; CHECK-NEXT: add a6, a3, a2
+; CHECK-NEXT: vmv1r.v v25, v22
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v8, v14, 1, v0
+; CHECK-NEXT: vmv1r.v v26, v18
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vmerge.vim v20, v14, 1, v0
+; CHECK-NEXT: vmv1r.v v27, v8
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vim v10, v14, 1, v0
+; CHECK-NEXT: vmv1r.v v28, v20
+; CHECK-NEXT: vmv1r.v v18, v23
+; CHECK-NEXT: add a7, a6, a2
+; CHECK-NEXT: vmv1r.v v29, v10
+; CHECK-NEXT: vmv1r.v v20, v9
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: vmerge.vim v30, v14, 1, v0
+; CHECK-NEXT: vmv1r.v v22, v11
+; CHECK-NEXT: vsetvli t0, zero, e8, m1, ta, ma
+; CHECK-NEXT: vsseg7e8.v v24, (a4)
+; CHECK-NEXT: vmv1r.v v23, v31
+; CHECK-NEXT: vsseg7e8.v v17, (a0)
+; CHECK-NEXT: vl1r.v v8, (a6)
+; CHECK-NEXT: add a6, a7, a2
+; CHECK-NEXT: vl1r.v v10, (a4)
+; CHECK-NEXT: add a4, a6, a2
+; CHECK-NEXT: vl1r.v v12, (a6)
+; CHECK-NEXT: add a6, a4, a2
+; CHECK-NEXT: vl1r.v v14, (a6)
+; CHECK-NEXT: add a6, a5, a2
+; CHECK-NEXT: vl1r.v v16, (a5)
+; CHECK-NEXT: add a5, a6, a2
+; CHECK-NEXT: vl1r.v v18, (a5)
+; CHECK-NEXT: add a5, a5, a2
+; CHECK-NEXT: vl1r.v v9, (a7)
+; CHECK-NEXT: add a7, a5, a2
+; CHECK-NEXT: vl1r.v v20, (a7)
+; CHECK-NEXT: add a7, a7, a2
+; CHECK-NEXT: srli a2, a2, 1
+; CHECK-NEXT: vl1r.v v11, (a3)
+; CHECK-NEXT: add a3, a1, a1
+; CHECK-NEXT: vl1r.v v13, (a4)
+; CHECK-NEXT: add a4, a2, a2
+; CHECK-NEXT: vl1r.v v15, (a0)
+; CHECK-NEXT: vl1r.v v19, (a5)
+; CHECK-NEXT: vl1r.v v17, (a6)
+; CHECK-NEXT: vl1r.v v21, (a7)
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmsne.vi v22, v8, 0
+; CHECK-NEXT: vmsne.vi v0, v10, 0
+; CHECK-NEXT: vmsne.vi v9, v12, 0
+; CHECK-NEXT: vmsne.vi v10, v14, 0
+; CHECK-NEXT: vmsne.vi v11, v18, 0
+; CHECK-NEXT: vmsne.vi v8, v16, 0
+; CHECK-NEXT: vmsne.vi v12, v20, 0
+; CHECK-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v0, v22, a1
+; CHECK-NEXT: vslideup.vx v9, v10, a1
+; CHECK-NEXT: vslideup.vx v8, v11, a1
+; CHECK-NEXT: vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v0, v9, a2
+; CHECK-NEXT: vslideup.vx v8, v12, a2
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 14
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; RV32-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZVBB-LABEL: vector_interleave_nxv112i1_nxv16i1:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 14
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT: vmv.v.i v14, 0
+; ZVBB-NEXT: addi a4, sp, 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 3
+; ZVBB-NEXT: sub a0, a1, a0
+; ZVBB-NEXT: add a0, sp, a0
+; ZVBB-NEXT: addi a0, a0, 16
+; ZVBB-NEXT: csrr a2, vlenb
+; ZVBB-NEXT: vmerge.vim v16, v14, 1, v0
+; ZVBB-NEXT: vmv1r.v v0, v8
+; ZVBB-NEXT: vmerge.vim v22, v14, 1, v0
+; ZVBB-NEXT: add a3, a4, a2
+; ZVBB-NEXT: srli a1, a2, 2
+; ZVBB-NEXT: add a5, a0, a2
+; ZVBB-NEXT: vmv4r.v v24, v16
+; ZVBB-NEXT: vmv1r.v v0, v9
+; ZVBB-NEXT: vmerge.vim v18, v14, 1, v0
+; ZVBB-NEXT: add a6, a3, a2
+; ZVBB-NEXT: vmv1r.v v25, v22
+; ZVBB-NEXT: vmv1r.v v0, v10
+; ZVBB-NEXT: vmerge.vim v8, v14, 1, v0
+; ZVBB-NEXT: vmv1r.v v26, v18
+; ZVBB-NEXT: vmv1r.v v0, v11
+; ZVBB-NEXT: vmerge.vim v20, v14, 1, v0
+; ZVBB-NEXT: vmv1r.v v27, v8
+; ZVBB-NEXT: vmv1r.v v0, v12
+; ZVBB-NEXT: vmerge.vim v10, v14, 1, v0
+; ZVBB-NEXT: vmv1r.v v28, v20
+; ZVBB-NEXT: vmv1r.v v18, v23
+; ZVBB-NEXT: add a7, a6, a2
+; ZVBB-NEXT: vmv1r.v v29, v10
+; ZVBB-NEXT: vmv1r.v v20, v9
+; ZVBB-NEXT: vmv1r.v v0, v13
+; ZVBB-NEXT: vmerge.vim v30, v14, 1, v0
+; ZVBB-NEXT: vmv1r.v v22, v11
+; ZVBB-NEXT: vsetvli t0, zero, e8, m1, ta, ma
+; ZVBB-NEXT: vsseg7e8.v v24, (a4)
+; ZVBB-NEXT: vmv1r.v v23, v31
+; ZVBB-NEXT: vsseg7e8.v v17, (a0)
+; ZVBB-NEXT: vl1r.v v8, (a6)
+; ZVBB-NEXT: add a6, a7, a2
+; ZVBB-NEXT: vl1r.v v10, (a4)
+; ZVBB-NEXT: add a4, a6, a2
+; ZVBB-NEXT: vl1r.v v12, (a6)
+; ZVBB-NEXT: add a6, a4, a2
+; ZVBB-NEXT: vl1r.v v14, (a6)
+; ZVBB-NEXT: add a6, a5, a2
+; ZVBB-NEXT: vl1r.v v16, (a5)
+; ZVBB-NEXT: add a5, a6, a2
+; ZVBB-NEXT: vl1r.v v18, (a5)
+; ZVBB-NEXT: add a5, a5, a2
+; ZVBB-NEXT: vl1r.v v9, (a7)
+; ZVBB-NEXT: add a7, a5, a2
+; ZVBB-NEXT: vl1r.v v20, (a7)
+; ZVBB-NEXT: add a7, a7, a2
+; ZVBB-NEXT: srli a2, a2, 1
+; ZVBB-NEXT: vl1r.v v11, (a3)
+; ZVBB-NEXT: add a3, a1, a1
+; ZVBB-NEXT: vl1r.v v13, (a4)
+; ZVBB-NEXT: add a4, a2, a2
+; ZVBB-NEXT: vl1r.v v15, (a0)
+; ZVBB-NEXT: vl1r.v v19, (a5)
+; ZVBB-NEXT: vl1r.v v17, (a6)
+; ZVBB-NEXT: vl1r.v v21, (a7)
+; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT: vmsne.vi v22, v8, 0
+; ZVBB-NEXT: vmsne.vi v0, v10, 0
+; ZVBB-NEXT: vmsne.vi v9, v12, 0
+; ZVBB-NEXT: vmsne.vi v10, v14, 0
+; ZVBB-NEXT: vmsne.vi v11, v18, 0
+; ZVBB-NEXT: vmsne.vi v8, v16, 0
+; ZVBB-NEXT: vmsne.vi v12, v20, 0
+; ZVBB-NEXT: vsetvli zero, a3, e8, mf2, ta, ma
+; ZVBB-NEXT: vslideup.vx v0, v22, a1
+; ZVBB-NEXT: vslideup.vx v9, v10, a1
+; ZVBB-NEXT: vslideup.vx v8, v11, a1
+; ZVBB-NEXT: vsetvli zero, a4, e8, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v0, v9, a2
+; ZVBB-NEXT: vslideup.vx v8, v12, a2
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 14
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 112 x i1> @llvm.vector.interleave7.nxv112i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f, <vscale x 16 x i1> %g)
+ ret <vscale x 112 x i1> %res
+}
+
+
+define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f, <vscale x 16 x i8> %g) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv112i8_nxv16i8:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -80
; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
@@ -3461,7 +3871,7 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: sub sp, sp, a0
; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; RV32-NEXT: vmv2r.v v26, v20
; RV32-NEXT: addi a0, sp, 64
; RV32-NEXT: vmv2r.v v24, v16
@@ -3489,51 +3899,51 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
; RV32-NEXT: vmv1r.v v22, v11
; RV32-NEXT: add a6, a7, a2
; RV32-NEXT: vmv1r.v v24, v15
-; RV32-NEXT: vsseg7e64.v v1, (a0)
+; RV32-NEXT: vsseg7e8.v v1, (a0)
; RV32-NEXT: vmv1r.v v26, v19
-; RV32-NEXT: vsseg7e64.v v21, (a1)
-; RV32-NEXT: vl1re64.v v18, (a6)
+; RV32-NEXT: vsseg7e8.v v21, (a1)
+; RV32-NEXT: vl1r.v v18, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re64.v v19, (a6)
+; RV32-NEXT: vl1r.v v19, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re64.v v20, (a6)
+; RV32-NEXT: vl1r.v v20, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re64.v v21, (a6)
+; RV32-NEXT: vl1r.v v21, (a6)
; RV32-NEXT: add a6, a3, a2
-; RV32-NEXT: vl1re64.v v10, (a6)
+; RV32-NEXT: vl1r.v v10, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re64.v v11, (a6)
-; RV32-NEXT: vl1re64.v v8, (a0)
-; RV32-NEXT: vl1re64.v v16, (a4)
-; RV32-NEXT: vl1re64.v v9, (a3)
-; RV32-NEXT: vl1re64.v v17, (a7)
+; RV32-NEXT: vl1r.v v11, (a6)
+; RV32-NEXT: vl1r.v v8, (a0)
+; RV32-NEXT: vl1r.v v16, (a4)
+; RV32-NEXT: vl1r.v v9, (a3)
+; RV32-NEXT: vl1r.v v17, (a7)
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a3, 14
; RV32-NEXT: mul a0, a0, a3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 64
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re64.v v12, (a6)
+; RV32-NEXT: vl1r.v v12, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re64.v v13, (a6)
+; RV32-NEXT: vl1r.v v13, (a6)
; RV32-NEXT: add a6, a6, a2
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, a0, a2
-; RV32-NEXT: vl1re64.v v14, (a6)
-; RV32-NEXT: vl1re64.v v15, (a1)
+; RV32-NEXT: vl1r.v v14, (a6)
+; RV32-NEXT: vl1r.v v15, (a1)
; RV32-NEXT: add a5, a0, a5
; RV32-NEXT: vs2r.v v20, (a5)
; RV32-NEXT: vs4r.v v16, (a2)
; RV32-NEXT: vs8r.v v8, (a0)
-; RV32-NEXT: vl8re64.v v16, (a2)
-; RV32-NEXT: vl8re64.v v8, (a0)
+; RV32-NEXT: vl8r.v v16, (a2)
+; RV32-NEXT: vl8r.v v8, (a0)
; RV32-NEXT: addi sp, s0, -80
; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 80
; RV32-NEXT: ret
;
-; RV64-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; RV64-LABEL: vector_interleave_nxv112i8_nxv16i8:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -80
; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -3543,7 +3953,7 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
; RV64-NEXT: slli a0, a0, 5
; RV64-NEXT: sub sp, sp, a0
; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; RV64-NEXT: vmv2r.v v26, v20
; RV64-NEXT: addi a0, sp, 64
; RV64-NEXT: vmv2r.v v24, v16
@@ -3571,51 +3981,51 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
; RV64-NEXT: vmv1r.v v22, v11
; RV64-NEXT: add a6, a7, a2
; RV64-NEXT: vmv1r.v v24, v15
-; RV64-NEXT: vsseg7e64.v v1, (a0)
+; RV64-NEXT: vsseg7e8.v v1, (a0)
; RV64-NEXT: vmv1r.v v26, v19
-; RV64-NEXT: vsseg7e64.v v21, (a1)
-; RV64-NEXT: vl1re64.v v18, (a6)
+; RV64-NEXT: vsseg7e8.v v21, (a1)
+; RV64-NEXT: vl1r.v v18, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re64.v v19, (a6)
+; RV64-NEXT: vl1r.v v19, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re64.v v20, (a6)
+; RV64-NEXT: vl1r.v v20, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re64.v v21, (a6)
+; RV64-NEXT: vl1r.v v21, (a6)
; RV64-NEXT: add a6, a3, a2
-; RV64-NEXT: vl1re64.v v10, (a6)
+; RV64-NEXT: vl1r.v v10, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re64.v v11, (a6)
-; RV64-NEXT: vl1re64.v v8, (a0)
-; RV64-NEXT: vl1re64.v v16, (a4)
-; RV64-NEXT: vl1re64.v v9, (a3)
-; RV64-NEXT: vl1re64.v v17, (a7)
+; RV64-NEXT: vl1r.v v11, (a6)
+; RV64-NEXT: vl1r.v v8, (a0)
+; RV64-NEXT: vl1r.v v16, (a4)
+; RV64-NEXT: vl1r.v v9, (a3)
+; RV64-NEXT: vl1r.v v17, (a7)
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: li a3, 14
; RV64-NEXT: mul a0, a0, a3
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 64
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re64.v v12, (a6)
+; RV64-NEXT: vl1r.v v12, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re64.v v13, (a6)
+; RV64-NEXT: vl1r.v v13, (a6)
; RV64-NEXT: add a6, a6, a2
; RV64-NEXT: slli a2, a2, 3
; RV64-NEXT: add a2, a0, a2
-; RV64-NEXT: vl1re64.v v14, (a6)
-; RV64-NEXT: vl1re64.v v15, (a1)
+; RV64-NEXT: vl1r.v v14, (a6)
+; RV64-NEXT: vl1r.v v15, (a1)
; RV64-NEXT: add a5, a0, a5
; RV64-NEXT: vs2r.v v20, (a5)
; RV64-NEXT: vs4r.v v16, (a2)
; RV64-NEXT: vs8r.v v8, (a0)
-; RV64-NEXT: vl8re64.v v16, (a2)
-; RV64-NEXT: vl8re64.v v8, (a0)
+; RV64-NEXT: vl8r.v v16, (a2)
+; RV64-NEXT: vl8r.v v8, (a0)
; RV64-NEXT: addi sp, s0, -80
; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: ret
;
-; ZVBB-RV32-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZVBB-RV32-LABEL: vector_interleave_nxv112i8_nxv16i8:
; ZVBB-RV32: # %bb.0:
; ZVBB-RV32-NEXT: addi sp, sp, -80
; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
@@ -3625,7 +4035,7 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
; ZVBB-RV32-NEXT: slli a0, a0, 5
; ZVBB-RV32-NEXT: sub sp, sp, a0
; ZVBB-RV32-NEXT: andi sp, sp, -64
-; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; ZVBB-RV32-NEXT: vmv2r.v v26, v20
; ZVBB-RV32-NEXT: addi a0, sp, 64
; ZVBB-RV32-NEXT: vmv2r.v v24, v16
@@ -3653,51 +4063,51 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
; ZVBB-RV32-NEXT: vmv1r.v v22, v11
; ZVBB-RV32-NEXT: add a6, a7, a2
; ZVBB-RV32-NEXT: vmv1r.v v24, v15
-; ZVBB-RV32-NEXT: vsseg7e64.v v1, (a0)
+; ZVBB-RV32-NEXT: vsseg7e8.v v1, (a0)
; ZVBB-RV32-NEXT: vmv1r.v v26, v19
-; ZVBB-RV32-NEXT: vsseg7e64.v v21, (a1)
-; ZVBB-RV32-NEXT: vl1re64.v v18, (a6)
+; ZVBB-RV32-NEXT: vsseg7e8.v v21, (a1)
+; ZVBB-RV32-NEXT: vl1r.v v18, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re64.v v19, (a6)
+; ZVBB-RV32-NEXT: vl1r.v v19, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re64.v v20, (a6)
+; ZVBB-RV32-NEXT: vl1r.v v20, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re64.v v21, (a6)
+; ZVBB-RV32-NEXT: vl1r.v v21, (a6)
; ZVBB-RV32-NEXT: add a6, a3, a2
-; ZVBB-RV32-NEXT: vl1re64.v v10, (a6)
+; ZVBB-RV32-NEXT: vl1r.v v10, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re64.v v11, (a6)
-; ZVBB-RV32-NEXT: vl1re64.v v8, (a0)
-; ZVBB-RV32-NEXT: vl1re64.v v16, (a4)
-; ZVBB-RV32-NEXT: vl1re64.v v9, (a3)
-; ZVBB-RV32-NEXT: vl1re64.v v17, (a7)
+; ZVBB-RV32-NEXT: vl1r.v v11, (a6)
+; ZVBB-RV32-NEXT: vl1r.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1r.v v16, (a4)
+; ZVBB-RV32-NEXT: vl1r.v v9, (a3)
+; ZVBB-RV32-NEXT: vl1r.v v17, (a7)
; ZVBB-RV32-NEXT: csrr a0, vlenb
; ZVBB-RV32-NEXT: li a3, 14
; ZVBB-RV32-NEXT: mul a0, a0, a3
; ZVBB-RV32-NEXT: add a0, sp, a0
; ZVBB-RV32-NEXT: addi a0, a0, 64
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re64.v v12, (a6)
+; ZVBB-RV32-NEXT: vl1r.v v12, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re64.v v13, (a6)
+; ZVBB-RV32-NEXT: vl1r.v v13, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
; ZVBB-RV32-NEXT: slli a2, a2, 3
; ZVBB-RV32-NEXT: add a2, a0, a2
-; ZVBB-RV32-NEXT: vl1re64.v v14, (a6)
-; ZVBB-RV32-NEXT: vl1re64.v v15, (a1)
+; ZVBB-RV32-NEXT: vl1r.v v14, (a6)
+; ZVBB-RV32-NEXT: vl1r.v v15, (a1)
; ZVBB-RV32-NEXT: add a5, a0, a5
; ZVBB-RV32-NEXT: vs2r.v v20, (a5)
; ZVBB-RV32-NEXT: vs4r.v v16, (a2)
; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT: vl8re64.v v16, (a2)
-; ZVBB-RV32-NEXT: vl8re64.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8r.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8r.v v8, (a0)
; ZVBB-RV32-NEXT: addi sp, s0, -80
; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: addi sp, sp, 80
; ZVBB-RV32-NEXT: ret
;
-; ZVBB-RV64-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZVBB-RV64-LABEL: vector_interleave_nxv112i8_nxv16i8:
; ZVBB-RV64: # %bb.0:
; ZVBB-RV64-NEXT: addi sp, sp, -80
; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -3707,7 +4117,7 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
; ZVBB-RV64-NEXT: slli a0, a0, 5
; ZVBB-RV64-NEXT: sub sp, sp, a0
; ZVBB-RV64-NEXT: andi sp, sp, -64
-; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; ZVBB-RV64-NEXT: vmv2r.v v26, v20
; ZVBB-RV64-NEXT: addi a0, sp, 64
; ZVBB-RV64-NEXT: vmv2r.v v24, v16
@@ -3735,51 +4145,51 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
; ZVBB-RV64-NEXT: vmv1r.v v22, v11
; ZVBB-RV64-NEXT: add a6, a7, a2
; ZVBB-RV64-NEXT: vmv1r.v v24, v15
-; ZVBB-RV64-NEXT: vsseg7e64.v v1, (a0)
+; ZVBB-RV64-NEXT: vsseg7e8.v v1, (a0)
; ZVBB-RV64-NEXT: vmv1r.v v26, v19
-; ZVBB-RV64-NEXT: vsseg7e64.v v21, (a1)
-; ZVBB-RV64-NEXT: vl1re64.v v18, (a6)
+; ZVBB-RV64-NEXT: vsseg7e8.v v21, (a1)
+; ZVBB-RV64-NEXT: vl1r.v v18, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re64.v v19, (a6)
+; ZVBB-RV64-NEXT: vl1r.v v19, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re64.v v20, (a6)
+; ZVBB-RV64-NEXT: vl1r.v v20, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re64.v v21, (a6)
+; ZVBB-RV64-NEXT: vl1r.v v21, (a6)
; ZVBB-RV64-NEXT: add a6, a3, a2
-; ZVBB-RV64-NEXT: vl1re64.v v10, (a6)
+; ZVBB-RV64-NEXT: vl1r.v v10, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re64.v v11, (a6)
-; ZVBB-RV64-NEXT: vl1re64.v v8, (a0)
-; ZVBB-RV64-NEXT: vl1re64.v v16, (a4)
-; ZVBB-RV64-NEXT: vl1re64.v v9, (a3)
-; ZVBB-RV64-NEXT: vl1re64.v v17, (a7)
+; ZVBB-RV64-NEXT: vl1r.v v11, (a6)
+; ZVBB-RV64-NEXT: vl1r.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1r.v v16, (a4)
+; ZVBB-RV64-NEXT: vl1r.v v9, (a3)
+; ZVBB-RV64-NEXT: vl1r.v v17, (a7)
; ZVBB-RV64-NEXT: csrr a0, vlenb
; ZVBB-RV64-NEXT: li a3, 14
; ZVBB-RV64-NEXT: mul a0, a0, a3
; ZVBB-RV64-NEXT: add a0, sp, a0
; ZVBB-RV64-NEXT: addi a0, a0, 64
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re64.v v12, (a6)
+; ZVBB-RV64-NEXT: vl1r.v v12, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re64.v v13, (a6)
+; ZVBB-RV64-NEXT: vl1r.v v13, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
; ZVBB-RV64-NEXT: slli a2, a2, 3
; ZVBB-RV64-NEXT: add a2, a0, a2
-; ZVBB-RV64-NEXT: vl1re64.v v14, (a6)
-; ZVBB-RV64-NEXT: vl1re64.v v15, (a1)
+; ZVBB-RV64-NEXT: vl1r.v v14, (a6)
+; ZVBB-RV64-NEXT: vl1r.v v15, (a1)
; ZVBB-RV64-NEXT: add a5, a0, a5
; ZVBB-RV64-NEXT: vs2r.v v20, (a5)
; ZVBB-RV64-NEXT: vs4r.v v16, (a2)
; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT: vl8re64.v v16, (a2)
-; ZVBB-RV64-NEXT: vl8re64.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8r.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8r.v v8, (a0)
; ZVBB-RV64-NEXT: addi sp, s0, -80
; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: addi sp, sp, 80
; ZVBB-RV64-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZIP-LABEL: vector_interleave_nxv112i8_nxv16i8:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -80
; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -3789,7 +4199,7 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
; ZIP-NEXT: slli a0, a0, 5
; ZIP-NEXT: sub sp, sp, a0
; ZIP-NEXT: andi sp, sp, -64
-; ZIP-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZIP-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; ZIP-NEXT: vmv2r.v v26, v20
; ZIP-NEXT: addi a0, sp, 64
; ZIP-NEXT: vmv2r.v v24, v16
@@ -3817,938 +4227,5556 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
; ZIP-NEXT: vmv1r.v v22, v11
; ZIP-NEXT: add a6, a7, a2
; ZIP-NEXT: vmv1r.v v24, v15
-; ZIP-NEXT: vsseg7e64.v v1, (a0)
+; ZIP-NEXT: vsseg7e8.v v1, (a0)
; ZIP-NEXT: vmv1r.v v26, v19
-; ZIP-NEXT: vsseg7e64.v v21, (a1)
-; ZIP-NEXT: vl1re64.v v18, (a6)
+; ZIP-NEXT: vsseg7e8.v v21, (a1)
+; ZIP-NEXT: vl1r.v v18, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re64.v v19, (a6)
+; ZIP-NEXT: vl1r.v v19, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re64.v v20, (a6)
+; ZIP-NEXT: vl1r.v v20, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re64.v v21, (a6)
+; ZIP-NEXT: vl1r.v v21, (a6)
; ZIP-NEXT: add a6, a3, a2
-; ZIP-NEXT: vl1re64.v v10, (a6)
+; ZIP-NEXT: vl1r.v v10, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re64.v v11, (a6)
-; ZIP-NEXT: vl1re64.v v8, (a0)
-; ZIP-NEXT: vl1re64.v v16, (a4)
-; ZIP-NEXT: vl1re64.v v9, (a3)
-; ZIP-NEXT: vl1re64.v v17, (a7)
+; ZIP-NEXT: vl1r.v v11, (a6)
+; ZIP-NEXT: vl1r.v v8, (a0)
+; ZIP-NEXT: vl1r.v v16, (a4)
+; ZIP-NEXT: vl1r.v v9, (a3)
+; ZIP-NEXT: vl1r.v v17, (a7)
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: li a3, 14
; ZIP-NEXT: mul a0, a0, a3
; ZIP-NEXT: add a0, sp, a0
; ZIP-NEXT: addi a0, a0, 64
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re64.v v12, (a6)
+; ZIP-NEXT: vl1r.v v12, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re64.v v13, (a6)
+; ZIP-NEXT: vl1r.v v13, (a6)
; ZIP-NEXT: add a6, a6, a2
; ZIP-NEXT: slli a2, a2, 3
; ZIP-NEXT: add a2, a0, a2
-; ZIP-NEXT: vl1re64.v v14, (a6)
-; ZIP-NEXT: vl1re64.v v15, (a1)
+; ZIP-NEXT: vl1r.v v14, (a6)
+; ZIP-NEXT: vl1r.v v15, (a1)
; ZIP-NEXT: add a5, a0, a5
; ZIP-NEXT: vs2r.v v20, (a5)
; ZIP-NEXT: vs4r.v v16, (a2)
; ZIP-NEXT: vs8r.v v8, (a0)
-; ZIP-NEXT: vl8re64.v v16, (a2)
-; ZIP-NEXT: vl8re64.v v8, (a0)
+; ZIP-NEXT: vl8r.v v16, (a2)
+; ZIP-NEXT: vl8r.v v8, (a0)
; ZIP-NEXT: addi sp, s0, -80
; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZIP-NEXT: addi sp, sp, 80
; ZIP-NEXT: ret
- %res = call <vscale x 14 x i64> @llvm.vector.interleave7.nxv14i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g)
- ret <vscale x 14 x i64> %res
-}
-
-; Floats
-
-define <vscale x 4 x bfloat> @vector_interleave_nxv4bf16_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
-; V-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
-; V: # %bb.0:
-; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; V-NEXT: vwaddu.vv v10, v8, v9
-; V-NEXT: li a0, -1
-; V-NEXT: csrr a1, vlenb
-; V-NEXT: vwmaccu.vx v10, a0, v9
-; V-NEXT: srli a1, a1, 2
-; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; V-NEXT: vslidedown.vx v8, v10, a1
-; V-NEXT: add a0, a1, a1
-; V-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; V-NEXT: vslideup.vx v10, v8, a1
-; V-NEXT: vmv.v.v v8, v10
-; V-NEXT: ret
-;
-; ZVBB-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
-; ZVBB: # %bb.0:
-; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; ZVBB-NEXT: vwsll.vi v10, v9, 16
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: vwaddu.wv v10, v10, v8
-; ZVBB-NEXT: srli a0, a0, 2
-; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; ZVBB-NEXT: vslidedown.vx v8, v10, a0
-; ZVBB-NEXT: add a1, a0, a0
-; ZVBB-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; ZVBB-NEXT: vslideup.vx v10, v8, a0
-; ZVBB-NEXT: vmv.v.v v8, v10
-; ZVBB-NEXT: ret
-;
-; ZIP-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
-; ZIP: # %bb.0:
-; ZIP-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; ZIP-NEXT: ri.vzip2b.vv v11, v8, v9
-; ZIP-NEXT: ri.vzip2a.vv v10, v8, v9
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: srli a0, a0, 2
-; ZIP-NEXT: add a1, a0, a0
-; ZIP-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; ZIP-NEXT: vslideup.vx v10, v11, a0
-; ZIP-NEXT: vmv.v.v v8, v10
-; ZIP-NEXT: ret
- %res = call <vscale x 4 x bfloat> @llvm.vector.interleave2.nxv4bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
- ret <vscale x 4 x bfloat> %res
+ %res = call <vscale x 112 x i8> @llvm.vector.interleave7.nxv112i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f, <vscale x 16 x i8> %g)
+ ret <vscale x 112 x i8> %res
}
-define <vscale x 8 x bfloat> @vector_interleave_nxv8bf16_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
-; V-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
-; V: # %bb.0:
-; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; V-NEXT: vmv1r.v v10, v9
-; V-NEXT: vmv1r.v v11, v8
-; V-NEXT: vwaddu.vv v8, v11, v10
-; V-NEXT: li a0, -1
-; V-NEXT: vwmaccu.vx v8, a0, v10
-; V-NEXT: ret
-;
-; ZVBB-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
-; ZVBB: # %bb.0:
-; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-NEXT: vmv1r.v v10, v9
-; ZVBB-NEXT: vmv1r.v v11, v8
-; ZVBB-NEXT: vwsll.vi v8, v10, 16
-; ZVBB-NEXT: vwaddu.wv v8, v8, v11
-; ZVBB-NEXT: ret
-;
-; ZIP-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
-; ZIP: # %bb.0:
-; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZIP-NEXT: vmv1r.v v10, v9
-; ZIP-NEXT: vmv1r.v v11, v8
-; ZIP-NEXT: ri.vzip2b.vv v9, v8, v10
-; ZIP-NEXT: ri.vzip2a.vv v8, v11, v10
-; ZIP-NEXT: ret
- %res = call <vscale x 8 x bfloat> @llvm.vector.interleave2.nxv8bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
- ret <vscale x 8 x bfloat> %res
-}
-define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
-; V-LABEL: vector_interleave_nxv4f16_nxv2f16:
-; V: # %bb.0:
-; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; V-NEXT: vwaddu.vv v10, v8, v9
-; V-NEXT: li a0, -1
-; V-NEXT: csrr a1, vlenb
-; V-NEXT: vwmaccu.vx v10, a0, v9
-; V-NEXT: srli a1, a1, 2
-; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; V-NEXT: vslidedown.vx v8, v10, a1
-; V-NEXT: add a0, a1, a1
-; V-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; V-NEXT: vslideup.vx v10, v8, a1
-; V-NEXT: vmv.v.v v8, v10
-; V-NEXT: ret
+define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i16> %d, <vscale x 8 x i16> %e, <vscale x 8 x i16> %f, <vscale x 8 x i16> %g) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv2r.v v26, v20
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv2r.v v24, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: sub a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv2r.v v20, v8
+; RV32-NEXT: vmv1r.v v1, v20
+; RV32-NEXT: vmv1r.v v3, v22
+; RV32-NEXT: vmv1r.v v5, v24
+; RV32-NEXT: vmv1r.v v7, v26
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v2, v10
+; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: slli a5, a2, 2
+; RV32-NEXT: vmv1r.v v4, v14
+; RV32-NEXT: slli a6, a2, 4
+; RV32-NEXT: add a7, a4, a2
+; RV32-NEXT: vmv1r.v v6, v18
+; RV32-NEXT: sub a5, a6, a5
+; RV32-NEXT: vmv1r.v v22, v11
+; RV32-NEXT: add a6, a7, a2
+; RV32-NEXT: vmv1r.v v24, v15
+; RV32-NEXT: vsseg7e16.v v1, (a0)
+; RV32-NEXT: vmv1r.v v26, v19
+; RV32-NEXT: vsseg7e16.v v21, (a1)
+; RV32-NEXT: vl1re16.v v18, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v19, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v20, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v21, (a6)
+; RV32-NEXT: add a6, a3, a2
+; RV32-NEXT: vl1re16.v v10, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v11, (a6)
+; RV32-NEXT: vl1re16.v v8, (a0)
+; RV32-NEXT: vl1re16.v v16, (a4)
+; RV32-NEXT: vl1re16.v v9, (a3)
+; RV32-NEXT: vl1re16.v v17, (a7)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 14
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v12, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v13, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a0, a2
+; RV32-NEXT: vl1re16.v v14, (a6)
+; RV32-NEXT: vl1re16.v v15, (a1)
+; RV32-NEXT: add a5, a0, a5
+; RV32-NEXT: vs2r.v v20, (a5)
+; RV32-NEXT: vs4r.v v16, (a2)
+; RV32-NEXT: vs8r.v v8, (a0)
+; RV32-NEXT: vl8re16.v v16, (a2)
+; RV32-NEXT: vl8re16.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv2r.v v26, v20
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv2r.v v24, v16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv2r.v v20, v8
+; RV64-NEXT: vmv1r.v v1, v20
+; RV64-NEXT: vmv1r.v v3, v22
+; RV64-NEXT: vmv1r.v v5, v24
+; RV64-NEXT: vmv1r.v v7, v26
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v2, v10
+; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: slli a5, a2, 2
+; RV64-NEXT: vmv1r.v v4, v14
+; RV64-NEXT: slli a6, a2, 4
+; RV64-NEXT: add a7, a4, a2
+; RV64-NEXT: vmv1r.v v6, v18
+; RV64-NEXT: sub a5, a6, a5
+; RV64-NEXT: vmv1r.v v22, v11
+; RV64-NEXT: add a6, a7, a2
+; RV64-NEXT: vmv1r.v v24, v15
+; RV64-NEXT: vsseg7e16.v v1, (a0)
+; RV64-NEXT: vmv1r.v v26, v19
+; RV64-NEXT: vsseg7e16.v v21, (a1)
+; RV64-NEXT: vl1re16.v v18, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v19, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v20, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v21, (a6)
+; RV64-NEXT: add a6, a3, a2
+; RV64-NEXT: vl1re16.v v10, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v11, (a6)
+; RV64-NEXT: vl1re16.v v8, (a0)
+; RV64-NEXT: vl1re16.v v16, (a4)
+; RV64-NEXT: vl1re16.v v9, (a3)
+; RV64-NEXT: vl1re16.v v17, (a7)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 14
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v12, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v13, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: vl1re16.v v14, (a6)
+; RV64-NEXT: vl1re16.v v15, (a1)
+; RV64-NEXT: add a5, a0, a5
+; RV64-NEXT: vs2r.v v20, (a5)
+; RV64-NEXT: vs4r.v v16, (a2)
+; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vl8re16.v v16, (a2)
+; RV64-NEXT: vl8re16.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: slli a0, a0, 5
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v26, v20
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv2r.v v24, v16
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: slli a2, a1, 3
+; ZVBB-RV32-NEXT: sub a1, a2, a1
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv2r.v v20, v8
+; ZVBB-RV32-NEXT: vmv1r.v v1, v20
+; ZVBB-RV32-NEXT: vmv1r.v v3, v22
+; ZVBB-RV32-NEXT: vmv1r.v v5, v24
+; ZVBB-RV32-NEXT: vmv1r.v v7, v26
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v2, v10
+; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: slli a5, a2, 2
+; ZVBB-RV32-NEXT: vmv1r.v v4, v14
+; ZVBB-RV32-NEXT: slli a6, a2, 4
+; ZVBB-RV32-NEXT: add a7, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v6, v18
+; ZVBB-RV32-NEXT: sub a5, a6, a5
+; ZVBB-RV32-NEXT: vmv1r.v v22, v11
+; ZVBB-RV32-NEXT: add a6, a7, a2
+; ZVBB-RV32-NEXT: vmv1r.v v24, v15
+; ZVBB-RV32-NEXT: vsseg7e16.v v1, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v26, v19
+; ZVBB-RV32-NEXT: vsseg7e16.v v21, (a1)
+; ZVBB-RV32-NEXT: vl1re16.v v18, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v19, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v20, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v21, (a6)
+; ZVBB-RV32-NEXT: add a6, a3, a2
+; ZVBB-RV32-NEXT: vl1re16.v v10, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v11, (a6)
+; ZVBB-RV32-NEXT: vl1re16.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re16.v v16, (a4)
+; ZVBB-RV32-NEXT: vl1re16.v v9, (a3)
+; ZVBB-RV32-NEXT: vl1re16.v v17, (a7)
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a3, 14
+; ZVBB-RV32-NEXT: mul a0, a0, a3
+; ZVBB-RV32-NEXT: add a0, sp, a0
+; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v12, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v13, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a0, a2
+; ZVBB-RV32-NEXT: vl1re16.v v14, (a6)
+; ZVBB-RV32-NEXT: vl1re16.v v15, (a1)
+; ZVBB-RV32-NEXT: add a5, a0, a5
+; ZVBB-RV32-NEXT: vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT: vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8re16.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
+;
+; ZVBB-RV64-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: slli a0, a0, 5
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v26, v20
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv2r.v v24, v16
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: slli a2, a1, 3
+; ZVBB-RV64-NEXT: sub a1, a2, a1
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv2r.v v20, v8
+; ZVBB-RV64-NEXT: vmv1r.v v1, v20
+; ZVBB-RV64-NEXT: vmv1r.v v3, v22
+; ZVBB-RV64-NEXT: vmv1r.v v5, v24
+; ZVBB-RV64-NEXT: vmv1r.v v7, v26
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v2, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: slli a5, a2, 2
+; ZVBB-RV64-NEXT: vmv1r.v v4, v14
+; ZVBB-RV64-NEXT: slli a6, a2, 4
+; ZVBB-RV64-NEXT: add a7, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v6, v18
+; ZVBB-RV64-NEXT: sub a5, a6, a5
+; ZVBB-RV64-NEXT: vmv1r.v v22, v11
+; ZVBB-RV64-NEXT: add a6, a7, a2
+; ZVBB-RV64-NEXT: vmv1r.v v24, v15
+; ZVBB-RV64-NEXT: vsseg7e16.v v1, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v26, v19
+; ZVBB-RV64-NEXT: vsseg7e16.v v21, (a1)
+; ZVBB-RV64-NEXT: vl1re16.v v18, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v19, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v20, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v21, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1re16.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v11, (a6)
+; ZVBB-RV64-NEXT: vl1re16.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re16.v v16, (a4)
+; ZVBB-RV64-NEXT: vl1re16.v v9, (a3)
+; ZVBB-RV64-NEXT: vl1re16.v v17, (a7)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 14
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v12, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v13, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vl1re16.v v14, (a6)
+; ZVBB-RV64-NEXT: vl1re16.v v15, (a1)
+; ZVBB-RV64-NEXT: add a5, a0, a5
+; ZVBB-RV64-NEXT: vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT: vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8re16.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: addi sp, sp, -80
+; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT: addi s0, sp, 80
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a0, a0, 5
+; ZIP-NEXT: sub sp, sp, a0
+; ZIP-NEXT: andi sp, sp, -64
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: vmv2r.v v26, v20
+; ZIP-NEXT: addi a0, sp, 64
+; ZIP-NEXT: vmv2r.v v24, v16
+; ZIP-NEXT: csrr a1, vlenb
+; ZIP-NEXT: slli a2, a1, 3
+; ZIP-NEXT: sub a1, a2, a1
+; ZIP-NEXT: add a1, sp, a1
+; ZIP-NEXT: addi a1, a1, 64
+; ZIP-NEXT: vmv2r.v v22, v12
+; ZIP-NEXT: csrr a2, vlenb
+; ZIP-NEXT: vmv2r.v v20, v8
+; ZIP-NEXT: vmv1r.v v1, v20
+; ZIP-NEXT: vmv1r.v v3, v22
+; ZIP-NEXT: vmv1r.v v5, v24
+; ZIP-NEXT: vmv1r.v v7, v26
+; ZIP-NEXT: add a3, a0, a2
+; ZIP-NEXT: vmv1r.v v2, v10
+; ZIP-NEXT: add a4, a1, a2
+; ZIP-NEXT: slli a5, a2, 2
+; ZIP-NEXT: vmv1r.v v4, v14
+; ZIP-NEXT: slli a6, a2, 4
+; ZIP-NEXT: add a7, a4, a2
+; ZIP-NEXT: vmv1r.v v6, v18
+; ZIP-NEXT: sub a5, a6, a5
+; ZIP-NEXT: vmv1r.v v22, v11
+; ZIP-NEXT: add a6, a7, a2
+; ZIP-NEXT: vmv1r.v v24, v15
+; ZIP-NEXT: vsseg7e16.v v1, (a0)
+; ZIP-NEXT: vmv1r.v v26, v19
+; ZIP-NEXT: vsseg7e16.v v21, (a1)
+; ZIP-NEXT: vl1re16.v v18, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v19, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v20, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v21, (a6)
+; ZIP-NEXT: add a6, a3, a2
+; ZIP-NEXT: vl1re16.v v10, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v11, (a6)
+; ZIP-NEXT: vl1re16.v v8, (a0)
+; ZIP-NEXT: vl1re16.v v16, (a4)
+; ZIP-NEXT: vl1re16.v v9, (a3)
+; ZIP-NEXT: vl1re16.v v17, (a7)
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: li a3, 14
+; ZIP-NEXT: mul a0, a0, a3
+; ZIP-NEXT: add a0, sp, a0
+; ZIP-NEXT: addi a0, a0, 64
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v12, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v13, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: slli a2, a2, 3
+; ZIP-NEXT: add a2, a0, a2
+; ZIP-NEXT: vl1re16.v v14, (a6)
+; ZIP-NEXT: vl1re16.v v15, (a1)
+; ZIP-NEXT: add a5, a0, a5
+; ZIP-NEXT: vs2r.v v20, (a5)
+; ZIP-NEXT: vs4r.v v16, (a2)
+; ZIP-NEXT: vs8r.v v8, (a0)
+; ZIP-NEXT: vl8re16.v v16, (a2)
+; ZIP-NEXT: vl8re16.v v8, (a0)
+; ZIP-NEXT: addi sp, s0, -80
+; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT: addi sp, sp, 80
+; ZIP-NEXT: ret
+ %res = call <vscale x 56 x i16> @llvm.vector.interleave7.nxv56i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i16> %d, <vscale x 8 x i16> %e, <vscale x 8 x i16> %f, <vscale x 8 x i16> %g)
+ ret <vscale x 56 x i16> %res
+}
+
+
+define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f, <vscale x 4 x i32> %g) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT: vmv2r.v v26, v20
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv2r.v v24, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: sub a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv2r.v v20, v8
+; RV32-NEXT: vmv1r.v v1, v20
+; RV32-NEXT: vmv1r.v v3, v22
+; RV32-NEXT: vmv1r.v v5, v24
+; RV32-NEXT: vmv1r.v v7, v26
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v2, v10
+; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: slli a5, a2, 2
+; RV32-NEXT: vmv1r.v v4, v14
+; RV32-NEXT: slli a6, a2, 4
+; RV32-NEXT: add a7, a4, a2
+; RV32-NEXT: vmv1r.v v6, v18
+; RV32-NEXT: sub a5, a6, a5
+; RV32-NEXT: vmv1r.v v22, v11
+; RV32-NEXT: add a6, a7, a2
+; RV32-NEXT: vmv1r.v v24, v15
+; RV32-NEXT: vsseg7e32.v v1, (a0)
+; RV32-NEXT: vmv1r.v v26, v19
+; RV32-NEXT: vsseg7e32.v v21, (a1)
+; RV32-NEXT: vl1re32.v v18, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v19, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v20, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v21, (a6)
+; RV32-NEXT: add a6, a3, a2
+; RV32-NEXT: vl1re32.v v10, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v11, (a6)
+; RV32-NEXT: vl1re32.v v8, (a0)
+; RV32-NEXT: vl1re32.v v16, (a4)
+; RV32-NEXT: vl1re32.v v9, (a3)
+; RV32-NEXT: vl1re32.v v17, (a7)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 14
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v12, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v13, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a0, a2
+; RV32-NEXT: vl1re32.v v14, (a6)
+; RV32-NEXT: vl1re32.v v15, (a1)
+; RV32-NEXT: add a5, a0, a5
+; RV32-NEXT: vs2r.v v20, (a5)
+; RV32-NEXT: vs4r.v v16, (a2)
+; RV32-NEXT: vs8r.v v8, (a0)
+; RV32-NEXT: vl8re32.v v16, (a2)
+; RV32-NEXT: vl8re32.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; RV64-NEXT: vmv2r.v v26, v20
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv2r.v v24, v16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv2r.v v20, v8
+; RV64-NEXT: vmv1r.v v1, v20
+; RV64-NEXT: vmv1r.v v3, v22
+; RV64-NEXT: vmv1r.v v5, v24
+; RV64-NEXT: vmv1r.v v7, v26
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v2, v10
+; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: slli a5, a2, 2
+; RV64-NEXT: vmv1r.v v4, v14
+; RV64-NEXT: slli a6, a2, 4
+; RV64-NEXT: add a7, a4, a2
+; RV64-NEXT: vmv1r.v v6, v18
+; RV64-NEXT: sub a5, a6, a5
+; RV64-NEXT: vmv1r.v v22, v11
+; RV64-NEXT: add a6, a7, a2
+; RV64-NEXT: vmv1r.v v24, v15
+; RV64-NEXT: vsseg7e32.v v1, (a0)
+; RV64-NEXT: vmv1r.v v26, v19
+; RV64-NEXT: vsseg7e32.v v21, (a1)
+; RV64-NEXT: vl1re32.v v18, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v19, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v20, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v21, (a6)
+; RV64-NEXT: add a6, a3, a2
+; RV64-NEXT: vl1re32.v v10, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v11, (a6)
+; RV64-NEXT: vl1re32.v v8, (a0)
+; RV64-NEXT: vl1re32.v v16, (a4)
+; RV64-NEXT: vl1re32.v v9, (a3)
+; RV64-NEXT: vl1re32.v v17, (a7)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 14
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v12, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v13, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: vl1re32.v v14, (a6)
+; RV64-NEXT: vl1re32.v v15, (a1)
+; RV64-NEXT: add a5, a0, a5
+; RV64-NEXT: vs2r.v v20, (a5)
+; RV64-NEXT: vs4r.v v16, (a2)
+; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vl8re32.v v16, (a2)
+; RV64-NEXT: vl8re32.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: slli a0, a0, 5
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v26, v20
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv2r.v v24, v16
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: slli a2, a1, 3
+; ZVBB-RV32-NEXT: sub a1, a2, a1
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv2r.v v20, v8
+; ZVBB-RV32-NEXT: vmv1r.v v1, v20
+; ZVBB-RV32-NEXT: vmv1r.v v3, v22
+; ZVBB-RV32-NEXT: vmv1r.v v5, v24
+; ZVBB-RV32-NEXT: vmv1r.v v7, v26
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v2, v10
+; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: slli a5, a2, 2
+; ZVBB-RV32-NEXT: vmv1r.v v4, v14
+; ZVBB-RV32-NEXT: slli a6, a2, 4
+; ZVBB-RV32-NEXT: add a7, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v6, v18
+; ZVBB-RV32-NEXT: sub a5, a6, a5
+; ZVBB-RV32-NEXT: vmv1r.v v22, v11
+; ZVBB-RV32-NEXT: add a6, a7, a2
+; ZVBB-RV32-NEXT: vmv1r.v v24, v15
+; ZVBB-RV32-NEXT: vsseg7e32.v v1, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v26, v19
+; ZVBB-RV32-NEXT: vsseg7e32.v v21, (a1)
+; ZVBB-RV32-NEXT: vl1re32.v v18, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v19, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v20, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v21, (a6)
+; ZVBB-RV32-NEXT: add a6, a3, a2
+; ZVBB-RV32-NEXT: vl1re32.v v10, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v11, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re32.v v16, (a4)
+; ZVBB-RV32-NEXT: vl1re32.v v9, (a3)
+; ZVBB-RV32-NEXT: vl1re32.v v17, (a7)
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a3, 14
+; ZVBB-RV32-NEXT: mul a0, a0, a3
+; ZVBB-RV32-NEXT: add a0, sp, a0
+; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v12, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v13, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a0, a2
+; ZVBB-RV32-NEXT: vl1re32.v v14, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v15, (a1)
+; ZVBB-RV32-NEXT: add a5, a0, a5
+; ZVBB-RV32-NEXT: vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT: vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8re32.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re32.v v8, (a0)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
+;
+; ZVBB-RV64-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: slli a0, a0, 5
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v26, v20
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv2r.v v24, v16
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: slli a2, a1, 3
+; ZVBB-RV64-NEXT: sub a1, a2, a1
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv2r.v v20, v8
+; ZVBB-RV64-NEXT: vmv1r.v v1, v20
+; ZVBB-RV64-NEXT: vmv1r.v v3, v22
+; ZVBB-RV64-NEXT: vmv1r.v v5, v24
+; ZVBB-RV64-NEXT: vmv1r.v v7, v26
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v2, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: slli a5, a2, 2
+; ZVBB-RV64-NEXT: vmv1r.v v4, v14
+; ZVBB-RV64-NEXT: slli a6, a2, 4
+; ZVBB-RV64-NEXT: add a7, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v6, v18
+; ZVBB-RV64-NEXT: sub a5, a6, a5
+; ZVBB-RV64-NEXT: vmv1r.v v22, v11
+; ZVBB-RV64-NEXT: add a6, a7, a2
+; ZVBB-RV64-NEXT: vmv1r.v v24, v15
+; ZVBB-RV64-NEXT: vsseg7e32.v v1, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v26, v19
+; ZVBB-RV64-NEXT: vsseg7e32.v v21, (a1)
+; ZVBB-RV64-NEXT: vl1re32.v v18, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v19, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v20, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v21, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1re32.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v11, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re32.v v16, (a4)
+; ZVBB-RV64-NEXT: vl1re32.v v9, (a3)
+; ZVBB-RV64-NEXT: vl1re32.v v17, (a7)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 14
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v12, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v13, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vl1re32.v v14, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v15, (a1)
+; ZVBB-RV64-NEXT: add a5, a0, a5
+; ZVBB-RV64-NEXT: vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT: vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8re32.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re32.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; ZIP: # %bb.0:
+; ZIP-NEXT: addi sp, sp, -80
+; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT: addi s0, sp, 80
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a0, a0, 5
+; ZIP-NEXT: sub sp, sp, a0
+; ZIP-NEXT: andi sp, sp, -64
+; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZIP-NEXT: vmv2r.v v26, v20
+; ZIP-NEXT: addi a0, sp, 64
+; ZIP-NEXT: vmv2r.v v24, v16
+; ZIP-NEXT: csrr a1, vlenb
+; ZIP-NEXT: slli a2, a1, 3
+; ZIP-NEXT: sub a1, a2, a1
+; ZIP-NEXT: add a1, sp, a1
+; ZIP-NEXT: addi a1, a1, 64
+; ZIP-NEXT: vmv2r.v v22, v12
+; ZIP-NEXT: csrr a2, vlenb
+; ZIP-NEXT: vmv2r.v v20, v8
+; ZIP-NEXT: vmv1r.v v1, v20
+; ZIP-NEXT: vmv1r.v v3, v22
+; ZIP-NEXT: vmv1r.v v5, v24
+; ZIP-NEXT: vmv1r.v v7, v26
+; ZIP-NEXT: add a3, a0, a2
+; ZIP-NEXT: vmv1r.v v2, v10
+; ZIP-NEXT: add a4, a1, a2
+; ZIP-NEXT: slli a5, a2, 2
+; ZIP-NEXT: vmv1r.v v4, v14
+; ZIP-NEXT: slli a6, a2, 4
+; ZIP-NEXT: add a7, a4, a2
+; ZIP-NEXT: vmv1r.v v6, v18
+; ZIP-NEXT: sub a5, a6, a5
+; ZIP-NEXT: vmv1r.v v22, v11
+; ZIP-NEXT: add a6, a7, a2
+; ZIP-NEXT: vmv1r.v v24, v15
+; ZIP-NEXT: vsseg7e32.v v1, (a0)
+; ZIP-NEXT: vmv1r.v v26, v19
+; ZIP-NEXT: vsseg7e32.v v21, (a1)
+; ZIP-NEXT: vl1re32.v v18, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re32.v v19, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re32.v v20, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re32.v v21, (a6)
+; ZIP-NEXT: add a6, a3, a2
+; ZIP-NEXT: vl1re32.v v10, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re32.v v11, (a6)
+; ZIP-NEXT: vl1re32.v v8, (a0)
+; ZIP-NEXT: vl1re32.v v16, (a4)
+; ZIP-NEXT: vl1re32.v v9, (a3)
+; ZIP-NEXT: vl1re32.v v17, (a7)
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: li a3, 14
+; ZIP-NEXT: mul a0, a0, a3
+; ZIP-NEXT: add a0, sp, a0
+; ZIP-NEXT: addi a0, a0, 64
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re32.v v12, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re32.v v13, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: slli a2, a2, 3
+; ZIP-NEXT: add a2, a0, a2
+; ZIP-NEXT: vl1re32.v v14, (a6)
+; ZIP-NEXT: vl1re32.v v15, (a1)
+; ZIP-NEXT: add a5, a0, a5
+; ZIP-NEXT: vs2r.v v20, (a5)
+; ZIP-NEXT: vs4r.v v16, (a2)
+; ZIP-NEXT: vs8r.v v8, (a0)
+; ZIP-NEXT: vl8re32.v v16, (a2)
+; ZIP-NEXT: vl8re32.v v8, (a0)
+; ZIP-NEXT: addi sp, s0, -80
+; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT: addi sp, sp, 80
+; ZIP-NEXT: ret
+ %res = call <vscale x 28 x i32> @llvm.vector.interleave7.nxv28i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f, <vscale x 4 x i32> %g)
+ ret <vscale x 28 x i32> %res
+}
+
+define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: slli a0, a0, 5
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT: vmv2r.v v26, v20
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv2r.v v24, v16
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: sub a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv2r.v v20, v8
+; RV32-NEXT: vmv1r.v v1, v20
+; RV32-NEXT: vmv1r.v v3, v22
+; RV32-NEXT: vmv1r.v v5, v24
+; RV32-NEXT: vmv1r.v v7, v26
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v2, v10
+; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: slli a5, a2, 2
+; RV32-NEXT: vmv1r.v v4, v14
+; RV32-NEXT: slli a6, a2, 4
+; RV32-NEXT: add a7, a4, a2
+; RV32-NEXT: vmv1r.v v6, v18
+; RV32-NEXT: sub a5, a6, a5
+; RV32-NEXT: vmv1r.v v22, v11
+; RV32-NEXT: add a6, a7, a2
+; RV32-NEXT: vmv1r.v v24, v15
+; RV32-NEXT: vsseg7e64.v v1, (a0)
+; RV32-NEXT: vmv1r.v v26, v19
+; RV32-NEXT: vsseg7e64.v v21, (a1)
+; RV32-NEXT: vl1re64.v v18, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v19, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v20, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v21, (a6)
+; RV32-NEXT: add a6, a3, a2
+; RV32-NEXT: vl1re64.v v10, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v11, (a6)
+; RV32-NEXT: vl1re64.v v8, (a0)
+; RV32-NEXT: vl1re64.v v16, (a4)
+; RV32-NEXT: vl1re64.v v9, (a3)
+; RV32-NEXT: vl1re64.v v17, (a7)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 14
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v12, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v13, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a0, a2
+; RV32-NEXT: vl1re64.v v14, (a6)
+; RV32-NEXT: vl1re64.v v15, (a1)
+; RV32-NEXT: add a5, a0, a5
+; RV32-NEXT: vs2r.v v20, (a5)
+; RV32-NEXT: vs4r.v v16, (a2)
+; RV32-NEXT: vs8r.v v8, (a0)
+; RV32-NEXT: vl8re64.v v16, (a2)
+; RV32-NEXT: vl8re64.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: slli a0, a0, 5
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv2r.v v26, v20
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv2r.v v24, v16
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: sub a1, a2, a1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv2r.v v20, v8
+; RV64-NEXT: vmv1r.v v1, v20
+; RV64-NEXT: vmv1r.v v3, v22
+; RV64-NEXT: vmv1r.v v5, v24
+; RV64-NEXT: vmv1r.v v7, v26
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v2, v10
+; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: slli a5, a2, 2
+; RV64-NEXT: vmv1r.v v4, v14
+; RV64-NEXT: slli a6, a2, 4
+; RV64-NEXT: add a7, a4, a2
+; RV64-NEXT: vmv1r.v v6, v18
+; RV64-NEXT: sub a5, a6, a5
+; RV64-NEXT: vmv1r.v v22, v11
+; RV64-NEXT: add a6, a7, a2
+; RV64-NEXT: vmv1r.v v24, v15
+; RV64-NEXT: vsseg7e64.v v1, (a0)
+; RV64-NEXT: vmv1r.v v26, v19
+; RV64-NEXT: vsseg7e64.v v21, (a1)
+; RV64-NEXT: vl1re64.v v18, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v19, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v20, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v21, (a6)
+; RV64-NEXT: add a6, a3, a2
+; RV64-NEXT: vl1re64.v v10, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v11, (a6)
+; RV64-NEXT: vl1re64.v v8, (a0)
+; RV64-NEXT: vl1re64.v v16, (a4)
+; RV64-NEXT: vl1re64.v v9, (a3)
+; RV64-NEXT: vl1re64.v v17, (a7)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 14
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v12, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v13, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: vl1re64.v v14, (a6)
+; RV64-NEXT: vl1re64.v v15, (a1)
+; RV64-NEXT: add a5, a0, a5
+; RV64-NEXT: vs2r.v v20, (a5)
+; RV64-NEXT: vs4r.v v16, (a2)
+; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vl8re64.v v16, (a2)
+; RV64-NEXT: vl8re64.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: slli a0, a0, 5
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v26, v20
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv2r.v v24, v16
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: slli a2, a1, 3
+; ZVBB-RV32-NEXT: sub a1, a2, a1
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv2r.v v20, v8
+; ZVBB-RV32-NEXT: vmv1r.v v1, v20
+; ZVBB-RV32-NEXT: vmv1r.v v3, v22
+; ZVBB-RV32-NEXT: vmv1r.v v5, v24
+; ZVBB-RV32-NEXT: vmv1r.v v7, v26
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v2, v10
+; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: slli a5, a2, 2
+; ZVBB-RV32-NEXT: vmv1r.v v4, v14
+; ZVBB-RV32-NEXT: slli a6, a2, 4
+; ZVBB-RV32-NEXT: add a7, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v6, v18
+; ZVBB-RV32-NEXT: sub a5, a6, a5
+; ZVBB-RV32-NEXT: vmv1r.v v22, v11
+; ZVBB-RV32-NEXT: add a6, a7, a2
+; ZVBB-RV32-NEXT: vmv1r.v v24, v15
+; ZVBB-RV32-NEXT: vsseg7e64.v v1, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v26, v19
+; ZVBB-RV32-NEXT: vsseg7e64.v v21, (a1)
+; ZVBB-RV32-NEXT: vl1re64.v v18, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v19, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v20, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v21, (a6)
+; ZVBB-RV32-NEXT: add a6, a3, a2
+; ZVBB-RV32-NEXT: vl1re64.v v10, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v11, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re64.v v16, (a4)
+; ZVBB-RV32-NEXT: vl1re64.v v9, (a3)
+; ZVBB-RV32-NEXT: vl1re64.v v17, (a7)
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a3, 14
+; ZVBB-RV32-NEXT: mul a0, a0, a3
+; ZVBB-RV32-NEXT: add a0, sp, a0
+; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v12, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v13, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a0, a2
+; ZVBB-RV32-NEXT: vl1re64.v v14, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v15, (a1)
+; ZVBB-RV32-NEXT: add a5, a0, a5
+; ZVBB-RV32-NEXT: vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT: vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re64.v v8, (a0)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
+;
+; ZVBB-RV64-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: slli a0, a0, 5
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v26, v20
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv2r.v v24, v16
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: slli a2, a1, 3
+; ZVBB-RV64-NEXT: sub a1, a2, a1
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv2r.v v20, v8
+; ZVBB-RV64-NEXT: vmv1r.v v1, v20
+; ZVBB-RV64-NEXT: vmv1r.v v3, v22
+; ZVBB-RV64-NEXT: vmv1r.v v5, v24
+; ZVBB-RV64-NEXT: vmv1r.v v7, v26
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v2, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: slli a5, a2, 2
+; ZVBB-RV64-NEXT: vmv1r.v v4, v14
+; ZVBB-RV64-NEXT: slli a6, a2, 4
+; ZVBB-RV64-NEXT: add a7, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v6, v18
+; ZVBB-RV64-NEXT: sub a5, a6, a5
+; ZVBB-RV64-NEXT: vmv1r.v v22, v11
+; ZVBB-RV64-NEXT: add a6, a7, a2
+; ZVBB-RV64-NEXT: vmv1r.v v24, v15
+; ZVBB-RV64-NEXT: vsseg7e64.v v1, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v26, v19
+; ZVBB-RV64-NEXT: vsseg7e64.v v21, (a1)
+; ZVBB-RV64-NEXT: vl1re64.v v18, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v19, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v20, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v21, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1re64.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v11, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re64.v v16, (a4)
+; ZVBB-RV64-NEXT: vl1re64.v v9, (a3)
+; ZVBB-RV64-NEXT: vl1re64.v v17, (a7)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 14
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v12, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v13, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vl1re64.v v14, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v15, (a1)
+; ZVBB-RV64-NEXT: add a5, a0, a5
+; ZVBB-RV64-NEXT: vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT: vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re64.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZIP: # %bb.0:
+; ZIP-NEXT: addi sp, sp, -80
+; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT: addi s0, sp, 80
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: slli a0, a0, 5
+; ZIP-NEXT: sub sp, sp, a0
+; ZIP-NEXT: andi sp, sp, -64
+; ZIP-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZIP-NEXT: vmv2r.v v26, v20
+; ZIP-NEXT: addi a0, sp, 64
+; ZIP-NEXT: vmv2r.v v24, v16
+; ZIP-NEXT: csrr a1, vlenb
+; ZIP-NEXT: slli a2, a1, 3
+; ZIP-NEXT: sub a1, a2, a1
+; ZIP-NEXT: add a1, sp, a1
+; ZIP-NEXT: addi a1, a1, 64
+; ZIP-NEXT: vmv2r.v v22, v12
+; ZIP-NEXT: csrr a2, vlenb
+; ZIP-NEXT: vmv2r.v v20, v8
+; ZIP-NEXT: vmv1r.v v1, v20
+; ZIP-NEXT: vmv1r.v v3, v22
+; ZIP-NEXT: vmv1r.v v5, v24
+; ZIP-NEXT: vmv1r.v v7, v26
+; ZIP-NEXT: add a3, a0, a2
+; ZIP-NEXT: vmv1r.v v2, v10
+; ZIP-NEXT: add a4, a1, a2
+; ZIP-NEXT: slli a5, a2, 2
+; ZIP-NEXT: vmv1r.v v4, v14
+; ZIP-NEXT: slli a6, a2, 4
+; ZIP-NEXT: add a7, a4, a2
+; ZIP-NEXT: vmv1r.v v6, v18
+; ZIP-NEXT: sub a5, a6, a5
+; ZIP-NEXT: vmv1r.v v22, v11
+; ZIP-NEXT: add a6, a7, a2
+; ZIP-NEXT: vmv1r.v v24, v15
+; ZIP-NEXT: vsseg7e64.v v1, (a0)
+; ZIP-NEXT: vmv1r.v v26, v19
+; ZIP-NEXT: vsseg7e64.v v21, (a1)
+; ZIP-NEXT: vl1re64.v v18, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re64.v v19, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re64.v v20, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re64.v v21, (a6)
+; ZIP-NEXT: add a6, a3, a2
+; ZIP-NEXT: vl1re64.v v10, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re64.v v11, (a6)
+; ZIP-NEXT: vl1re64.v v8, (a0)
+; ZIP-NEXT: vl1re64.v v16, (a4)
+; ZIP-NEXT: vl1re64.v v9, (a3)
+; ZIP-NEXT: vl1re64.v v17, (a7)
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: li a3, 14
+; ZIP-NEXT: mul a0, a0, a3
+; ZIP-NEXT: add a0, sp, a0
+; ZIP-NEXT: addi a0, a0, 64
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re64.v v12, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re64.v v13, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: slli a2, a2, 3
+; ZIP-NEXT: add a2, a0, a2
+; ZIP-NEXT: vl1re64.v v14, (a6)
+; ZIP-NEXT: vl1re64.v v15, (a1)
+; ZIP-NEXT: add a5, a0, a5
+; ZIP-NEXT: vs2r.v v20, (a5)
+; ZIP-NEXT: vs4r.v v16, (a2)
+; ZIP-NEXT: vs8r.v v8, (a0)
+; ZIP-NEXT: vl8re64.v v16, (a2)
+; ZIP-NEXT: vl8re64.v v8, (a0)
+; ZIP-NEXT: addi sp, s0, -80
+; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT: addi sp, sp, 80
+; ZIP-NEXT: ret
+ %res = call <vscale x 14 x i64> @llvm.vector.interleave7.nxv14i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g)
+ ret <vscale x 14 x i64> %res
+}
+
+define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f, <vscale x 16 x i1> %g, <vscale x 16 x i1> %h) nounwind {
+; CHECK-LABEL: vector_interleave_nxv128i1_nxv16i1:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmv.v.i v22, 0
+; CHECK-NEXT: vmerge.vim v24, v22, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v8
+; CHECK-NEXT: vmerge.vim v16, v22, 1, v0
+; CHECK-NEXT: vmv1r.v v1, v24
+; CHECK-NEXT: vmv1r.v v2, v16
+; CHECK-NEXT: vmv1r.v v0, v9
+; CHECK-NEXT: vmerge.vim v26, v22, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vim v18, v22, 1, v0
+; CHECK-NEXT: vmv1r.v v3, v26
+; CHECK-NEXT: vmv1r.v v4, v18
+; CHECK-NEXT: vmv1r.v v0, v11
+; CHECK-NEXT: vmerge.vim v8, v22, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vim v20, v22, 1, v0
+; CHECK-NEXT: vmv1r.v v5, v8
+; CHECK-NEXT: vmv1r.v v6, v20
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: vmerge.vim v10, v22, 1, v0
+; CHECK-NEXT: vmv1r.v v0, v14
+; CHECK-NEXT: vmerge.vim v22, v22, 1, v0
+; CHECK-NEXT: vmv1r.v v7, v10
+; CHECK-NEXT: vmv1r.v v8, v22
+; CHECK-NEXT: vmv1r.v v16, v25
+; CHECK-NEXT: addi a2, sp, 16
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT: vsseg8e8.v v1, (a2)
+; CHECK-NEXT: vmv1r.v v18, v27
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: add a3, a2, a0
+; CHECK-NEXT: add a4, a1, a0
+; CHECK-NEXT: add a5, a3, a0
+; CHECK-NEXT: add a6, a4, a0
+; CHECK-NEXT: add a7, a5, a0
+; CHECK-NEXT: add t0, a6, a0
+; CHECK-NEXT: add t1, a7, a0
+; CHECK-NEXT: add t2, t0, a0
+; CHECK-NEXT: vmv1r.v v20, v9
+; CHECK-NEXT: add t3, t1, a0
+; CHECK-NEXT: vmv1r.v v22, v11
+; CHECK-NEXT: vsseg8e8.v v16, (a1)
+; CHECK-NEXT: vl1r.v v10, (t1)
+; CHECK-NEXT: add t1, t2, a0
+; CHECK-NEXT: vl1r.v v12, (a5)
+; CHECK-NEXT: add a5, t3, a0
+; CHECK-NEXT: vl1r.v v14, (a2)
+; CHECK-NEXT: add a2, t1, a0
+; CHECK-NEXT: vl1r.v v16, (a5)
+; CHECK-NEXT: add a5, a5, a0
+; CHECK-NEXT: vl1r.v v8, (a2)
+; CHECK-NEXT: add a2, a2, a0
+; CHECK-NEXT: vl1r.v v18, (t2)
+; CHECK-NEXT: vl1r.v v17, (a5)
+; CHECK-NEXT: vl1r.v v11, (t3)
+; CHECK-NEXT: vl1r.v v13, (a7)
+; CHECK-NEXT: vl1r.v v15, (a3)
+; CHECK-NEXT: vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT: vmsne.vi v20, v16, 0
+; CHECK-NEXT: vmsne.vi v16, v10, 0
+; CHECK-NEXT: vl1r.v v10, (a6)
+; CHECK-NEXT: vmsne.vi v17, v12, 0
+; CHECK-NEXT: vmsne.vi v0, v14, 0
+; CHECK-NEXT: vl1r.v v12, (a1)
+; CHECK-NEXT: vl1r.v v9, (a2)
+; CHECK-NEXT: vl1r.v v19, (t1)
+; CHECK-NEXT: vl1r.v v11, (t0)
+; CHECK-NEXT: vl1r.v v13, (a4)
+; CHECK-NEXT: vmsne.vi v14, v8, 0
+; CHECK-NEXT: vmsne.vi v9, v18, 0
+; CHECK-NEXT: vmsne.vi v15, v10, 0
+; CHECK-NEXT: vmsne.vi v8, v12, 0
+; CHECK-NEXT: srli a1, a0, 2
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v16, v20, a1
+; CHECK-NEXT: vslideup.vx v0, v17, a1
+; CHECK-NEXT: vslideup.vx v9, v14, a1
+; CHECK-NEXT: vslideup.vx v8, v15, a1
+; CHECK-NEXT: srli a0, a0, 1
+; CHECK-NEXT: add a1, a0, a0
+; CHECK-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v0, v16, a0
+; CHECK-NEXT: vslideup.vx v8, v9, a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv128i1_nxv16i1:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT: vmv.v.i v22, 0
+; ZVBB-NEXT: vmerge.vim v24, v22, 1, v0
+; ZVBB-NEXT: vmv1r.v v0, v8
+; ZVBB-NEXT: vmerge.vim v16, v22, 1, v0
+; ZVBB-NEXT: vmv1r.v v1, v24
+; ZVBB-NEXT: vmv1r.v v2, v16
+; ZVBB-NEXT: vmv1r.v v0, v9
+; ZVBB-NEXT: vmerge.vim v26, v22, 1, v0
+; ZVBB-NEXT: vmv1r.v v0, v10
+; ZVBB-NEXT: vmerge.vim v18, v22, 1, v0
+; ZVBB-NEXT: vmv1r.v v3, v26
+; ZVBB-NEXT: vmv1r.v v4, v18
+; ZVBB-NEXT: vmv1r.v v0, v11
+; ZVBB-NEXT: vmerge.vim v8, v22, 1, v0
+; ZVBB-NEXT: vmv1r.v v0, v12
+; ZVBB-NEXT: vmerge.vim v20, v22, 1, v0
+; ZVBB-NEXT: vmv1r.v v5, v8
+; ZVBB-NEXT: vmv1r.v v6, v20
+; ZVBB-NEXT: vmv1r.v v0, v13
+; ZVBB-NEXT: vmerge.vim v10, v22, 1, v0
+; ZVBB-NEXT: vmv1r.v v0, v14
+; ZVBB-NEXT: vmerge.vim v22, v22, 1, v0
+; ZVBB-NEXT: vmv1r.v v7, v10
+; ZVBB-NEXT: vmv1r.v v8, v22
+; ZVBB-NEXT: vmv1r.v v16, v25
+; ZVBB-NEXT: addi a2, sp, 16
+; ZVBB-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; ZVBB-NEXT: vsseg8e8.v v1, (a2)
+; ZVBB-NEXT: vmv1r.v v18, v27
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 3
+; ZVBB-NEXT: add a1, sp, a1
+; ZVBB-NEXT: addi a1, a1, 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: add a3, a2, a0
+; ZVBB-NEXT: add a4, a1, a0
+; ZVBB-NEXT: add a5, a3, a0
+; ZVBB-NEXT: add a6, a4, a0
+; ZVBB-NEXT: add a7, a5, a0
+; ZVBB-NEXT: add t0, a6, a0
+; ZVBB-NEXT: add t1, a7, a0
+; ZVBB-NEXT: add t2, t0, a0
+; ZVBB-NEXT: vmv1r.v v20, v9
+; ZVBB-NEXT: add t3, t1, a0
+; ZVBB-NEXT: vmv1r.v v22, v11
+; ZVBB-NEXT: vsseg8e8.v v16, (a1)
+; ZVBB-NEXT: vl1r.v v10, (t1)
+; ZVBB-NEXT: add t1, t2, a0
+; ZVBB-NEXT: vl1r.v v12, (a5)
+; ZVBB-NEXT: add a5, t3, a0
+; ZVBB-NEXT: vl1r.v v14, (a2)
+; ZVBB-NEXT: add a2, t1, a0
+; ZVBB-NEXT: vl1r.v v16, (a5)
+; ZVBB-NEXT: add a5, a5, a0
+; ZVBB-NEXT: vl1r.v v8, (a2)
+; ZVBB-NEXT: add a2, a2, a0
+; ZVBB-NEXT: vl1r.v v18, (t2)
+; ZVBB-NEXT: vl1r.v v17, (a5)
+; ZVBB-NEXT: vl1r.v v11, (t3)
+; ZVBB-NEXT: vl1r.v v13, (a7)
+; ZVBB-NEXT: vl1r.v v15, (a3)
+; ZVBB-NEXT: vsetvli a3, zero, e8, m2, ta, ma
+; ZVBB-NEXT: vmsne.vi v20, v16, 0
+; ZVBB-NEXT: vmsne.vi v16, v10, 0
+; ZVBB-NEXT: vl1r.v v10, (a6)
+; ZVBB-NEXT: vmsne.vi v17, v12, 0
+; ZVBB-NEXT: vmsne.vi v0, v14, 0
+; ZVBB-NEXT: vl1r.v v12, (a1)
+; ZVBB-NEXT: vl1r.v v9, (a2)
+; ZVBB-NEXT: vl1r.v v19, (t1)
+; ZVBB-NEXT: vl1r.v v11, (t0)
+; ZVBB-NEXT: vl1r.v v13, (a4)
+; ZVBB-NEXT: vmsne.vi v14, v8, 0
+; ZVBB-NEXT: vmsne.vi v9, v18, 0
+; ZVBB-NEXT: vmsne.vi v15, v10, 0
+; ZVBB-NEXT: vmsne.vi v8, v12, 0
+; ZVBB-NEXT: srli a1, a0, 2
+; ZVBB-NEXT: add a2, a1, a1
+; ZVBB-NEXT: vsetvli zero, a2, e8, mf2, ta, ma
+; ZVBB-NEXT: vslideup.vx v16, v20, a1
+; ZVBB-NEXT: vslideup.vx v0, v17, a1
+; ZVBB-NEXT: vslideup.vx v9, v14, a1
+; ZVBB-NEXT: vslideup.vx v8, v15, a1
+; ZVBB-NEXT: srli a0, a0, 1
+; ZVBB-NEXT: add a1, a0, a0
+; ZVBB-NEXT: vsetvli zero, a1, e8, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v0, v16, a0
+; ZVBB-NEXT: vslideup.vx v8, v9, a0
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 128 x i1> @llvm.vector.interleave8.nxv128i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f, <vscale x 16 x i1> %g, <vscale x 16 x i1> %h)
+ ret <vscale x 128 x i1> %res
+}
+
+
+define <vscale x 128 x i8> @vector_interleave_nxv128i8_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f, <vscale x 16 x i8> %g, <vscale x 16 x i8> %h) nounwind {
+;
+; CHECK-LABEL: vector_interleave_nxv128i8_nxv16i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v22
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv2r.v v26, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: add a2, a0, a3
+; CHECK-NEXT: add a4, a1, a3
+; CHECK-NEXT: add a5, a2, a3
+; CHECK-NEXT: vmv1r.v v1, v8
+; CHECK-NEXT: vmv2r.v v24, v14
+; CHECK-NEXT: add a6, a4, a3
+; CHECK-NEXT: vmv2r.v v22, v10
+; CHECK-NEXT: vmv1r.v v2, v22
+; CHECK-NEXT: add a7, a5, a3
+; CHECK-NEXT: vmv1r.v v3, v12
+; CHECK-NEXT: add t0, a6, a3
+; CHECK-NEXT: vmv1r.v v4, v24
+; CHECK-NEXT: add t1, a7, a3
+; CHECK-NEXT: vmv1r.v v5, v16
+; CHECK-NEXT: add t2, t0, a3
+; CHECK-NEXT: vmv1r.v v6, v26
+; CHECK-NEXT: add t3, t1, a3
+; CHECK-NEXT: vmv1r.v v7, v20
+; CHECK-NEXT: add t4, t2, a3
+; CHECK-NEXT: vmv1r.v v8, v28
+; CHECK-NEXT: vmv1r.v v22, v9
+; CHECK-NEXT: add t5, t3, a3
+; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: add t6, t4, a3
+; CHECK-NEXT: vmv1r.v v26, v17
+; CHECK-NEXT: vsseg8e8.v v1, (a0)
+; CHECK-NEXT: vmv1r.v v28, v21
+; CHECK-NEXT: vsseg8e8.v v22, (a1)
+; CHECK-NEXT: vl1r.v v14, (t5)
+; CHECK-NEXT: add t5, t5, a3
+; CHECK-NEXT: add a3, t6, a3
+; CHECK-NEXT: vl1r.v v22, (t6)
+; CHECK-NEXT: vl1r.v v15, (t5)
+; CHECK-NEXT: vl1r.v v23, (a3)
+; CHECK-NEXT: vl1r.v v12, (t1)
+; CHECK-NEXT: vl1r.v v20, (t2)
+; CHECK-NEXT: vl1r.v v13, (t3)
+; CHECK-NEXT: vl1r.v v21, (t4)
+; CHECK-NEXT: vl1r.v v10, (a5)
+; CHECK-NEXT: vl1r.v v18, (a6)
+; CHECK-NEXT: vl1r.v v11, (a7)
+; CHECK-NEXT: vl1r.v v19, (t0)
+; CHECK-NEXT: vl1r.v v8, (a0)
+; CHECK-NEXT: vl1r.v v16, (a1)
+; CHECK-NEXT: vl1r.v v9, (a2)
+; CHECK-NEXT: vl1r.v v17, (a4)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv128i8_nxv16i8:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: vsetvli a0, zero, e8, m1, ta, ma
+; ZVBB-NEXT: vmv2r.v v28, v22
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: vmv2r.v v26, v18
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 3
+; ZVBB-NEXT: add a1, sp, a1
+; ZVBB-NEXT: addi a1, a1, 16
+; ZVBB-NEXT: csrr a3, vlenb
+; ZVBB-NEXT: add a2, a0, a3
+; ZVBB-NEXT: add a4, a1, a3
+; ZVBB-NEXT: add a5, a2, a3
+; ZVBB-NEXT: vmv1r.v v1, v8
+; ZVBB-NEXT: vmv2r.v v24, v14
+; ZVBB-NEXT: add a6, a4, a3
+; ZVBB-NEXT: vmv2r.v v22, v10
+; ZVBB-NEXT: vmv1r.v v2, v22
+; ZVBB-NEXT: add a7, a5, a3
+; ZVBB-NEXT: vmv1r.v v3, v12
+; ZVBB-NEXT: add t0, a6, a3
+; ZVBB-NEXT: vmv1r.v v4, v24
+; ZVBB-NEXT: add t1, a7, a3
+; ZVBB-NEXT: vmv1r.v v5, v16
+; ZVBB-NEXT: add t2, t0, a3
+; ZVBB-NEXT: vmv1r.v v6, v26
+; ZVBB-NEXT: add t3, t1, a3
+; ZVBB-NEXT: vmv1r.v v7, v20
+; ZVBB-NEXT: add t4, t2, a3
+; ZVBB-NEXT: vmv1r.v v8, v28
+; ZVBB-NEXT: vmv1r.v v22, v9
+; ZVBB-NEXT: add t5, t3, a3
+; ZVBB-NEXT: vmv1r.v v24, v13
+; ZVBB-NEXT: add t6, t4, a3
+; ZVBB-NEXT: vmv1r.v v26, v17
+; ZVBB-NEXT: vsseg8e8.v v1, (a0)
+; ZVBB-NEXT: vmv1r.v v28, v21
+; ZVBB-NEXT: vsseg8e8.v v22, (a1)
+; ZVBB-NEXT: vl1r.v v14, (t5)
+; ZVBB-NEXT: add t5, t5, a3
+; ZVBB-NEXT: add a3, t6, a3
+; ZVBB-NEXT: vl1r.v v22, (t6)
+; ZVBB-NEXT: vl1r.v v15, (t5)
+; ZVBB-NEXT: vl1r.v v23, (a3)
+; ZVBB-NEXT: vl1r.v v12, (t1)
+; ZVBB-NEXT: vl1r.v v20, (t2)
+; ZVBB-NEXT: vl1r.v v13, (t3)
+; ZVBB-NEXT: vl1r.v v21, (t4)
+; ZVBB-NEXT: vl1r.v v10, (a5)
+; ZVBB-NEXT: vl1r.v v18, (a6)
+; ZVBB-NEXT: vl1r.v v11, (a7)
+; ZVBB-NEXT: vl1r.v v19, (t0)
+; ZVBB-NEXT: vl1r.v v8, (a0)
+; ZVBB-NEXT: vl1r.v v16, (a1)
+; ZVBB-NEXT: vl1r.v v9, (a2)
+; ZVBB-NEXT: vl1r.v v17, (a4)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 128 x i8> @llvm.vector.interleave8.nxv128i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f, <vscale x 16 x i8> %g, <vscale x 16 x i8> %h)
+ ret <vscale x 128 x i8> %res
+}
+
+
+define <vscale x 64 x i16> @vector_interleave_nxv64i16_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i16> %d, <vscale x 8 x i16> %e, <vscale x 8 x i16> %f, <vscale x 8 x i16> %g, <vscale x 8 x i16> %h) nounwind {
+;
+; CHECK-LABEL: vector_interleave_nxv64i16_nxv8i16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v22
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv2r.v v26, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: add a2, a0, a3
+; CHECK-NEXT: add a4, a1, a3
+; CHECK-NEXT: add a5, a2, a3
+; CHECK-NEXT: vmv1r.v v1, v8
+; CHECK-NEXT: vmv2r.v v24, v14
+; CHECK-NEXT: add a6, a4, a3
+; CHECK-NEXT: vmv2r.v v22, v10
+; CHECK-NEXT: vmv1r.v v2, v22
+; CHECK-NEXT: add a7, a5, a3
+; CHECK-NEXT: vmv1r.v v3, v12
+; CHECK-NEXT: add t0, a6, a3
+; CHECK-NEXT: vmv1r.v v4, v24
+; CHECK-NEXT: add t1, a7, a3
+; CHECK-NEXT: vmv1r.v v5, v16
+; CHECK-NEXT: add t2, t0, a3
+; CHECK-NEXT: vmv1r.v v6, v26
+; CHECK-NEXT: add t3, t1, a3
+; CHECK-NEXT: vmv1r.v v7, v20
+; CHECK-NEXT: add t4, t2, a3
+; CHECK-NEXT: vmv1r.v v8, v28
+; CHECK-NEXT: vmv1r.v v22, v9
+; CHECK-NEXT: add t5, t3, a3
+; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: add t6, t4, a3
+; CHECK-NEXT: vmv1r.v v26, v17
+; CHECK-NEXT: vsseg8e16.v v1, (a0)
+; CHECK-NEXT: vmv1r.v v28, v21
+; CHECK-NEXT: vsseg8e16.v v22, (a1)
+; CHECK-NEXT: vl1re16.v v14, (t5)
+; CHECK-NEXT: add t5, t5, a3
+; CHECK-NEXT: add a3, t6, a3
+; CHECK-NEXT: vl1re16.v v22, (t6)
+; CHECK-NEXT: vl1re16.v v15, (t5)
+; CHECK-NEXT: vl1re16.v v23, (a3)
+; CHECK-NEXT: vl1re16.v v12, (t1)
+; CHECK-NEXT: vl1re16.v v20, (t2)
+; CHECK-NEXT: vl1re16.v v13, (t3)
+; CHECK-NEXT: vl1re16.v v21, (t4)
+; CHECK-NEXT: vl1re16.v v10, (a5)
+; CHECK-NEXT: vl1re16.v v18, (a6)
+; CHECK-NEXT: vl1re16.v v11, (a7)
+; CHECK-NEXT: vl1re16.v v19, (t0)
+; CHECK-NEXT: vl1re16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v16, (a1)
+; CHECK-NEXT: vl1re16.v v9, (a2)
+; CHECK-NEXT: vl1re16.v v17, (a4)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64i16_nxv8i16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vmv2r.v v28, v22
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: vmv2r.v v26, v18
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 3
+; ZVBB-NEXT: add a1, sp, a1
+; ZVBB-NEXT: addi a1, a1, 16
+; ZVBB-NEXT: csrr a3, vlenb
+; ZVBB-NEXT: add a2, a0, a3
+; ZVBB-NEXT: add a4, a1, a3
+; ZVBB-NEXT: add a5, a2, a3
+; ZVBB-NEXT: vmv1r.v v1, v8
+; ZVBB-NEXT: vmv2r.v v24, v14
+; ZVBB-NEXT: add a6, a4, a3
+; ZVBB-NEXT: vmv2r.v v22, v10
+; ZVBB-NEXT: vmv1r.v v2, v22
+; ZVBB-NEXT: add a7, a5, a3
+; ZVBB-NEXT: vmv1r.v v3, v12
+; ZVBB-NEXT: add t0, a6, a3
+; ZVBB-NEXT: vmv1r.v v4, v24
+; ZVBB-NEXT: add t1, a7, a3
+; ZVBB-NEXT: vmv1r.v v5, v16
+; ZVBB-NEXT: add t2, t0, a3
+; ZVBB-NEXT: vmv1r.v v6, v26
+; ZVBB-NEXT: add t3, t1, a3
+; ZVBB-NEXT: vmv1r.v v7, v20
+; ZVBB-NEXT: add t4, t2, a3
+; ZVBB-NEXT: vmv1r.v v8, v28
+; ZVBB-NEXT: vmv1r.v v22, v9
+; ZVBB-NEXT: add t5, t3, a3
+; ZVBB-NEXT: vmv1r.v v24, v13
+; ZVBB-NEXT: add t6, t4, a3
+; ZVBB-NEXT: vmv1r.v v26, v17
+; ZVBB-NEXT: vsseg8e16.v v1, (a0)
+; ZVBB-NEXT: vmv1r.v v28, v21
+; ZVBB-NEXT: vsseg8e16.v v22, (a1)
+; ZVBB-NEXT: vl1re16.v v14, (t5)
+; ZVBB-NEXT: add t5, t5, a3
+; ZVBB-NEXT: add a3, t6, a3
+; ZVBB-NEXT: vl1re16.v v22, (t6)
+; ZVBB-NEXT: vl1re16.v v15, (t5)
+; ZVBB-NEXT: vl1re16.v v23, (a3)
+; ZVBB-NEXT: vl1re16.v v12, (t1)
+; ZVBB-NEXT: vl1re16.v v20, (t2)
+; ZVBB-NEXT: vl1re16.v v13, (t3)
+; ZVBB-NEXT: vl1re16.v v21, (t4)
+; ZVBB-NEXT: vl1re16.v v10, (a5)
+; ZVBB-NEXT: vl1re16.v v18, (a6)
+; ZVBB-NEXT: vl1re16.v v11, (a7)
+; ZVBB-NEXT: vl1re16.v v19, (t0)
+; ZVBB-NEXT: vl1re16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v16, (a1)
+; ZVBB-NEXT: vl1re16.v v9, (a2)
+; ZVBB-NEXT: vl1re16.v v17, (a4)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 64 x i16> @llvm.vector.interleave8.nxv64i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i16> %d, <vscale x 8 x i16> %e, <vscale x 8 x i16> %f, <vscale x 8 x i16> %g, <vscale x 8 x i16> %h)
+ ret <vscale x 64 x i16> %res
+}
+
+
+define <vscale x 32 x i32> @vector_interleave_nxv32i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f, <vscale x 4 x i32> %g, <vscale x 4 x i32> %h) nounwind {
+;
+; CHECK-LABEL: vector_interleave_nxv32i32_nxv4i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v22
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv2r.v v26, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: add a2, a0, a3
+; CHECK-NEXT: add a4, a1, a3
+; CHECK-NEXT: add a5, a2, a3
+; CHECK-NEXT: vmv1r.v v1, v8
+; CHECK-NEXT: vmv2r.v v24, v14
+; CHECK-NEXT: add a6, a4, a3
+; CHECK-NEXT: vmv2r.v v22, v10
+; CHECK-NEXT: vmv1r.v v2, v22
+; CHECK-NEXT: add a7, a5, a3
+; CHECK-NEXT: vmv1r.v v3, v12
+; CHECK-NEXT: add t0, a6, a3
+; CHECK-NEXT: vmv1r.v v4, v24
+; CHECK-NEXT: add t1, a7, a3
+; CHECK-NEXT: vmv1r.v v5, v16
+; CHECK-NEXT: add t2, t0, a3
+; CHECK-NEXT: vmv1r.v v6, v26
+; CHECK-NEXT: add t3, t1, a3
+; CHECK-NEXT: vmv1r.v v7, v20
+; CHECK-NEXT: add t4, t2, a3
+; CHECK-NEXT: vmv1r.v v8, v28
+; CHECK-NEXT: vmv1r.v v22, v9
+; CHECK-NEXT: add t5, t3, a3
+; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: add t6, t4, a3
+; CHECK-NEXT: vmv1r.v v26, v17
+; CHECK-NEXT: vsseg8e32.v v1, (a0)
+; CHECK-NEXT: vmv1r.v v28, v21
+; CHECK-NEXT: vsseg8e32.v v22, (a1)
+; CHECK-NEXT: vl1re32.v v14, (t5)
+; CHECK-NEXT: add t5, t5, a3
+; CHECK-NEXT: add a3, t6, a3
+; CHECK-NEXT: vl1re32.v v22, (t6)
+; CHECK-NEXT: vl1re32.v v15, (t5)
+; CHECK-NEXT: vl1re32.v v23, (a3)
+; CHECK-NEXT: vl1re32.v v12, (t1)
+; CHECK-NEXT: vl1re32.v v20, (t2)
+; CHECK-NEXT: vl1re32.v v13, (t3)
+; CHECK-NEXT: vl1re32.v v21, (t4)
+; CHECK-NEXT: vl1re32.v v10, (a5)
+; CHECK-NEXT: vl1re32.v v18, (a6)
+; CHECK-NEXT: vl1re32.v v11, (a7)
+; CHECK-NEXT: vl1re32.v v19, (t0)
+; CHECK-NEXT: vl1re32.v v8, (a0)
+; CHECK-NEXT: vl1re32.v v16, (a1)
+; CHECK-NEXT: vl1re32.v v9, (a2)
+; CHECK-NEXT: vl1re32.v v17, (a4)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv32i32_nxv4i32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-NEXT: vmv2r.v v28, v22
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: vmv2r.v v26, v18
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 3
+; ZVBB-NEXT: add a1, sp, a1
+; ZVBB-NEXT: addi a1, a1, 16
+; ZVBB-NEXT: csrr a3, vlenb
+; ZVBB-NEXT: add a2, a0, a3
+; ZVBB-NEXT: add a4, a1, a3
+; ZVBB-NEXT: add a5, a2, a3
+; ZVBB-NEXT: vmv1r.v v1, v8
+; ZVBB-NEXT: vmv2r.v v24, v14
+; ZVBB-NEXT: add a6, a4, a3
+; ZVBB-NEXT: vmv2r.v v22, v10
+; ZVBB-NEXT: vmv1r.v v2, v22
+; ZVBB-NEXT: add a7, a5, a3
+; ZVBB-NEXT: vmv1r.v v3, v12
+; ZVBB-NEXT: add t0, a6, a3
+; ZVBB-NEXT: vmv1r.v v4, v24
+; ZVBB-NEXT: add t1, a7, a3
+; ZVBB-NEXT: vmv1r.v v5, v16
+; ZVBB-NEXT: add t2, t0, a3
+; ZVBB-NEXT: vmv1r.v v6, v26
+; ZVBB-NEXT: add t3, t1, a3
+; ZVBB-NEXT: vmv1r.v v7, v20
+; ZVBB-NEXT: add t4, t2, a3
+; ZVBB-NEXT: vmv1r.v v8, v28
+; ZVBB-NEXT: vmv1r.v v22, v9
+; ZVBB-NEXT: add t5, t3, a3
+; ZVBB-NEXT: vmv1r.v v24, v13
+; ZVBB-NEXT: add t6, t4, a3
+; ZVBB-NEXT: vmv1r.v v26, v17
+; ZVBB-NEXT: vsseg8e32.v v1, (a0)
+; ZVBB-NEXT: vmv1r.v v28, v21
+; ZVBB-NEXT: vsseg8e32.v v22, (a1)
+; ZVBB-NEXT: vl1re32.v v14, (t5)
+; ZVBB-NEXT: add t5, t5, a3
+; ZVBB-NEXT: add a3, t6, a3
+; ZVBB-NEXT: vl1re32.v v22, (t6)
+; ZVBB-NEXT: vl1re32.v v15, (t5)
+; ZVBB-NEXT: vl1re32.v v23, (a3)
+; ZVBB-NEXT: vl1re32.v v12, (t1)
+; ZVBB-NEXT: vl1re32.v v20, (t2)
+; ZVBB-NEXT: vl1re32.v v13, (t3)
+; ZVBB-NEXT: vl1re32.v v21, (t4)
+; ZVBB-NEXT: vl1re32.v v10, (a5)
+; ZVBB-NEXT: vl1re32.v v18, (a6)
+; ZVBB-NEXT: vl1re32.v v11, (a7)
+; ZVBB-NEXT: vl1re32.v v19, (t0)
+; ZVBB-NEXT: vl1re32.v v8, (a0)
+; ZVBB-NEXT: vl1re32.v v16, (a1)
+; ZVBB-NEXT: vl1re32.v v9, (a2)
+; ZVBB-NEXT: vl1re32.v v17, (a4)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 32 x i32> @llvm.vector.interleave8.nxv32i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f, <vscale x 4 x i32> %g, <vscale x 4 x i32> %h)
+ ret <vscale x 32 x i32> %res
+}
+
+define <vscale x 16 x i64> @vector_interleave_nxv16i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g, <vscale x 2 x i64> %h) nounwind {
+;
+; CHECK-LABEL: vector_interleave_nxv16i64_nxv2i64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v22
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv2r.v v26, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: add a2, a0, a3
+; CHECK-NEXT: add a4, a1, a3
+; CHECK-NEXT: add a5, a2, a3
+; CHECK-NEXT: vmv1r.v v1, v8
+; CHECK-NEXT: vmv2r.v v24, v14
+; CHECK-NEXT: add a6, a4, a3
+; CHECK-NEXT: vmv2r.v v22, v10
+; CHECK-NEXT: vmv1r.v v2, v22
+; CHECK-NEXT: add a7, a5, a3
+; CHECK-NEXT: vmv1r.v v3, v12
+; CHECK-NEXT: add t0, a6, a3
+; CHECK-NEXT: vmv1r.v v4, v24
+; CHECK-NEXT: add t1, a7, a3
+; CHECK-NEXT: vmv1r.v v5, v16
+; CHECK-NEXT: add t2, t0, a3
+; CHECK-NEXT: vmv1r.v v6, v26
+; CHECK-NEXT: add t3, t1, a3
+; CHECK-NEXT: vmv1r.v v7, v20
+; CHECK-NEXT: add t4, t2, a3
+; CHECK-NEXT: vmv1r.v v8, v28
+; CHECK-NEXT: vmv1r.v v22, v9
+; CHECK-NEXT: add t5, t3, a3
+; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: add t6, t4, a3
+; CHECK-NEXT: vmv1r.v v26, v17
+; CHECK-NEXT: vsseg8e64.v v1, (a0)
+; CHECK-NEXT: vmv1r.v v28, v21
+; CHECK-NEXT: vsseg8e64.v v22, (a1)
+; CHECK-NEXT: vl1re64.v v14, (t5)
+; CHECK-NEXT: add t5, t5, a3
+; CHECK-NEXT: add a3, t6, a3
+; CHECK-NEXT: vl1re64.v v22, (t6)
+; CHECK-NEXT: vl1re64.v v15, (t5)
+; CHECK-NEXT: vl1re64.v v23, (a3)
+; CHECK-NEXT: vl1re64.v v12, (t1)
+; CHECK-NEXT: vl1re64.v v20, (t2)
+; CHECK-NEXT: vl1re64.v v13, (t3)
+; CHECK-NEXT: vl1re64.v v21, (t4)
+; CHECK-NEXT: vl1re64.v v10, (a5)
+; CHECK-NEXT: vl1re64.v v18, (a6)
+; CHECK-NEXT: vl1re64.v v11, (a7)
+; CHECK-NEXT: vl1re64.v v19, (t0)
+; CHECK-NEXT: vl1re64.v v8, (a0)
+; CHECK-NEXT: vl1re64.v v16, (a1)
+; CHECK-NEXT: vl1re64.v v9, (a2)
+; CHECK-NEXT: vl1re64.v v17, (a4)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16i64_nxv2i64:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-NEXT: vmv2r.v v28, v22
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: vmv2r.v v26, v18
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 3
+; ZVBB-NEXT: add a1, sp, a1
+; ZVBB-NEXT: addi a1, a1, 16
+; ZVBB-NEXT: csrr a3, vlenb
+; ZVBB-NEXT: add a2, a0, a3
+; ZVBB-NEXT: add a4, a1, a3
+; ZVBB-NEXT: add a5, a2, a3
+; ZVBB-NEXT: vmv1r.v v1, v8
+; ZVBB-NEXT: vmv2r.v v24, v14
+; ZVBB-NEXT: add a6, a4, a3
+; ZVBB-NEXT: vmv2r.v v22, v10
+; ZVBB-NEXT: vmv1r.v v2, v22
+; ZVBB-NEXT: add a7, a5, a3
+; ZVBB-NEXT: vmv1r.v v3, v12
+; ZVBB-NEXT: add t0, a6, a3
+; ZVBB-NEXT: vmv1r.v v4, v24
+; ZVBB-NEXT: add t1, a7, a3
+; ZVBB-NEXT: vmv1r.v v5, v16
+; ZVBB-NEXT: add t2, t0, a3
+; ZVBB-NEXT: vmv1r.v v6, v26
+; ZVBB-NEXT: add t3, t1, a3
+; ZVBB-NEXT: vmv1r.v v7, v20
+; ZVBB-NEXT: add t4, t2, a3
+; ZVBB-NEXT: vmv1r.v v8, v28
+; ZVBB-NEXT: vmv1r.v v22, v9
+; ZVBB-NEXT: add t5, t3, a3
+; ZVBB-NEXT: vmv1r.v v24, v13
+; ZVBB-NEXT: add t6, t4, a3
+; ZVBB-NEXT: vmv1r.v v26, v17
+; ZVBB-NEXT: vsseg8e64.v v1, (a0)
+; ZVBB-NEXT: vmv1r.v v28, v21
+; ZVBB-NEXT: vsseg8e64.v v22, (a1)
+; ZVBB-NEXT: vl1re64.v v14, (t5)
+; ZVBB-NEXT: add t5, t5, a3
+; ZVBB-NEXT: add a3, t6, a3
+; ZVBB-NEXT: vl1re64.v v22, (t6)
+; ZVBB-NEXT: vl1re64.v v15, (t5)
+; ZVBB-NEXT: vl1re64.v v23, (a3)
+; ZVBB-NEXT: vl1re64.v v12, (t1)
+; ZVBB-NEXT: vl1re64.v v20, (t2)
+; ZVBB-NEXT: vl1re64.v v13, (t3)
+; ZVBB-NEXT: vl1re64.v v21, (t4)
+; ZVBB-NEXT: vl1re64.v v10, (a5)
+; ZVBB-NEXT: vl1re64.v v18, (a6)
+; ZVBB-NEXT: vl1re64.v v11, (a7)
+; ZVBB-NEXT: vl1re64.v v19, (t0)
+; ZVBB-NEXT: vl1re64.v v8, (a0)
+; ZVBB-NEXT: vl1re64.v v16, (a1)
+; ZVBB-NEXT: vl1re64.v v9, (a2)
+; ZVBB-NEXT: vl1re64.v v17, (a4)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 16 x i64> @llvm.vector.interleave8.nxv16i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g, <vscale x 2 x i64> %h)
+ ret <vscale x 16 x i64> %res
+}
+
+; Floats
+
+define <vscale x 4 x bfloat> @vector_interleave_nxv4bf16_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; V-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; V-NEXT: vwaddu.vv v10, v8, v9
+; V-NEXT: li a0, -1
+; V-NEXT: csrr a1, vlenb
+; V-NEXT: vwmaccu.vx v10, a0, v9
+; V-NEXT: srli a1, a1, 2
+; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; V-NEXT: vslidedown.vx v8, v10, a1
+; V-NEXT: add a0, a1, a1
+; V-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; V-NEXT: vslideup.vx v10, v8, a1
+; V-NEXT: vmv.v.v v8, v10
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vwsll.vi v10, v9, 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: vwaddu.wv v10, v10, v8
+; ZVBB-NEXT: srli a0, a0, 2
+; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vslidedown.vx v8, v10, a0
+; ZVBB-NEXT: add a1, a0, a0
+; ZVBB-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v10, v8, a0
+; ZVBB-NEXT: vmv.v.v v8, v10
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZIP-NEXT: ri.vzip2b.vv v11, v8, v9
+; ZIP-NEXT: ri.vzip2a.vv v10, v8, v9
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: srli a0, a0, 2
+; ZIP-NEXT: add a1, a0, a0
+; ZIP-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; ZIP-NEXT: vslideup.vx v10, v11, a0
+; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: ret
+ %res = call <vscale x 4 x bfloat> @llvm.vector.interleave2.nxv4bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
+ ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @vector_interleave_nxv8bf16_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; V-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; V-NEXT: vmv1r.v v10, v9
+; V-NEXT: vmv1r.v v11, v8
+; V-NEXT: vwaddu.vv v8, v11, v10
+; V-NEXT: li a0, -1
+; V-NEXT: vwmaccu.vx v8, a0, v10
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vmv1r.v v10, v9
+; ZVBB-NEXT: vmv1r.v v11, v8
+; ZVBB-NEXT: vwsll.vi v8, v10, 16
+; ZVBB-NEXT: vwaddu.wv v8, v8, v11
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: vmv1r.v v10, v9
+; ZIP-NEXT: vmv1r.v v11, v8
+; ZIP-NEXT: ri.vzip2b.vv v9, v8, v10
+; ZIP-NEXT: ri.vzip2a.vv v8, v11, v10
+; ZIP-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.vector.interleave2.nxv8bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
+; V-LABEL: vector_interleave_nxv4f16_nxv2f16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; V-NEXT: vwaddu.vv v10, v8, v9
+; V-NEXT: li a0, -1
+; V-NEXT: csrr a1, vlenb
+; V-NEXT: vwmaccu.vx v10, a0, v9
+; V-NEXT: srli a1, a1, 2
+; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; V-NEXT: vslidedown.vx v8, v10, a1
+; V-NEXT: add a0, a1, a1
+; V-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; V-NEXT: vslideup.vx v10, v8, a1
+; V-NEXT: vmv.v.v v8, v10
+; V-NEXT: ret
;
; ZVBB-LABEL: vector_interleave_nxv4f16_nxv2f16:
; ZVBB: # %bb.0:
-; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; ZVBB-NEXT: vwsll.vi v10, v9, 16
+; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vwsll.vi v10, v9, 16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: vwaddu.wv v10, v10, v8
+; ZVBB-NEXT: srli a0, a0, 2
+; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vslidedown.vx v8, v10, a0
+; ZVBB-NEXT: add a1, a0, a0
+; ZVBB-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v10, v8, a0
+; ZVBB-NEXT: vmv.v.v v8, v10
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv4f16_nxv2f16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZIP-NEXT: ri.vzip2b.vv v11, v8, v9
+; ZIP-NEXT: ri.vzip2a.vv v10, v8, v9
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: srli a0, a0, 2
+; ZIP-NEXT: add a1, a0, a0
+; ZIP-NEXT: vsetvli zero, a1, e16, m1, ta, ma
+; ZIP-NEXT: vslideup.vx v10, v11, a0
+; ZIP-NEXT: vmv.v.v v8, v10
+; ZIP-NEXT: ret
+ %res = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
+ ret <vscale x 4 x half> %res
+}
+
+define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
+; V-LABEL: vector_interleave_nxv8f16_nxv4f16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; V-NEXT: vmv1r.v v10, v9
+; V-NEXT: vmv1r.v v11, v8
+; V-NEXT: vwaddu.vv v8, v11, v10
+; V-NEXT: li a0, -1
+; V-NEXT: vwmaccu.vx v8, a0, v10
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8f16_nxv4f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vmv1r.v v10, v9
+; ZVBB-NEXT: vmv1r.v v11, v8
+; ZVBB-NEXT: vwsll.vi v8, v10, 16
+; ZVBB-NEXT: vwaddu.wv v8, v8, v11
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv8f16_nxv4f16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: vmv1r.v v10, v9
+; ZIP-NEXT: vmv1r.v v11, v8
+; ZIP-NEXT: ri.vzip2b.vv v9, v8, v10
+; ZIP-NEXT: ri.vzip2a.vv v8, v11, v10
+; ZIP-NEXT: ret
+ %res = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
+; V-LABEL: vector_interleave_nxv4f32_nxv2f32:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; V-NEXT: vmv1r.v v10, v9
+; V-NEXT: vmv1r.v v11, v8
+; V-NEXT: vwaddu.vv v8, v11, v10
+; V-NEXT: li a0, -1
+; V-NEXT: vwmaccu.vx v8, a0, v10
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv4f32_nxv2f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-NEXT: vmv1r.v v10, v9
+; ZVBB-NEXT: vmv1r.v v11, v8
+; ZVBB-NEXT: li a0, 32
+; ZVBB-NEXT: vwsll.vx v8, v10, a0
+; ZVBB-NEXT: vwaddu.wv v8, v8, v11
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv4f32_nxv2f32:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZIP-NEXT: vmv1r.v v10, v9
+; ZIP-NEXT: vmv1r.v v11, v8
+; ZIP-NEXT: ri.vzip2b.vv v9, v8, v10
+; ZIP-NEXT: ri.vzip2a.vv v8, v11, v10
+; ZIP-NEXT: ret
+ %res = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 16 x bfloat> @vector_interleave_nxv16bf16_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; V-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; V-NEXT: vmv2r.v v12, v10
+; V-NEXT: vmv2r.v v14, v8
+; V-NEXT: vwaddu.vv v8, v14, v12
+; V-NEXT: li a0, -1
+; V-NEXT: vwmaccu.vx v8, a0, v12
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVBB-NEXT: vmv2r.v v12, v10
+; ZVBB-NEXT: vmv2r.v v14, v8
+; ZVBB-NEXT: vwsll.vi v8, v12, 16
+; ZVBB-NEXT: vwaddu.wv v8, v8, v14
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZIP-NEXT: vmv2r.v v12, v10
+; ZIP-NEXT: vmv2r.v v14, v8
+; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12
+; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12
+; ZIP-NEXT: ret
+ %res = call <vscale x 16 x bfloat> @llvm.vector.interleave2.nxv16bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+ ret <vscale x 16 x bfloat> %res
+}
+
+define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; V-LABEL: vector_interleave_nxv16f16_nxv8f16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; V-NEXT: vmv2r.v v12, v10
+; V-NEXT: vmv2r.v v14, v8
+; V-NEXT: vwaddu.vv v8, v14, v12
+; V-NEXT: li a0, -1
+; V-NEXT: vwmaccu.vx v8, a0, v12
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16f16_nxv8f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVBB-NEXT: vmv2r.v v12, v10
+; ZVBB-NEXT: vmv2r.v v14, v8
+; ZVBB-NEXT: vwsll.vi v8, v12, 16
+; ZVBB-NEXT: vwaddu.wv v8, v8, v14
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv16f16_nxv8f16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZIP-NEXT: vmv2r.v v12, v10
+; ZIP-NEXT: vmv2r.v v14, v8
+; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12
+; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12
+; ZIP-NEXT: ret
+ %res = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 16 x half> %res
+}
+
+define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; V-LABEL: vector_interleave_nxv8f32_nxv4f32:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; V-NEXT: vmv2r.v v12, v10
+; V-NEXT: vmv2r.v v14, v8
+; V-NEXT: vwaddu.vv v8, v14, v12
+; V-NEXT: li a0, -1
+; V-NEXT: vwmaccu.vx v8, a0, v12
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8f32_nxv4f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; ZVBB-NEXT: vmv2r.v v12, v10
+; ZVBB-NEXT: vmv2r.v v14, v8
+; ZVBB-NEXT: li a0, 32
+; ZVBB-NEXT: vwsll.vx v8, v12, a0
+; ZVBB-NEXT: vwaddu.wv v8, v8, v14
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv8f32_nxv4f32:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; ZIP-NEXT: vmv2r.v v12, v10
+; ZIP-NEXT: vmv2r.v v14, v8
+; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12
+; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12
+; ZIP-NEXT: ret
+ %res = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+ ret <vscale x 8 x float> %res
+}
+
+define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; V-LABEL: vector_interleave_nxv4f64_nxv2f64:
+; V: # %bb.0:
+; V-NEXT: csrr a0, vlenb
+; V-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; V-NEXT: vid.v v12
+; V-NEXT: srli a0, a0, 2
+; V-NEXT: vand.vi v13, v12, 1
+; V-NEXT: vmsne.vi v0, v13, 0
+; V-NEXT: vsrl.vi v16, v12, 1
+; V-NEXT: vadd.vx v16, v16, a0, v0.t
+; V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; V-NEXT: vrgatherei16.vv v12, v8, v16
+; V-NEXT: vmv.v.v v8, v12
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv4f64_nxv2f64:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, mu
+; ZVBB-NEXT: vid.v v12
+; ZVBB-NEXT: srli a0, a0, 2
+; ZVBB-NEXT: vand.vi v13, v12, 1
+; ZVBB-NEXT: vmsne.vi v0, v13, 0
+; ZVBB-NEXT: vsrl.vi v16, v12, 1
+; ZVBB-NEXT: vadd.vx v16, v16, a0, v0.t
+; ZVBB-NEXT: vsetvli zero, zero, e64, m4, ta, ma
+; ZVBB-NEXT: vrgatherei16.vv v12, v8, v16
+; ZVBB-NEXT: vmv.v.v v8, v12
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv4f64_nxv2f64:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e64, m2, ta, ma
+; ZIP-NEXT: vmv2r.v v12, v10
+; ZIP-NEXT: vmv2r.v v14, v8
+; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12
+; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12
+; ZIP-NEXT: ret
+ %res = call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+ ret <vscale x 4 x double> %res
+}
+
+
+
+define <vscale x 64 x bfloat> @vector_interleave_nxv64bf16_nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) {
+; V-LABEL: vector_interleave_nxv64bf16_nxv32bf16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; V-NEXT: vmv8r.v v24, v8
+; V-NEXT: vwaddu.vv v8, v24, v16
+; V-NEXT: li a0, -1
+; V-NEXT: vwaddu.vv v0, v28, v20
+; V-NEXT: vwmaccu.vx v8, a0, v16
+; V-NEXT: vwmaccu.vx v0, a0, v20
+; V-NEXT: vmv8r.v v16, v0
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64bf16_nxv32bf16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVBB-NEXT: vwsll.vi v24, v16, 16
+; ZVBB-NEXT: vwsll.vi v0, v20, 16
+; ZVBB-NEXT: vwaddu.wv v24, v24, v8
+; ZVBB-NEXT: vwaddu.wv v0, v0, v12
+; ZVBB-NEXT: vmv8r.v v8, v24
+; ZVBB-NEXT: vmv8r.v v16, v0
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv64bf16_nxv32bf16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16
+; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20
+; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16
+; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20
+; ZIP-NEXT: vmv8r.v v8, v24
+; ZIP-NEXT: vmv8r.v v16, v0
+; ZIP-NEXT: ret
+ %res = call <vscale x 64 x bfloat> @llvm.vector.interleave2.nxv64bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
+ ret <vscale x 64 x bfloat> %res
+}
+
+define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b) {
+; V-LABEL: vector_interleave_nxv64f16_nxv32f16:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; V-NEXT: vmv8r.v v24, v8
+; V-NEXT: vwaddu.vv v8, v24, v16
+; V-NEXT: li a0, -1
+; V-NEXT: vwaddu.vv v0, v28, v20
+; V-NEXT: vwmaccu.vx v8, a0, v16
+; V-NEXT: vwmaccu.vx v0, a0, v20
+; V-NEXT: vmv8r.v v16, v0
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64f16_nxv32f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVBB-NEXT: vwsll.vi v24, v16, 16
+; ZVBB-NEXT: vwsll.vi v0, v20, 16
+; ZVBB-NEXT: vwaddu.wv v24, v24, v8
+; ZVBB-NEXT: vwaddu.wv v0, v0, v12
+; ZVBB-NEXT: vmv8r.v v8, v24
+; ZVBB-NEXT: vmv8r.v v16, v0
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv64f16_nxv32f16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16
+; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20
+; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16
+; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20
+; ZIP-NEXT: vmv8r.v v8, v24
+; ZIP-NEXT: vmv8r.v v16, v0
+; ZIP-NEXT: ret
+ %res = call <vscale x 64 x half> @llvm.vector.interleave2.nxv64f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
+ ret <vscale x 64 x half> %res
+}
+
+define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b) {
+; V-LABEL: vector_interleave_nxv32f32_nxv16f32:
+; V: # %bb.0:
+; V-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; V-NEXT: vmv8r.v v24, v8
+; V-NEXT: vwaddu.vv v8, v24, v16
+; V-NEXT: li a0, -1
+; V-NEXT: vwaddu.vv v0, v28, v20
+; V-NEXT: vwmaccu.vx v8, a0, v16
+; V-NEXT: vwmaccu.vx v0, a0, v20
+; V-NEXT: vmv8r.v v16, v0
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv32f32_nxv16f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: li a0, 32
+; ZVBB-NEXT: vsetvli a1, zero, e32, m4, ta, ma
+; ZVBB-NEXT: vwsll.vx v24, v16, a0
+; ZVBB-NEXT: vwsll.vx v0, v20, a0
+; ZVBB-NEXT: vwaddu.wv v24, v24, v8
+; ZVBB-NEXT: vwaddu.wv v0, v0, v12
+; ZVBB-NEXT: vmv8r.v v8, v24
+; ZVBB-NEXT: vmv8r.v v16, v0
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv32f32_nxv16f32:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e32, m4, ta, ma
+; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16
+; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20
+; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16
+; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20
+; ZIP-NEXT: vmv8r.v v8, v24
+; ZIP-NEXT: vmv8r.v v16, v0
+; ZIP-NEXT: ret
+ %res = call <vscale x 32 x float> @llvm.vector.interleave2.nxv32f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b)
+ ret <vscale x 32 x float> %res
+}
+
+define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b) {
+; V-LABEL: vector_interleave_nxv16f64_nxv8f64:
+; V: # %bb.0:
+; V-NEXT: csrr a0, vlenb
+; V-NEXT: vsetvli a1, zero, e16, m2, ta, mu
+; V-NEXT: vid.v v6
+; V-NEXT: vmv8r.v v24, v8
+; V-NEXT: srli a0, a0, 1
+; V-NEXT: vmv4r.v v28, v16
+; V-NEXT: vmv4r.v v16, v12
+; V-NEXT: vand.vi v8, v6, 1
+; V-NEXT: vmsne.vi v0, v8, 0
+; V-NEXT: vsrl.vi v6, v6, 1
+; V-NEXT: vadd.vx v6, v6, a0, v0.t
+; V-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; V-NEXT: vrgatherei16.vv v8, v24, v6
+; V-NEXT: vrgatherei16.vv v24, v16, v6
+; V-NEXT: vmv.v.v v16, v24
+; V-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16f64_nxv8f64:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: vsetvli a1, zero, e16, m2, ta, mu
+; ZVBB-NEXT: vid.v v6
+; ZVBB-NEXT: vmv8r.v v24, v8
+; ZVBB-NEXT: srli a0, a0, 1
+; ZVBB-NEXT: vmv4r.v v28, v16
+; ZVBB-NEXT: vmv4r.v v16, v12
+; ZVBB-NEXT: vand.vi v8, v6, 1
+; ZVBB-NEXT: vmsne.vi v0, v8, 0
+; ZVBB-NEXT: vsrl.vi v6, v6, 1
+; ZVBB-NEXT: vadd.vx v6, v6, a0, v0.t
+; ZVBB-NEXT: vsetvli zero, zero, e64, m8, ta, ma
+; ZVBB-NEXT: vrgatherei16.vv v8, v24, v6
+; ZVBB-NEXT: vrgatherei16.vv v24, v16, v6
+; ZVBB-NEXT: vmv.v.v v16, v24
+; ZVBB-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv16f64_nxv8f64:
+; ZIP: # %bb.0:
+; ZIP-NEXT: vsetvli a0, zero, e64, m4, ta, ma
+; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16
+; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20
+; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16
+; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20
+; ZIP-NEXT: vmv8r.v v8, v24
+; ZIP-NEXT: vmv8r.v v16, v0
+; ZIP-NEXT: ret
+ %res = call <vscale x 16 x double> @llvm.vector.interleave2.nxv16f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b)
+ ret <vscale x 16 x double> %res
+}
+
+define <vscale x 6 x half> @vector_interleave_nxv6f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv6f16_nxv2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 1
+; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsseg3e16.v v8, (a0)
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: vle16.v v9, (a3)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a0, a1, a1
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a1
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v9, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv6f16_nxv2f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a2, a1, 1
+; ZVBB-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vsseg3e16.v v8, (a0)
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: vle16.v v9, (a3)
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a0, a1, a1
+; ZVBB-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v9, a1
+; ZVBB-NEXT: add a2, a3, a2
+; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v9, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 6 x half> @llvm.vector.interleave3.nxv6f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2)
+ ret <vscale x 6 x half> %res
+}
+
+define <vscale x 12 x half> @vector_interleave_nxv12f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv12f16_nxv4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsseg3e16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v8, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl1re16.v v9, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl1re16.v v10, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv12f16_nxv4f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vsseg3e16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v8, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl1re16.v v9, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl1re16.v v10, (a0)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 12 x half> @llvm.vector.interleave3.nxv12f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2)
+ ret <vscale x 12 x half> %res
+}
+
+define <vscale x 24 x half> @vector_interleave_nxv24f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv24f16_nxv8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsseg3e16.v v8, (a0)
+; CHECK-NEXT: vl2re16.v v8, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2re16.v v10, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2re16.v v12, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv24f16_nxv8f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 1
+; ZVBB-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; ZVBB-NEXT: vsseg3e16.v v8, (a0)
+; ZVBB-NEXT: vl2re16.v v8, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2re16.v v10, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2re16.v v12, (a0)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 24 x half> @llvm.vector.interleave3.nxv24f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2)
+ ret <vscale x 24 x half> %res
+}
+
+define <vscale x 6 x bfloat> @vector_interleave_nxv6bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv6bf16_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 1
+; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsseg3e16.v v8, (a0)
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: vle16.v v9, (a3)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a0, a1, a1
+; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a1
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v9, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv6bf16_nxv2bf16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a2, a1, 1
+; ZVBB-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vsseg3e16.v v8, (a0)
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: vle16.v v9, (a3)
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a0, a1, a1
+; ZVBB-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v9, a1
+; ZVBB-NEXT: add a2, a3, a2
+; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v9, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 6 x bfloat> @llvm.vector.interleave3.nxv6bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2)
+ ret <vscale x 6 x bfloat> %res
+}
+
+define <vscale x 12 x bfloat> @vector_interleave_nxv12bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv12bf16_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsseg3e16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v8, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl1re16.v v9, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl1re16.v v10, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv12bf16_nxv4bf16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: vsetvli a2, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vsseg3e16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v8, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl1re16.v v9, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl1re16.v v10, (a0)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 12 x bfloat> @llvm.vector.interleave3.nxv12bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2)
+ ret <vscale x 12 x bfloat> %res
+}
+
+define <vscale x 24 x bfloat> @vector_interleave_nxv24bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv24bf16_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsseg3e16.v v8, (a0)
+; CHECK-NEXT: vl2re16.v v8, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2re16.v v10, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2re16.v v12, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv24bf16_nxv8bf16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 1
+; ZVBB-NEXT: vsetvli a2, zero, e16, m2, ta, ma
+; ZVBB-NEXT: vsseg3e16.v v8, (a0)
+; ZVBB-NEXT: vl2re16.v v8, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2re16.v v10, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2re16.v v12, (a0)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 24 x bfloat> @llvm.vector.interleave3.nxv24bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2)
+ ret <vscale x 24 x bfloat> %res
+}
+
+define <vscale x 3 x float> @vector_interleave_nxv3f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv3f32_nxv1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 1
+; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsseg3e32.v v8, (a0)
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: vle32.v v9, (a3)
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: srli a1, a1, 3
+; CHECK-NEXT: add a0, a1, a1
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v9, a1
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vle32.v v9, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv3f32_nxv1f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a2, a1, 1
+; ZVBB-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vsseg3e32.v v8, (a0)
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: vle32.v v9, (a3)
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: srli a1, a1, 3
+; ZVBB-NEXT: add a0, a1, a1
+; ZVBB-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v9, a1
+; ZVBB-NEXT: add a2, a3, a2
+; ZVBB-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vle32.v v9, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 3 x float> @llvm.vector.interleave3.nxv3f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2)
+ ret <vscale x 3 x float> %res
+}
+
+define <vscale x 6 x float> @vector_interleave_nxv6f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv6f32_nxv2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg3e32.v v8, (a0)
+; CHECK-NEXT: vl1re32.v v8, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl1re32.v v9, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl1re32.v v10, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv6f32_nxv2f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: vsetvli a2, zero, e32, m1, ta, ma
+; ZVBB-NEXT: vsseg3e32.v v8, (a0)
+; ZVBB-NEXT: vl1re32.v v8, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl1re32.v v9, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl1re32.v v10, (a0)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 6 x float> @llvm.vector.interleave3.nxv6f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2)
+ ret <vscale x 6 x float> %res
+}
+
+define <vscale x 12 x float> @vector_interleave_nxv12f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv12f32_nxv4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: vsetvli a2, zero, e32, m2, ta, ma
+; CHECK-NEXT: vsseg3e32.v v8, (a0)
+; CHECK-NEXT: vl2re32.v v8, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2re32.v v10, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2re32.v v12, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv12f32_nxv4f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 1
+; ZVBB-NEXT: vsetvli a2, zero, e32, m2, ta, ma
+; ZVBB-NEXT: vsseg3e32.v v8, (a0)
+; ZVBB-NEXT: vl2re32.v v8, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2re32.v v10, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2re32.v v12, (a0)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 12 x float> @llvm.vector.interleave3.nxv12f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2)
+ ret <vscale x 12 x float> %res
+}
+
+define <vscale x 3 x double> @vector_interleave_nxv3f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv3f64_nxv1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; CHECK-NEXT: vsseg3e64.v v8, (a0)
+; CHECK-NEXT: vl1re64.v v8, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl1re64.v v9, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl1re64.v v10, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv3f64_nxv1f64:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: vsetvli a2, zero, e64, m1, ta, ma
+; ZVBB-NEXT: vsseg3e64.v v8, (a0)
+; ZVBB-NEXT: vl1re64.v v8, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl1re64.v v9, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl1re64.v v10, (a0)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 3 x double> @llvm.vector.interleave3.nxv3f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2)
+ ret <vscale x 3 x double> %res
+}
+
+define <vscale x 6 x double> @vector_interleave_nxv6f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv6f64_nxv2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; CHECK-NEXT: vsseg3e64.v v8, (a0)
+; CHECK-NEXT: vl2re64.v v8, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2re64.v v10, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl2re64.v v12, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv6f64_nxv2f64:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 1
+; ZVBB-NEXT: vsetvli a2, zero, e64, m2, ta, ma
+; ZVBB-NEXT: vsseg3e64.v v8, (a0)
+; ZVBB-NEXT: vl2re64.v v8, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2re64.v v10, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl2re64.v v12, (a0)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 6 x double> @llvm.vector.interleave3.nxv6f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2)
+ ret <vscale x 6 x double> %res
+}
+
+define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv8f16_nxv2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 1
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsseg4e16.v v8, (a0)
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: vle16.v v9, (a4)
+; CHECK-NEXT: vle16.v v8, (a2)
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a1
+; CHECK-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v10, (a3)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a1
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8f16_nxv2f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a2, a1, 1
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vsseg4e16.v v8, (a0)
+; ZVBB-NEXT: add a4, a3, a2
+; ZVBB-NEXT: add a2, a4, a2
+; ZVBB-NEXT: vle16.v v9, (a4)
+; ZVBB-NEXT: vle16.v v8, (a2)
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a2, a1, a1
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v9, v8, a1
+; ZVBB-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v10, (a3)
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v10, a1
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 8 x half> @llvm.vector.interleave4.nxv8f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3)
+ ret <vscale x 8 x half> %res
+}
+
+define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv16f16_nxv4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsseg4e16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v10, (a3)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl1re16.v v11, (a1)
+; CHECK-NEXT: vl1re16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v9, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16f16_nxv4f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vsseg4e16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v10, (a3)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl1re16.v v11, (a1)
+; ZVBB-NEXT: vl1re16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v9, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 16 x half> @llvm.vector.interleave4.nxv16f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3)
+ ret <vscale x 16 x half> %res
+}
+
+define <vscale x 32 x half> @vector_interleave_nxv32f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv32f16_nxv8f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsseg4e16.v v8, (a0)
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vl2re16.v v12, (a3)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl2re16.v v14, (a1)
+; CHECK-NEXT: vl2re16.v v8, (a0)
+; CHECK-NEXT: vl2re16.v v10, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv32f16_nxv8f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 1
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: vsetvli a3, zero, e16, m2, ta, ma
+; ZVBB-NEXT: vsseg4e16.v v8, (a0)
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vl2re16.v v12, (a3)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl2re16.v v14, (a1)
+; ZVBB-NEXT: vl2re16.v v8, (a0)
+; ZVBB-NEXT: vl2re16.v v10, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 32 x half> @llvm.vector.interleave4.nxv32f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3)
+ ret <vscale x 32 x half> %res
+}
+
+define <vscale x 8 x bfloat> @vector_interleave_nxv8bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv8bf16_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 1
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsseg4e16.v v8, (a0)
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: vle16.v v9, (a4)
+; CHECK-NEXT: vle16.v v8, (a2)
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a1
+; CHECK-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v10, (a3)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a1
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8bf16_nxv2bf16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a2, a1, 1
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vsseg4e16.v v8, (a0)
+; ZVBB-NEXT: add a4, a3, a2
+; ZVBB-NEXT: add a2, a4, a2
+; ZVBB-NEXT: vle16.v v9, (a4)
+; ZVBB-NEXT: vle16.v v8, (a2)
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a2, a1, a1
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v9, v8, a1
+; ZVBB-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v10, (a3)
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v10, a1
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 8 x bfloat> @llvm.vector.interleave4.nxv8bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3)
+ ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 16 x bfloat> @vector_interleave_nxv16bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv16bf16_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsseg4e16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v10, (a3)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl1re16.v v11, (a1)
+; CHECK-NEXT: vl1re16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v9, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16bf16_nxv4bf16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: vwaddu.wv v10, v10, v8
-; ZVBB-NEXT: srli a0, a0, 2
-; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, ma
-; ZVBB-NEXT: vslidedown.vx v8, v10, a0
-; ZVBB-NEXT: add a1, a0, a0
-; ZVBB-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; ZVBB-NEXT: vslideup.vx v10, v8, a0
-; ZVBB-NEXT: vmv.v.v v8, v10
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vsseg4e16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v10, (a3)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl1re16.v v11, (a1)
+; ZVBB-NEXT: vl1re16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v9, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
+ %res = call <vscale x 16 x bfloat> @llvm.vector.interleave4.nxv16bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3)
+ ret <vscale x 16 x bfloat> %res
+}
+
+define <vscale x 32 x bfloat> @vector_interleave_nxv32bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv32bf16_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: vsetvli a3, zero, e16, m2, ta, ma
+; CHECK-NEXT: vsseg4e16.v v8, (a0)
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vl2re16.v v12, (a3)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl2re16.v v14, (a1)
+; CHECK-NEXT: vl2re16.v v8, (a0)
+; CHECK-NEXT: vl2re16.v v10, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv4f16_nxv2f16:
-; ZIP: # %bb.0:
-; ZIP-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; ZIP-NEXT: ri.vzip2b.vv v11, v8, v9
-; ZIP-NEXT: ri.vzip2a.vv v10, v8, v9
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: srli a0, a0, 2
-; ZIP-NEXT: add a1, a0, a0
-; ZIP-NEXT: vsetvli zero, a1, e16, m1, ta, ma
-; ZIP-NEXT: vslideup.vx v10, v11, a0
-; ZIP-NEXT: vmv.v.v v8, v10
-; ZIP-NEXT: ret
- %res = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
- ret <vscale x 4 x half> %res
+; ZVBB-LABEL: vector_interleave_nxv32bf16_nxv8bf16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 1
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: vsetvli a3, zero, e16, m2, ta, ma
+; ZVBB-NEXT: vsseg4e16.v v8, (a0)
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vl2re16.v v12, (a3)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl2re16.v v14, (a1)
+; ZVBB-NEXT: vl2re16.v v8, (a0)
+; ZVBB-NEXT: vl2re16.v v10, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 32 x bfloat> @llvm.vector.interleave4.nxv32bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3)
+ ret <vscale x 32 x bfloat> %res
}
-define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
-; V-LABEL: vector_interleave_nxv8f16_nxv4f16:
-; V: # %bb.0:
-; V-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; V-NEXT: vmv1r.v v10, v9
-; V-NEXT: vmv1r.v v11, v8
-; V-NEXT: vwaddu.vv v8, v11, v10
-; V-NEXT: li a0, -1
-; V-NEXT: vwmaccu.vx v8, a0, v10
-; V-NEXT: ret
+define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv4f32_nxv1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 1
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: vsetvli a4, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsseg4e32.v v8, (a0)
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: add a2, a4, a2
+; CHECK-NEXT: vle32.v v9, (a4)
+; CHECK-NEXT: vle32.v v8, (a2)
+; CHECK-NEXT: srli a1, a1, 3
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a1
+; CHECK-NEXT: vsetvli a4, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vle32.v v10, (a3)
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a1
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv8f16_nxv4f16:
+; ZVBB-LABEL: vector_interleave_nxv4f32_nxv1f32:
; ZVBB: # %bb.0:
-; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-NEXT: vmv1r.v v10, v9
-; ZVBB-NEXT: vmv1r.v v11, v8
-; ZVBB-NEXT: vwsll.vi v8, v10, 16
-; ZVBB-NEXT: vwaddu.wv v8, v8, v11
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a2, a1, 1
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: vsetvli a4, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vsseg4e32.v v8, (a0)
+; ZVBB-NEXT: add a4, a3, a2
+; ZVBB-NEXT: add a2, a4, a2
+; ZVBB-NEXT: vle32.v v9, (a4)
+; ZVBB-NEXT: vle32.v v8, (a2)
+; ZVBB-NEXT: srli a1, a1, 3
+; ZVBB-NEXT: add a2, a1, a1
+; ZVBB-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v9, v8, a1
+; ZVBB-NEXT: vsetvli a4, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vle32.v v10, (a3)
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v10, a1
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
+ %res = call <vscale x 4 x float> @llvm.vector.interleave4.nxv4f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3)
+ ret <vscale x 4 x float> %res
+}
+
+define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv8f32_nxv2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg4e32.v v8, (a0)
+; CHECK-NEXT: vl1re32.v v10, (a3)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl1re32.v v11, (a1)
+; CHECK-NEXT: vl1re32.v v8, (a0)
+; CHECK-NEXT: vl1re32.v v9, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv8f16_nxv4f16:
-; ZIP: # %bb.0:
-; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZIP-NEXT: vmv1r.v v10, v9
-; ZIP-NEXT: vmv1r.v v11, v8
-; ZIP-NEXT: ri.vzip2b.vv v9, v8, v10
-; ZIP-NEXT: ri.vzip2a.vv v8, v11, v10
-; ZIP-NEXT: ret
- %res = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
- ret <vscale x 8 x half> %res
+; ZVBB-LABEL: vector_interleave_nxv8f32_nxv2f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e32, m1, ta, ma
+; ZVBB-NEXT: vsseg4e32.v v8, (a0)
+; ZVBB-NEXT: vl1re32.v v10, (a3)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl1re32.v v11, (a1)
+; ZVBB-NEXT: vl1re32.v v8, (a0)
+; ZVBB-NEXT: vl1re32.v v9, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 8 x float> @llvm.vector.interleave4.nxv8f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3)
+ ret <vscale x 8 x float> %res
}
-define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
-; V-LABEL: vector_interleave_nxv4f32_nxv2f32:
-; V: # %bb.0:
-; V-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; V-NEXT: vmv1r.v v10, v9
-; V-NEXT: vmv1r.v v11, v8
-; V-NEXT: vwaddu.vv v8, v11, v10
-; V-NEXT: li a0, -1
-; V-NEXT: vwmaccu.vx v8, a0, v10
-; V-NEXT: ret
+define <vscale x 16 x float> @vector_interleave_nxv16f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv16f32_nxv4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: vsetvli a3, zero, e32, m2, ta, ma
+; CHECK-NEXT: vsseg4e32.v v8, (a0)
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vl2re32.v v12, (a3)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl2re32.v v14, (a1)
+; CHECK-NEXT: vl2re32.v v8, (a0)
+; CHECK-NEXT: vl2re32.v v10, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv4f32_nxv2f32:
+; ZVBB-LABEL: vector_interleave_nxv16f32_nxv4f32:
; ZVBB: # %bb.0:
-; ZVBB-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; ZVBB-NEXT: vmv1r.v v10, v9
-; ZVBB-NEXT: vmv1r.v v11, v8
-; ZVBB-NEXT: li a0, 32
-; ZVBB-NEXT: vwsll.vx v8, v10, a0
-; ZVBB-NEXT: vwaddu.wv v8, v8, v11
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 1
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: vsetvli a3, zero, e32, m2, ta, ma
+; ZVBB-NEXT: vsseg4e32.v v8, (a0)
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vl2re32.v v12, (a3)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl2re32.v v14, (a1)
+; ZVBB-NEXT: vl2re32.v v8, (a0)
+; ZVBB-NEXT: vl2re32.v v10, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
-;
-; ZIP-LABEL: vector_interleave_nxv4f32_nxv2f32:
-; ZIP: # %bb.0:
-; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; ZIP-NEXT: vmv1r.v v10, v9
-; ZIP-NEXT: vmv1r.v v11, v8
-; ZIP-NEXT: ri.vzip2b.vv v9, v8, v10
-; ZIP-NEXT: ri.vzip2a.vv v8, v11, v10
-; ZIP-NEXT: ret
- %res = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
- ret <vscale x 4 x float> %res
+ %res = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3)
+ ret <vscale x 16 x float> %res
}
-define <vscale x 16 x bfloat> @vector_interleave_nxv16bf16_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
-; V-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
-; V: # %bb.0:
-; V-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; V-NEXT: vmv2r.v v12, v10
-; V-NEXT: vmv2r.v v14, v8
-; V-NEXT: vwaddu.vv v8, v14, v12
-; V-NEXT: li a0, -1
-; V-NEXT: vwmaccu.vx v8, a0, v12
-; V-NEXT: ret
+define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv4f64_nxv1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e64, m1, ta, ma
+; CHECK-NEXT: vsseg4e64.v v8, (a0)
+; CHECK-NEXT: vl1re64.v v10, (a3)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl1re64.v v11, (a1)
+; CHECK-NEXT: vl1re64.v v8, (a0)
+; CHECK-NEXT: vl1re64.v v9, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
+; ZVBB-LABEL: vector_interleave_nxv4f64_nxv1f64:
; ZVBB: # %bb.0:
-; ZVBB-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; ZVBB-NEXT: vmv2r.v v12, v10
-; ZVBB-NEXT: vmv2r.v v14, v8
-; ZVBB-NEXT: vwsll.vi v8, v12, 16
-; ZVBB-NEXT: vwaddu.wv v8, v8, v14
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e64, m1, ta, ma
+; ZVBB-NEXT: vsseg4e64.v v8, (a0)
+; ZVBB-NEXT: vl1re64.v v10, (a3)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl1re64.v v11, (a1)
+; ZVBB-NEXT: vl1re64.v v8, (a0)
+; ZVBB-NEXT: vl1re64.v v9, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
-;
-; ZIP-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
-; ZIP: # %bb.0:
-; ZIP-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; ZIP-NEXT: vmv2r.v v12, v10
-; ZIP-NEXT: vmv2r.v v14, v8
-; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12
-; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12
-; ZIP-NEXT: ret
- %res = call <vscale x 16 x bfloat> @llvm.vector.interleave2.nxv16bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
- ret <vscale x 16 x bfloat> %res
+ %res = call <vscale x 4 x double> @llvm.vector.interleave4.nxv4f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3)
+ ret <vscale x 4 x double> %res
}
-define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
-; V-LABEL: vector_interleave_nxv16f16_nxv8f16:
-; V: # %bb.0:
-; V-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; V-NEXT: vmv2r.v v12, v10
-; V-NEXT: vmv2r.v v14, v8
-; V-NEXT: vwaddu.vv v8, v14, v12
-; V-NEXT: li a0, -1
-; V-NEXT: vwmaccu.vx v8, a0, v12
-; V-NEXT: ret
+define <vscale x 8 x double> @vector_interleave_nxv8f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv8f64_nxv2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 1
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: vsetvli a3, zero, e64, m2, ta, ma
+; CHECK-NEXT: vsseg4e64.v v8, (a0)
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vl2re64.v v12, (a3)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl2re64.v v14, (a1)
+; CHECK-NEXT: vl2re64.v v8, (a0)
+; CHECK-NEXT: vl2re64.v v10, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv16f16_nxv8f16:
+; ZVBB-LABEL: vector_interleave_nxv8f64_nxv2f64:
; ZVBB: # %bb.0:
-; ZVBB-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; ZVBB-NEXT: vmv2r.v v12, v10
-; ZVBB-NEXT: vmv2r.v v14, v8
-; ZVBB-NEXT: vwsll.vi v8, v12, 16
-; ZVBB-NEXT: vwaddu.wv v8, v8, v14
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 1
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: vsetvli a3, zero, e64, m2, ta, ma
+; ZVBB-NEXT: vsseg4e64.v v8, (a0)
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vl2re64.v v12, (a3)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl2re64.v v14, (a1)
+; ZVBB-NEXT: vl2re64.v v8, (a0)
+; ZVBB-NEXT: vl2re64.v v10, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
-;
-; ZIP-LABEL: vector_interleave_nxv16f16_nxv8f16:
-; ZIP: # %bb.0:
-; ZIP-NEXT: vsetvli a0, zero, e16, m2, ta, ma
-; ZIP-NEXT: vmv2r.v v12, v10
-; ZIP-NEXT: vmv2r.v v14, v8
-; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12
-; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12
-; ZIP-NEXT: ret
- %res = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
- ret <vscale x 16 x half> %res
+ %res = call <vscale x 8 x double> @llvm.vector.interleave4.nxv8f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3)
+ ret <vscale x 8 x double> %res
}
-define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
-; V-LABEL: vector_interleave_nxv8f32_nxv4f32:
-; V: # %bb.0:
-; V-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; V-NEXT: vmv2r.v v12, v10
-; V-NEXT: vmv2r.v v14, v8
-; V-NEXT: vwaddu.vv v8, v14, v12
-; V-NEXT: li a0, -1
-; V-NEXT: vwmaccu.vx v8, a0, v12
-; V-NEXT: ret
+define <vscale x 10 x half> @vector_interleave_nxv10f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4) nounwind {
+; CHECK-LABEL: vector_interleave_nxv10f16_nxv2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 1
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsseg5e16.v v8, (a0)
+; CHECK-NEXT: add a5, a4, a2
+; CHECK-NEXT: vle16.v v8, (a5)
+; CHECK-NEXT: vle16.v v9, (a4)
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a4, a1, a1
+; CHECK-NEXT: vle16.v v10, (a3)
+; CHECK-NEXT: vsetvli zero, a4, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a1
+; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vsetvli zero, a4, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a1
+; CHECK-NEXT: add a2, a5, a2
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v10, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv8f32_nxv4f32:
+; ZVBB-LABEL: vector_interleave_nxv10f16_nxv2f16:
; ZVBB: # %bb.0:
-; ZVBB-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; ZVBB-NEXT: vmv2r.v v12, v10
-; ZVBB-NEXT: vmv2r.v v14, v8
-; ZVBB-NEXT: li a0, 32
-; ZVBB-NEXT: vwsll.vx v8, v12, a0
-; ZVBB-NEXT: vwaddu.wv v8, v8, v14
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a2, a1, 1
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: add a4, a3, a2
+; ZVBB-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vsseg5e16.v v8, (a0)
+; ZVBB-NEXT: add a5, a4, a2
+; ZVBB-NEXT: vle16.v v8, (a5)
+; ZVBB-NEXT: vle16.v v9, (a4)
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a4, a1, a1
+; ZVBB-NEXT: vle16.v v10, (a3)
+; ZVBB-NEXT: vsetvli zero, a4, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v9, v8, a1
+; ZVBB-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: vsetvli zero, a4, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v10, a1
+; ZVBB-NEXT: add a2, a5, a2
+; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v10, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
+ %res = call <vscale x 10 x half> @llvm.vector.interleave5.nxv10f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4)
+ ret <vscale x 10 x half> %res
+}
+
+define <vscale x 20 x half> @vector_interleave_nxv20f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4) nounwind {
+; CHECK-LABEL: vector_interleave_nxv20f16_nxv4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 2
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsseg5e16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v10, (a3)
+; CHECK-NEXT: add a3, a3, a1
+; CHECK-NEXT: vl1re16.v v11, (a3)
+; CHECK-NEXT: vl1re16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v9, (a2)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl1re16.v v12, (a1)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 2
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv8f32_nxv4f32:
-; ZIP: # %bb.0:
-; ZIP-NEXT: vsetvli a0, zero, e32, m2, ta, ma
-; ZIP-NEXT: vmv2r.v v12, v10
-; ZIP-NEXT: vmv2r.v v14, v8
-; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12
-; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12
-; ZIP-NEXT: ret
- %res = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
- ret <vscale x 8 x float> %res
-}
-
-define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
-; V-LABEL: vector_interleave_nxv4f64_nxv2f64:
-; V: # %bb.0:
-; V-NEXT: csrr a0, vlenb
-; V-NEXT: vsetvli a1, zero, e16, m1, ta, mu
-; V-NEXT: vid.v v12
-; V-NEXT: srli a0, a0, 2
-; V-NEXT: vand.vi v13, v12, 1
-; V-NEXT: vmsne.vi v0, v13, 0
-; V-NEXT: vsrl.vi v16, v12, 1
-; V-NEXT: vadd.vx v16, v16, a0, v0.t
-; V-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; V-NEXT: vrgatherei16.vv v12, v8, v16
-; V-NEXT: vmv.v.v v8, v12
-; V-NEXT: ret
-;
-; ZVBB-LABEL: vector_interleave_nxv4f64_nxv2f64:
+; ZVBB-LABEL: vector_interleave_nxv20f16_nxv4f16:
; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: vsetvli a1, zero, e16, m1, ta, mu
-; ZVBB-NEXT: vid.v v12
-; ZVBB-NEXT: srli a0, a0, 2
-; ZVBB-NEXT: vand.vi v13, v12, 1
-; ZVBB-NEXT: vmsne.vi v0, v13, 0
-; ZVBB-NEXT: vsrl.vi v16, v12, 1
-; ZVBB-NEXT: vadd.vx v16, v16, a0, v0.t
-; ZVBB-NEXT: vsetvli zero, zero, e64, m4, ta, ma
-; ZVBB-NEXT: vrgatherei16.vv v12, v8, v16
-; ZVBB-NEXT: vmv.v.v v8, v12
+; ZVBB-NEXT: slli a1, a0, 2
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vsseg5e16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v10, (a3)
+; ZVBB-NEXT: add a3, a3, a1
+; ZVBB-NEXT: vl1re16.v v11, (a3)
+; ZVBB-NEXT: vl1re16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v9, (a2)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl1re16.v v12, (a1)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 2
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
-;
-; ZIP-LABEL: vector_interleave_nxv4f64_nxv2f64:
-; ZIP: # %bb.0:
-; ZIP-NEXT: vsetvli a0, zero, e64, m2, ta, ma
-; ZIP-NEXT: vmv2r.v v12, v10
-; ZIP-NEXT: vmv2r.v v14, v8
-; ZIP-NEXT: ri.vzip2b.vv v10, v8, v12
-; ZIP-NEXT: ri.vzip2a.vv v8, v14, v12
-; ZIP-NEXT: ret
- %res = call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
- ret <vscale x 4 x double> %res
+ %res = call <vscale x 20 x half> @llvm.vector.interleave5.nxv20f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4)
+ ret <vscale x 20 x half> %res
}
-
-
-define <vscale x 64 x bfloat> @vector_interleave_nxv64bf16_nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) {
-; V-LABEL: vector_interleave_nxv64bf16_nxv32bf16:
-; V: # %bb.0:
-; V-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; V-NEXT: vmv8r.v v24, v8
-; V-NEXT: vwaddu.vv v8, v24, v16
-; V-NEXT: li a0, -1
-; V-NEXT: vwaddu.vv v0, v28, v20
-; V-NEXT: vwmaccu.vx v8, a0, v16
-; V-NEXT: vwmaccu.vx v0, a0, v20
-; V-NEXT: vmv8r.v v16, v0
-; V-NEXT: ret
+define <vscale x 40 x half> @vector_interleave_nxv40f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4) nounwind {
+; RV32-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 28
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv2r.v v20, v16
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv2r.v v18, v12
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv2r.v v16, v8
+; RV32-NEXT: vmv2r.v v22, v16
+; RV32-NEXT: vmv2r.v v24, v18
+; RV32-NEXT: vmv1r.v v26, v20
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v23, v10
+; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: add a5, a4, a2
+; RV32-NEXT: vmv1r.v v25, v14
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vmv1r.v v18, v11
+; RV32-NEXT: vsseg5e16.v v22, (a0)
+; RV32-NEXT: vmv1r.v v20, v15
+; RV32-NEXT: vsseg5e16.v v17, (a1)
+; RV32-NEXT: vl1re16.v v16, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v17, (a6)
+; RV32-NEXT: add a6, a3, a2
+; RV32-NEXT: vl1re16.v v10, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v11, (a6)
+; RV32-NEXT: vl1re16.v v8, (a0)
+; RV32-NEXT: vl1re16.v v9, (a3)
+; RV32-NEXT: vl1re16.v v14, (a4)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 10
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v15, (a5)
+; RV32-NEXT: vl1re16.v v12, (a6)
+; RV32-NEXT: vl1re16.v v13, (a1)
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a0, a2
+; RV32-NEXT: vs2r.v v16, (a2)
+; RV32-NEXT: vs8r.v v8, (a0)
+; RV32-NEXT: vl8re16.v v16, (a2)
+; RV32-NEXT: vl8re16.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 28
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv2r.v v20, v16
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv2r.v v18, v12
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 2
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv2r.v v16, v8
+; RV64-NEXT: vmv2r.v v22, v16
+; RV64-NEXT: vmv2r.v v24, v18
+; RV64-NEXT: vmv1r.v v26, v20
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v23, v10
+; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: add a5, a4, a2
+; RV64-NEXT: vmv1r.v v25, v14
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vmv1r.v v18, v11
+; RV64-NEXT: vsseg5e16.v v22, (a0)
+; RV64-NEXT: vmv1r.v v20, v15
+; RV64-NEXT: vsseg5e16.v v17, (a1)
+; RV64-NEXT: vl1re16.v v16, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v17, (a6)
+; RV64-NEXT: add a6, a3, a2
+; RV64-NEXT: vl1re16.v v10, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v11, (a6)
+; RV64-NEXT: vl1re16.v v8, (a0)
+; RV64-NEXT: vl1re16.v v9, (a3)
+; RV64-NEXT: vl1re16.v v14, (a4)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 10
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v15, (a5)
+; RV64-NEXT: vl1re16.v v12, (a6)
+; RV64-NEXT: vl1re16.v v13, (a1)
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: vs2r.v v16, (a2)
+; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vl8re16.v v16, (a2)
+; RV64-NEXT: vl8re16.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a1, 28
+; ZVBB-RV32-NEXT: mul a0, a0, a1
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v20, v16
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv2r.v v18, v12
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: slli a2, a1, 2
+; ZVBB-RV32-NEXT: add a1, a2, a1
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv2r.v v16, v8
+; ZVBB-RV32-NEXT: vmv2r.v v22, v16
+; ZVBB-RV32-NEXT: vmv2r.v v24, v18
+; ZVBB-RV32-NEXT: vmv1r.v v26, v20
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v23, v10
+; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: add a5, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v25, v14
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vmv1r.v v18, v11
+; ZVBB-RV32-NEXT: vsseg5e16.v v22, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v20, v15
+; ZVBB-RV32-NEXT: vsseg5e16.v v17, (a1)
+; ZVBB-RV32-NEXT: vl1re16.v v16, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v17, (a6)
+; ZVBB-RV32-NEXT: add a6, a3, a2
+; ZVBB-RV32-NEXT: vl1re16.v v10, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v11, (a6)
+; ZVBB-RV32-NEXT: vl1re16.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re16.v v9, (a3)
+; ZVBB-RV32-NEXT: vl1re16.v v14, (a4)
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a3, 10
+; ZVBB-RV32-NEXT: mul a0, a0, a3
+; ZVBB-RV32-NEXT: add a0, sp, a0
+; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v15, (a5)
+; ZVBB-RV32-NEXT: vl1re16.v v12, (a6)
+; ZVBB-RV32-NEXT: vl1re16.v v13, (a1)
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a0, a2
+; ZVBB-RV32-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8re16.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv64bf16_nxv32bf16:
-; ZVBB: # %bb.0:
-; ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVBB-NEXT: vwsll.vi v24, v16, 16
-; ZVBB-NEXT: vwsll.vi v0, v20, 16
-; ZVBB-NEXT: vwaddu.wv v24, v24, v8
-; ZVBB-NEXT: vwaddu.wv v0, v0, v12
-; ZVBB-NEXT: vmv8r.v v8, v24
-; ZVBB-NEXT: vmv8r.v v16, v0
-; ZVBB-NEXT: ret
+; ZVBB-RV64-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a1, 28
+; ZVBB-RV64-NEXT: mul a0, a0, a1
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v20, v16
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv2r.v v18, v12
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: slli a2, a1, 2
+; ZVBB-RV64-NEXT: add a1, a2, a1
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv2r.v v16, v8
+; ZVBB-RV64-NEXT: vmv2r.v v22, v16
+; ZVBB-RV64-NEXT: vmv2r.v v24, v18
+; ZVBB-RV64-NEXT: vmv1r.v v26, v20
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v23, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: add a5, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v25, v14
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vmv1r.v v18, v11
+; ZVBB-RV64-NEXT: vsseg5e16.v v22, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v20, v15
+; ZVBB-RV64-NEXT: vsseg5e16.v v17, (a1)
+; ZVBB-RV64-NEXT: vl1re16.v v16, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v17, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1re16.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v11, (a6)
+; ZVBB-RV64-NEXT: vl1re16.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re16.v v9, (a3)
+; ZVBB-RV64-NEXT: vl1re16.v v14, (a4)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 10
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v15, (a5)
+; ZVBB-RV64-NEXT: vl1re16.v v12, (a6)
+; ZVBB-RV64-NEXT: vl1re16.v v13, (a1)
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8re16.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv64bf16_nxv32bf16:
+; ZIP-LABEL: vector_interleave_nxv40f16_nxv8f16:
; ZIP: # %bb.0:
-; ZIP-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16
-; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20
-; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16
-; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20
-; ZIP-NEXT: vmv8r.v v8, v24
-; ZIP-NEXT: vmv8r.v v16, v0
+; ZIP-NEXT: addi sp, sp, -80
+; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT: addi s0, sp, 80
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: li a1, 28
+; ZIP-NEXT: mul a0, a0, a1
+; ZIP-NEXT: sub sp, sp, a0
+; ZIP-NEXT: andi sp, sp, -64
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: vmv2r.v v20, v16
+; ZIP-NEXT: addi a0, sp, 64
+; ZIP-NEXT: vmv2r.v v18, v12
+; ZIP-NEXT: csrr a1, vlenb
+; ZIP-NEXT: slli a2, a1, 2
+; ZIP-NEXT: add a1, a2, a1
+; ZIP-NEXT: add a1, sp, a1
+; ZIP-NEXT: addi a1, a1, 64
+; ZIP-NEXT: csrr a2, vlenb
+; ZIP-NEXT: vmv2r.v v16, v8
+; ZIP-NEXT: vmv2r.v v22, v16
+; ZIP-NEXT: vmv2r.v v24, v18
+; ZIP-NEXT: vmv1r.v v26, v20
+; ZIP-NEXT: add a3, a0, a2
+; ZIP-NEXT: vmv1r.v v23, v10
+; ZIP-NEXT: add a4, a1, a2
+; ZIP-NEXT: add a5, a4, a2
+; ZIP-NEXT: vmv1r.v v25, v14
+; ZIP-NEXT: add a6, a5, a2
+; ZIP-NEXT: vmv1r.v v18, v11
+; ZIP-NEXT: vsseg5e16.v v22, (a0)
+; ZIP-NEXT: vmv1r.v v20, v15
+; ZIP-NEXT: vsseg5e16.v v17, (a1)
+; ZIP-NEXT: vl1re16.v v16, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v17, (a6)
+; ZIP-NEXT: add a6, a3, a2
+; ZIP-NEXT: vl1re16.v v10, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v11, (a6)
+; ZIP-NEXT: vl1re16.v v8, (a0)
+; ZIP-NEXT: vl1re16.v v9, (a3)
+; ZIP-NEXT: vl1re16.v v14, (a4)
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: li a3, 10
+; ZIP-NEXT: mul a0, a0, a3
+; ZIP-NEXT: add a0, sp, a0
+; ZIP-NEXT: addi a0, a0, 64
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v15, (a5)
+; ZIP-NEXT: vl1re16.v v12, (a6)
+; ZIP-NEXT: vl1re16.v v13, (a1)
+; ZIP-NEXT: slli a2, a2, 3
+; ZIP-NEXT: add a2, a0, a2
+; ZIP-NEXT: vs2r.v v16, (a2)
+; ZIP-NEXT: vs8r.v v8, (a0)
+; ZIP-NEXT: vl8re16.v v16, (a2)
+; ZIP-NEXT: vl8re16.v v8, (a0)
+; ZIP-NEXT: addi sp, s0, -80
+; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT: addi sp, sp, 80
; ZIP-NEXT: ret
- %res = call <vscale x 64 x bfloat> @llvm.vector.interleave2.nxv64bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
- ret <vscale x 64 x bfloat> %res
+ %res = call <vscale x 40 x half> @llvm.vector.interleave5.nxv40f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4)
+ ret <vscale x 40 x half> %res
}
-define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b) {
-; V-LABEL: vector_interleave_nxv64f16_nxv32f16:
-; V: # %bb.0:
-; V-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; V-NEXT: vmv8r.v v24, v8
-; V-NEXT: vwaddu.vv v8, v24, v16
-; V-NEXT: li a0, -1
-; V-NEXT: vwaddu.vv v0, v28, v20
-; V-NEXT: vwmaccu.vx v8, a0, v16
-; V-NEXT: vwmaccu.vx v0, a0, v20
-; V-NEXT: vmv8r.v v16, v0
-; V-NEXT: ret
+define <vscale x 10 x bfloat> @vector_interleave_nxv10bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4) nounwind {
+; CHECK-LABEL: vector_interleave_nxv10bf16_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 1
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsseg5e16.v v8, (a0)
+; CHECK-NEXT: add a5, a4, a2
+; CHECK-NEXT: vle16.v v8, (a5)
+; CHECK-NEXT: vle16.v v9, (a4)
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a4, a1, a1
+; CHECK-NEXT: vle16.v v10, (a3)
+; CHECK-NEXT: vsetvli zero, a4, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a1
+; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vsetvli zero, a4, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a1
+; CHECK-NEXT: add a2, a5, a2
+; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v10, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv64f16_nxv32f16:
+; ZVBB-LABEL: vector_interleave_nxv10bf16_nxv2bf16:
; ZVBB: # %bb.0:
-; ZVBB-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZVBB-NEXT: vwsll.vi v24, v16, 16
-; ZVBB-NEXT: vwsll.vi v0, v20, 16
-; ZVBB-NEXT: vwaddu.wv v24, v24, v8
-; ZVBB-NEXT: vwaddu.wv v0, v0, v12
-; ZVBB-NEXT: vmv8r.v v8, v24
-; ZVBB-NEXT: vmv8r.v v16, v0
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a2, a1, 1
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: add a4, a3, a2
+; ZVBB-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vsseg5e16.v v8, (a0)
+; ZVBB-NEXT: add a5, a4, a2
+; ZVBB-NEXT: vle16.v v8, (a5)
+; ZVBB-NEXT: vle16.v v9, (a4)
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a4, a1, a1
+; ZVBB-NEXT: vle16.v v10, (a3)
+; ZVBB-NEXT: vsetvli zero, a4, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v9, v8, a1
+; ZVBB-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: vsetvli zero, a4, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v10, a1
+; ZVBB-NEXT: add a2, a5, a2
+; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v10, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
-;
-; ZIP-LABEL: vector_interleave_nxv64f16_nxv32f16:
-; ZIP: # %bb.0:
-; ZIP-NEXT: vsetvli a0, zero, e16, m4, ta, ma
-; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16
-; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20
-; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16
-; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20
-; ZIP-NEXT: vmv8r.v v8, v24
-; ZIP-NEXT: vmv8r.v v16, v0
-; ZIP-NEXT: ret
- %res = call <vscale x 64 x half> @llvm.vector.interleave2.nxv64f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
- ret <vscale x 64 x half> %res
+ %res = call <vscale x 10 x bfloat> @llvm.vector.interleave5.nxv10bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4)
+ ret <vscale x 10 x bfloat> %res
}
-define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b) {
-; V-LABEL: vector_interleave_nxv32f32_nxv16f32:
-; V: # %bb.0:
-; V-NEXT: vsetvli a0, zero, e32, m4, ta, ma
-; V-NEXT: vmv8r.v v24, v8
-; V-NEXT: vwaddu.vv v8, v24, v16
-; V-NEXT: li a0, -1
-; V-NEXT: vwaddu.vv v0, v28, v20
-; V-NEXT: vwmaccu.vx v8, a0, v16
-; V-NEXT: vwmaccu.vx v0, a0, v20
-; V-NEXT: vmv8r.v v16, v0
-; V-NEXT: ret
+define <vscale x 20 x bfloat> @vector_interleave_nxv20bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4) nounwind {
+; CHECK-LABEL: vector_interleave_nxv20bf16_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 2
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsseg5e16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v10, (a3)
+; CHECK-NEXT: add a3, a3, a1
+; CHECK-NEXT: vl1re16.v v11, (a3)
+; CHECK-NEXT: vl1re16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v9, (a2)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl1re16.v v12, (a1)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a1, a0, 2
+; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv32f32_nxv16f32:
+; ZVBB-LABEL: vector_interleave_nxv20bf16_nxv4bf16:
; ZVBB: # %bb.0:
-; ZVBB-NEXT: li a0, 32
-; ZVBB-NEXT: vsetvli a1, zero, e32, m4, ta, ma
-; ZVBB-NEXT: vwsll.vx v24, v16, a0
-; ZVBB-NEXT: vwsll.vx v0, v20, a0
-; ZVBB-NEXT: vwaddu.wv v24, v24, v8
-; ZVBB-NEXT: vwaddu.wv v0, v0, v12
-; ZVBB-NEXT: vmv8r.v v8, v24
-; ZVBB-NEXT: vmv8r.v v16, v0
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 2
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vsseg5e16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v10, (a3)
+; ZVBB-NEXT: add a3, a3, a1
+; ZVBB-NEXT: vl1re16.v v11, (a3)
+; ZVBB-NEXT: vl1re16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v9, (a2)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl1re16.v v12, (a1)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a1, a0, 2
+; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
-;
-; ZIP-LABEL: vector_interleave_nxv32f32_nxv16f32:
-; ZIP: # %bb.0:
-; ZIP-NEXT: vsetvli a0, zero, e32, m4, ta, ma
-; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16
-; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20
-; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16
-; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20
-; ZIP-NEXT: vmv8r.v v8, v24
-; ZIP-NEXT: vmv8r.v v16, v0
-; ZIP-NEXT: ret
- %res = call <vscale x 32 x float> @llvm.vector.interleave2.nxv32f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b)
- ret <vscale x 32 x float> %res
+ %res = call <vscale x 20 x bfloat> @llvm.vector.interleave5.nxv20bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4)
+ ret <vscale x 20 x bfloat> %res
}
-define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b) {
-; V-LABEL: vector_interleave_nxv16f64_nxv8f64:
-; V: # %bb.0:
-; V-NEXT: csrr a0, vlenb
-; V-NEXT: vsetvli a1, zero, e16, m2, ta, mu
-; V-NEXT: vid.v v6
-; V-NEXT: vmv8r.v v24, v8
-; V-NEXT: srli a0, a0, 1
-; V-NEXT: vmv4r.v v28, v16
-; V-NEXT: vmv4r.v v16, v12
-; V-NEXT: vand.vi v8, v6, 1
-; V-NEXT: vmsne.vi v0, v8, 0
-; V-NEXT: vsrl.vi v6, v6, 1
-; V-NEXT: vadd.vx v6, v6, a0, v0.t
-; V-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; V-NEXT: vrgatherei16.vv v8, v24, v6
-; V-NEXT: vrgatherei16.vv v24, v16, v6
-; V-NEXT: vmv.v.v v16, v24
-; V-NEXT: ret
+define <vscale x 40 x bfloat> @vector_interleave_nxv40bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4) nounwind {
+; RV32-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 28
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv2r.v v20, v16
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv2r.v v18, v12
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv2r.v v16, v8
+; RV32-NEXT: vmv2r.v v22, v16
+; RV32-NEXT: vmv2r.v v24, v18
+; RV32-NEXT: vmv1r.v v26, v20
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v23, v10
+; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: add a5, a4, a2
+; RV32-NEXT: vmv1r.v v25, v14
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vmv1r.v v18, v11
+; RV32-NEXT: vsseg5e16.v v22, (a0)
+; RV32-NEXT: vmv1r.v v20, v15
+; RV32-NEXT: vsseg5e16.v v17, (a1)
+; RV32-NEXT: vl1re16.v v16, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v17, (a6)
+; RV32-NEXT: add a6, a3, a2
+; RV32-NEXT: vl1re16.v v10, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v11, (a6)
+; RV32-NEXT: vl1re16.v v8, (a0)
+; RV32-NEXT: vl1re16.v v9, (a3)
+; RV32-NEXT: vl1re16.v v14, (a4)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 10
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v15, (a5)
+; RV32-NEXT: vl1re16.v v12, (a6)
+; RV32-NEXT: vl1re16.v v13, (a1)
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a0, a2
+; RV32-NEXT: vs2r.v v16, (a2)
+; RV32-NEXT: vs8r.v v8, (a0)
+; RV32-NEXT: vl8re16.v v16, (a2)
+; RV32-NEXT: vl8re16.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 28
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv2r.v v20, v16
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv2r.v v18, v12
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 2
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv2r.v v16, v8
+; RV64-NEXT: vmv2r.v v22, v16
+; RV64-NEXT: vmv2r.v v24, v18
+; RV64-NEXT: vmv1r.v v26, v20
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v23, v10
+; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: add a5, a4, a2
+; RV64-NEXT: vmv1r.v v25, v14
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vmv1r.v v18, v11
+; RV64-NEXT: vsseg5e16.v v22, (a0)
+; RV64-NEXT: vmv1r.v v20, v15
+; RV64-NEXT: vsseg5e16.v v17, (a1)
+; RV64-NEXT: vl1re16.v v16, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v17, (a6)
+; RV64-NEXT: add a6, a3, a2
+; RV64-NEXT: vl1re16.v v10, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v11, (a6)
+; RV64-NEXT: vl1re16.v v8, (a0)
+; RV64-NEXT: vl1re16.v v9, (a3)
+; RV64-NEXT: vl1re16.v v14, (a4)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 10
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v15, (a5)
+; RV64-NEXT: vl1re16.v v12, (a6)
+; RV64-NEXT: vl1re16.v v13, (a1)
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: vs2r.v v16, (a2)
+; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vl8re16.v v16, (a2)
+; RV64-NEXT: vl8re16.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv16f64_nxv8f64:
-; ZVBB: # %bb.0:
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: vsetvli a1, zero, e16, m2, ta, mu
-; ZVBB-NEXT: vid.v v6
-; ZVBB-NEXT: vmv8r.v v24, v8
-; ZVBB-NEXT: srli a0, a0, 1
-; ZVBB-NEXT: vmv4r.v v28, v16
-; ZVBB-NEXT: vmv4r.v v16, v12
-; ZVBB-NEXT: vand.vi v8, v6, 1
-; ZVBB-NEXT: vmsne.vi v0, v8, 0
-; ZVBB-NEXT: vsrl.vi v6, v6, 1
-; ZVBB-NEXT: vadd.vx v6, v6, a0, v0.t
-; ZVBB-NEXT: vsetvli zero, zero, e64, m8, ta, ma
-; ZVBB-NEXT: vrgatherei16.vv v8, v24, v6
-; ZVBB-NEXT: vrgatherei16.vv v24, v16, v6
-; ZVBB-NEXT: vmv.v.v v16, v24
-; ZVBB-NEXT: ret
+; ZVBB-RV32-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a1, 28
+; ZVBB-RV32-NEXT: mul a0, a0, a1
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v20, v16
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv2r.v v18, v12
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: slli a2, a1, 2
+; ZVBB-RV32-NEXT: add a1, a2, a1
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv2r.v v16, v8
+; ZVBB-RV32-NEXT: vmv2r.v v22, v16
+; ZVBB-RV32-NEXT: vmv2r.v v24, v18
+; ZVBB-RV32-NEXT: vmv1r.v v26, v20
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v23, v10
+; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: add a5, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v25, v14
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vmv1r.v v18, v11
+; ZVBB-RV32-NEXT: vsseg5e16.v v22, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v20, v15
+; ZVBB-RV32-NEXT: vsseg5e16.v v17, (a1)
+; ZVBB-RV32-NEXT: vl1re16.v v16, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v17, (a6)
+; ZVBB-RV32-NEXT: add a6, a3, a2
+; ZVBB-RV32-NEXT: vl1re16.v v10, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v11, (a6)
+; ZVBB-RV32-NEXT: vl1re16.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re16.v v9, (a3)
+; ZVBB-RV32-NEXT: vl1re16.v v14, (a4)
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a3, 10
+; ZVBB-RV32-NEXT: mul a0, a0, a3
+; ZVBB-RV32-NEXT: add a0, sp, a0
+; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v15, (a5)
+; ZVBB-RV32-NEXT: vl1re16.v v12, (a6)
+; ZVBB-RV32-NEXT: vl1re16.v v13, (a1)
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a0, a2
+; ZVBB-RV32-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8re16.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv16f64_nxv8f64:
+; ZVBB-RV64-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a1, 28
+; ZVBB-RV64-NEXT: mul a0, a0, a1
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v20, v16
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv2r.v v18, v12
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: slli a2, a1, 2
+; ZVBB-RV64-NEXT: add a1, a2, a1
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv2r.v v16, v8
+; ZVBB-RV64-NEXT: vmv2r.v v22, v16
+; ZVBB-RV64-NEXT: vmv2r.v v24, v18
+; ZVBB-RV64-NEXT: vmv1r.v v26, v20
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v23, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: add a5, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v25, v14
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vmv1r.v v18, v11
+; ZVBB-RV64-NEXT: vsseg5e16.v v22, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v20, v15
+; ZVBB-RV64-NEXT: vsseg5e16.v v17, (a1)
+; ZVBB-RV64-NEXT: vl1re16.v v16, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v17, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1re16.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v11, (a6)
+; ZVBB-RV64-NEXT: vl1re16.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re16.v v9, (a3)
+; ZVBB-RV64-NEXT: vl1re16.v v14, (a4)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 10
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v15, (a5)
+; ZVBB-RV64-NEXT: vl1re16.v v12, (a6)
+; ZVBB-RV64-NEXT: vl1re16.v v13, (a1)
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8re16.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
; ZIP: # %bb.0:
-; ZIP-NEXT: vsetvli a0, zero, e64, m4, ta, ma
-; ZIP-NEXT: ri.vzip2b.vv v28, v8, v16
-; ZIP-NEXT: ri.vzip2b.vv v4, v12, v20
-; ZIP-NEXT: ri.vzip2a.vv v24, v8, v16
-; ZIP-NEXT: ri.vzip2a.vv v0, v12, v20
-; ZIP-NEXT: vmv8r.v v8, v24
-; ZIP-NEXT: vmv8r.v v16, v0
+; ZIP-NEXT: addi sp, sp, -80
+; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT: addi s0, sp, 80
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: li a1, 28
+; ZIP-NEXT: mul a0, a0, a1
+; ZIP-NEXT: sub sp, sp, a0
+; ZIP-NEXT: andi sp, sp, -64
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: vmv2r.v v20, v16
+; ZIP-NEXT: addi a0, sp, 64
+; ZIP-NEXT: vmv2r.v v18, v12
+; ZIP-NEXT: csrr a1, vlenb
+; ZIP-NEXT: slli a2, a1, 2
+; ZIP-NEXT: add a1, a2, a1
+; ZIP-NEXT: add a1, sp, a1
+; ZIP-NEXT: addi a1, a1, 64
+; ZIP-NEXT: csrr a2, vlenb
+; ZIP-NEXT: vmv2r.v v16, v8
+; ZIP-NEXT: vmv2r.v v22, v16
+; ZIP-NEXT: vmv2r.v v24, v18
+; ZIP-NEXT: vmv1r.v v26, v20
+; ZIP-NEXT: add a3, a0, a2
+; ZIP-NEXT: vmv1r.v v23, v10
+; ZIP-NEXT: add a4, a1, a2
+; ZIP-NEXT: add a5, a4, a2
+; ZIP-NEXT: vmv1r.v v25, v14
+; ZIP-NEXT: add a6, a5, a2
+; ZIP-NEXT: vmv1r.v v18, v11
+; ZIP-NEXT: vsseg5e16.v v22, (a0)
+; ZIP-NEXT: vmv1r.v v20, v15
+; ZIP-NEXT: vsseg5e16.v v17, (a1)
+; ZIP-NEXT: vl1re16.v v16, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v17, (a6)
+; ZIP-NEXT: add a6, a3, a2
+; ZIP-NEXT: vl1re16.v v10, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v11, (a6)
+; ZIP-NEXT: vl1re16.v v8, (a0)
+; ZIP-NEXT: vl1re16.v v9, (a3)
+; ZIP-NEXT: vl1re16.v v14, (a4)
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: li a3, 10
+; ZIP-NEXT: mul a0, a0, a3
+; ZIP-NEXT: add a0, sp, a0
+; ZIP-NEXT: addi a0, a0, 64
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v15, (a5)
+; ZIP-NEXT: vl1re16.v v12, (a6)
+; ZIP-NEXT: vl1re16.v v13, (a1)
+; ZIP-NEXT: slli a2, a2, 3
+; ZIP-NEXT: add a2, a0, a2
+; ZIP-NEXT: vs2r.v v16, (a2)
+; ZIP-NEXT: vs8r.v v8, (a0)
+; ZIP-NEXT: vl8re16.v v16, (a2)
+; ZIP-NEXT: vl8re16.v v8, (a0)
+; ZIP-NEXT: addi sp, s0, -80
+; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT: addi sp, sp, 80
; ZIP-NEXT: ret
- %res = call <vscale x 16 x double> @llvm.vector.interleave2.nxv16f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b)
- ret <vscale x 16 x double> %res
+ %res = call <vscale x 40 x bfloat> @llvm.vector.interleave5.nxv40bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4)
+ ret <vscale x 40 x bfloat> %res
}
-define <vscale x 6 x half> @vector_interleave_nxv6f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv6f16_nxv2f16:
+define <vscale x 5 x float> @vector_interleave_nxv5f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4) nounwind {
+; CHECK-LABEL: vector_interleave_nxv5f32_nxv1f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a2, a1, 1
-; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vsseg3e16.v v8, (a0)
; CHECK-NEXT: add a3, a0, a2
-; CHECK-NEXT: vle16.v v9, (a3)
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: srli a1, a1, 2
-; CHECK-NEXT: add a0, a1, a1
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v9, a1
-; CHECK-NEXT: add a2, a3, a2
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vle16.v v9, (a2)
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsseg5e32.v v8, (a0)
+; CHECK-NEXT: add a5, a4, a2
+; CHECK-NEXT: vle32.v v8, (a5)
+; CHECK-NEXT: vle32.v v9, (a4)
+; CHECK-NEXT: srli a1, a1, 3
+; CHECK-NEXT: add a4, a1, a1
+; CHECK-NEXT: vle32.v v10, (a3)
+; CHECK-NEXT: vsetvli zero, a4, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a1
+; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vsetvli zero, a4, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a1
+; CHECK-NEXT: add a2, a5, a2
+; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vle32.v v10, (a2)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
+; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: add a0, a1, a0
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv6f16_nxv2f16:
+; ZVBB-LABEL: vector_interleave_nxv5f32_nxv1f32:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: add a0, a1, a0
; ZVBB-NEXT: sub sp, sp, a0
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: srli a2, a1, 1
-; ZVBB-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
-; ZVBB-NEXT: vsseg3e16.v v8, (a0)
; ZVBB-NEXT: add a3, a0, a2
-; ZVBB-NEXT: vle16.v v9, (a3)
-; ZVBB-NEXT: vle16.v v8, (a0)
-; ZVBB-NEXT: srli a1, a1, 2
-; ZVBB-NEXT: add a0, a1, a1
-; ZVBB-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; ZVBB-NEXT: vslideup.vx v8, v9, a1
-; ZVBB-NEXT: add a2, a3, a2
-; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; ZVBB-NEXT: vle16.v v9, (a2)
+; ZVBB-NEXT: add a4, a3, a2
+; ZVBB-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vsseg5e32.v v8, (a0)
+; ZVBB-NEXT: add a5, a4, a2
+; ZVBB-NEXT: vle32.v v8, (a5)
+; ZVBB-NEXT: vle32.v v9, (a4)
+; ZVBB-NEXT: srli a1, a1, 3
+; ZVBB-NEXT: add a4, a1, a1
+; ZVBB-NEXT: vle32.v v10, (a3)
+; ZVBB-NEXT: vsetvli zero, a4, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v9, v8, a1
+; ZVBB-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: vsetvli zero, a4, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v10, a1
+; ZVBB-NEXT: add a2, a5, a2
+; ZVBB-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vle32.v v10, (a2)
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a0, a0, 1
+; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: add a0, a1, a0
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 6 x half> @llvm.vector.interleave3.nxv6f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2)
- ret <vscale x 6 x half> %res
+ %res = call <vscale x 5 x float> @llvm.vector.interleave5.nxv5f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4)
+ ret <vscale x 5 x float> %res
}
-define <vscale x 12 x half> @vector_interleave_nxv12f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv12f16_nxv4f16:
+define <vscale x 10 x float> @vector_interleave_nxv10f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4) nounwind {
+; CHECK-LABEL: vector_interleave_nxv10f32_nxv2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: slli a1, a0, 2
; CHECK-NEXT: add a0, a1, a0
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT: vsseg3e16.v v8, (a0)
-; CHECK-NEXT: vl1re16.v v8, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl1re16.v v9, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl1re16.v v10, (a0)
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg5e32.v v8, (a0)
+; CHECK-NEXT: vl1re32.v v10, (a3)
+; CHECK-NEXT: add a3, a3, a1
+; CHECK-NEXT: vl1re32.v v11, (a3)
+; CHECK-NEXT: vl1re32.v v8, (a0)
+; CHECK-NEXT: vl1re32.v v9, (a2)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl1re32.v v12, (a1)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: slli a1, a0, 2
; CHECK-NEXT: add a0, a1, a0
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv12f16_nxv4f16:
+; ZVBB-LABEL: vector_interleave_nxv10f32_nxv2f32:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: slli a1, a0, 2
; ZVBB-NEXT: add a0, a1, a0
; ZVBB-NEXT: sub sp, sp, a0
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
-; ZVBB-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; ZVBB-NEXT: vsseg3e16.v v8, (a0)
-; ZVBB-NEXT: vl1re16.v v8, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl1re16.v v9, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl1re16.v v10, (a0)
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e32, m1, ta, ma
+; ZVBB-NEXT: vsseg5e32.v v8, (a0)
+; ZVBB-NEXT: vl1re32.v v10, (a3)
+; ZVBB-NEXT: add a3, a3, a1
+; ZVBB-NEXT: vl1re32.v v11, (a3)
+; ZVBB-NEXT: vl1re32.v v8, (a0)
+; ZVBB-NEXT: vl1re32.v v9, (a2)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl1re32.v v12, (a1)
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: slli a1, a0, 2
; ZVBB-NEXT: add a0, a1, a0
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 12 x half> @llvm.vector.interleave3.nxv12f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2)
- ret <vscale x 12 x half> %res
+ %res = call <vscale x 10 x float> @llvm.vector.interleave5.nxv10f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4)
+ ret <vscale x 10 x float> %res
}
-define <vscale x 24 x half> @vector_interleave_nxv24f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv24f16_nxv8f16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 6
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT: vsseg3e16.v v8, (a0)
-; CHECK-NEXT: vl2re16.v v8, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl2re16.v v10, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl2re16.v v12, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 6
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+define <vscale x 20 x float> @vector_interleave_nxv20f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4) nounwind {
+; RV32-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 28
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT: vmv2r.v v20, v16
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv2r.v v18, v12
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv2r.v v16, v8
+; RV32-NEXT: vmv2r.v v22, v16
+; RV32-NEXT: vmv2r.v v24, v18
+; RV32-NEXT: vmv1r.v v26, v20
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v23, v10
+; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: add a5, a4, a2
+; RV32-NEXT: vmv1r.v v25, v14
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vmv1r.v v18, v11
+; RV32-NEXT: vsseg5e32.v v22, (a0)
+; RV32-NEXT: vmv1r.v v20, v15
+; RV32-NEXT: vsseg5e32.v v17, (a1)
+; RV32-NEXT: vl1re32.v v16, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v17, (a6)
+; RV32-NEXT: add a6, a3, a2
+; RV32-NEXT: vl1re32.v v10, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v11, (a6)
+; RV32-NEXT: vl1re32.v v8, (a0)
+; RV32-NEXT: vl1re32.v v9, (a3)
+; RV32-NEXT: vl1re32.v v14, (a4)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 10
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re32.v v15, (a5)
+; RV32-NEXT: vl1re32.v v12, (a6)
+; RV32-NEXT: vl1re32.v v13, (a1)
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a0, a2
+; RV32-NEXT: vs2r.v v16, (a2)
+; RV32-NEXT: vs8r.v v8, (a0)
+; RV32-NEXT: vl8re32.v v16, (a2)
+; RV32-NEXT: vl8re32.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 28
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; RV64-NEXT: vmv2r.v v20, v16
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv2r.v v18, v12
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 2
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv2r.v v16, v8
+; RV64-NEXT: vmv2r.v v22, v16
+; RV64-NEXT: vmv2r.v v24, v18
+; RV64-NEXT: vmv1r.v v26, v20
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v23, v10
+; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: add a5, a4, a2
+; RV64-NEXT: vmv1r.v v25, v14
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vmv1r.v v18, v11
+; RV64-NEXT: vsseg5e32.v v22, (a0)
+; RV64-NEXT: vmv1r.v v20, v15
+; RV64-NEXT: vsseg5e32.v v17, (a1)
+; RV64-NEXT: vl1re32.v v16, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v17, (a6)
+; RV64-NEXT: add a6, a3, a2
+; RV64-NEXT: vl1re32.v v10, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v11, (a6)
+; RV64-NEXT: vl1re32.v v8, (a0)
+; RV64-NEXT: vl1re32.v v9, (a3)
+; RV64-NEXT: vl1re32.v v14, (a4)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 10
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re32.v v15, (a5)
+; RV64-NEXT: vl1re32.v v12, (a6)
+; RV64-NEXT: vl1re32.v v13, (a1)
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: vs2r.v v16, (a2)
+; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vl8re32.v v16, (a2)
+; RV64-NEXT: vl8re32.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv24f16_nxv8f16:
-; ZVBB: # %bb.0:
-; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: li a1, 6
-; ZVBB-NEXT: mul a0, a0, a1
-; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: addi a0, sp, 16
-; ZVBB-NEXT: csrr a1, vlenb
-; ZVBB-NEXT: slli a1, a1, 1
-; ZVBB-NEXT: vsetvli a2, zero, e16, m2, ta, ma
-; ZVBB-NEXT: vsseg3e16.v v8, (a0)
-; ZVBB-NEXT: vl2re16.v v8, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl2re16.v v10, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl2re16.v v12, (a0)
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: li a1, 6
-; ZVBB-NEXT: mul a0, a0, a1
-; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: ret
- %res = call <vscale x 24 x half> @llvm.vector.interleave3.nxv24f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2)
- ret <vscale x 24 x half> %res
-}
-
-define <vscale x 6 x bfloat> @vector_interleave_nxv6bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv6bf16_nxv2bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: srli a2, a1, 1
-; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vsseg3e16.v v8, (a0)
-; CHECK-NEXT: add a3, a0, a2
-; CHECK-NEXT: vle16.v v9, (a3)
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: srli a1, a1, 2
-; CHECK-NEXT: add a0, a1, a1
-; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v9, a1
-; CHECK-NEXT: add a2, a3, a2
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vle16.v v9, (a2)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+; ZVBB-RV32-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a1, 28
+; ZVBB-RV32-NEXT: mul a0, a0, a1
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v20, v16
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv2r.v v18, v12
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: slli a2, a1, 2
+; ZVBB-RV32-NEXT: add a1, a2, a1
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv2r.v v16, v8
+; ZVBB-RV32-NEXT: vmv2r.v v22, v16
+; ZVBB-RV32-NEXT: vmv2r.v v24, v18
+; ZVBB-RV32-NEXT: vmv1r.v v26, v20
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v23, v10
+; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: add a5, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v25, v14
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vmv1r.v v18, v11
+; ZVBB-RV32-NEXT: vsseg5e32.v v22, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v20, v15
+; ZVBB-RV32-NEXT: vsseg5e32.v v17, (a1)
+; ZVBB-RV32-NEXT: vl1re32.v v16, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v17, (a6)
+; ZVBB-RV32-NEXT: add a6, a3, a2
+; ZVBB-RV32-NEXT: vl1re32.v v10, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v11, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re32.v v9, (a3)
+; ZVBB-RV32-NEXT: vl1re32.v v14, (a4)
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a3, 10
+; ZVBB-RV32-NEXT: mul a0, a0, a3
+; ZVBB-RV32-NEXT: add a0, sp, a0
+; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v15, (a5)
+; ZVBB-RV32-NEXT: vl1re32.v v12, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v13, (a1)
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a0, a2
+; ZVBB-RV32-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8re32.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re32.v v8, (a0)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv6bf16_nxv2bf16:
-; ZVBB: # %bb.0:
-; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a0, a0, 1
-; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: addi a0, sp, 16
-; ZVBB-NEXT: csrr a1, vlenb
-; ZVBB-NEXT: srli a2, a1, 1
-; ZVBB-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
-; ZVBB-NEXT: vsseg3e16.v v8, (a0)
-; ZVBB-NEXT: add a3, a0, a2
-; ZVBB-NEXT: vle16.v v9, (a3)
-; ZVBB-NEXT: vle16.v v8, (a0)
-; ZVBB-NEXT: srli a1, a1, 2
-; ZVBB-NEXT: add a0, a1, a1
-; ZVBB-NEXT: vsetvli zero, a0, e16, m1, ta, ma
-; ZVBB-NEXT: vslideup.vx v8, v9, a1
-; ZVBB-NEXT: add a2, a3, a2
-; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; ZVBB-NEXT: vle16.v v9, (a2)
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a0, a0, 1
-; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: ret
- %res = call <vscale x 6 x bfloat> @llvm.vector.interleave3.nxv6bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2)
- ret <vscale x 6 x bfloat> %res
+; ZVBB-RV64-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a1, 28
+; ZVBB-RV64-NEXT: mul a0, a0, a1
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v20, v16
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv2r.v v18, v12
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: slli a2, a1, 2
+; ZVBB-RV64-NEXT: add a1, a2, a1
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv2r.v v16, v8
+; ZVBB-RV64-NEXT: vmv2r.v v22, v16
+; ZVBB-RV64-NEXT: vmv2r.v v24, v18
+; ZVBB-RV64-NEXT: vmv1r.v v26, v20
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v23, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: add a5, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v25, v14
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vmv1r.v v18, v11
+; ZVBB-RV64-NEXT: vsseg5e32.v v22, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v20, v15
+; ZVBB-RV64-NEXT: vsseg5e32.v v17, (a1)
+; ZVBB-RV64-NEXT: vl1re32.v v16, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v17, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1re32.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v11, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re32.v v9, (a3)
+; ZVBB-RV64-NEXT: vl1re32.v v14, (a4)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 10
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v15, (a5)
+; ZVBB-RV64-NEXT: vl1re32.v v12, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v13, (a1)
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8re32.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re32.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; ZIP: # %bb.0:
+; ZIP-NEXT: addi sp, sp, -80
+; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT: addi s0, sp, 80
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: li a1, 28
+; ZIP-NEXT: mul a0, a0, a1
+; ZIP-NEXT: sub sp, sp, a0
+; ZIP-NEXT: andi sp, sp, -64
+; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZIP-NEXT: vmv2r.v v20, v16
+; ZIP-NEXT: addi a0, sp, 64
+; ZIP-NEXT: vmv2r.v v18, v12
+; ZIP-NEXT: csrr a1, vlenb
+; ZIP-NEXT: slli a2, a1, 2
+; ZIP-NEXT: add a1, a2, a1
+; ZIP-NEXT: add a1, sp, a1
+; ZIP-NEXT: addi a1, a1, 64
+; ZIP-NEXT: csrr a2, vlenb
+; ZIP-NEXT: vmv2r.v v16, v8
+; ZIP-NEXT: vmv2r.v v22, v16
+; ZIP-NEXT: vmv2r.v v24, v18
+; ZIP-NEXT: vmv1r.v v26, v20
+; ZIP-NEXT: add a3, a0, a2
+; ZIP-NEXT: vmv1r.v v23, v10
+; ZIP-NEXT: add a4, a1, a2
+; ZIP-NEXT: add a5, a4, a2
+; ZIP-NEXT: vmv1r.v v25, v14
+; ZIP-NEXT: add a6, a5, a2
+; ZIP-NEXT: vmv1r.v v18, v11
+; ZIP-NEXT: vsseg5e32.v v22, (a0)
+; ZIP-NEXT: vmv1r.v v20, v15
+; ZIP-NEXT: vsseg5e32.v v17, (a1)
+; ZIP-NEXT: vl1re32.v v16, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re32.v v17, (a6)
+; ZIP-NEXT: add a6, a3, a2
+; ZIP-NEXT: vl1re32.v v10, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re32.v v11, (a6)
+; ZIP-NEXT: vl1re32.v v8, (a0)
+; ZIP-NEXT: vl1re32.v v9, (a3)
+; ZIP-NEXT: vl1re32.v v14, (a4)
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: li a3, 10
+; ZIP-NEXT: mul a0, a0, a3
+; ZIP-NEXT: add a0, sp, a0
+; ZIP-NEXT: addi a0, a0, 64
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re32.v v15, (a5)
+; ZIP-NEXT: vl1re32.v v12, (a6)
+; ZIP-NEXT: vl1re32.v v13, (a1)
+; ZIP-NEXT: slli a2, a2, 3
+; ZIP-NEXT: add a2, a0, a2
+; ZIP-NEXT: vs2r.v v16, (a2)
+; ZIP-NEXT: vs8r.v v8, (a0)
+; ZIP-NEXT: vl8re32.v v16, (a2)
+; ZIP-NEXT: vl8re32.v v8, (a0)
+; ZIP-NEXT: addi sp, s0, -80
+; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT: addi sp, sp, 80
+; ZIP-NEXT: ret
+ %res = call <vscale x 20 x float> @llvm.vector.interleave5.nxv20f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4)
+ ret <vscale x 20 x float> %res
}
-define <vscale x 12 x bfloat> @vector_interleave_nxv12bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv12bf16_nxv4bf16:
+define <vscale x 5 x double> @vector_interleave_nxv5f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4) nounwind {
+; CHECK-LABEL: vector_interleave_nxv5f64_nxv1f64:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: slli a1, a0, 2
; CHECK-NEXT: add a0, a1, a0
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT: vsseg3e16.v v8, (a0)
-; CHECK-NEXT: vl1re16.v v8, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl1re16.v v9, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl1re16.v v10, (a0)
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e64, m1, ta, ma
+; CHECK-NEXT: vsseg5e64.v v8, (a0)
+; CHECK-NEXT: vl1re64.v v10, (a3)
+; CHECK-NEXT: add a3, a3, a1
+; CHECK-NEXT: vl1re64.v v11, (a3)
+; CHECK-NEXT: vl1re64.v v8, (a0)
+; CHECK-NEXT: vl1re64.v v9, (a2)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl1re64.v v12, (a1)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 1
+; CHECK-NEXT: slli a1, a0, 2
; CHECK-NEXT: add a0, a1, a0
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv12bf16_nxv4bf16:
+; ZVBB-LABEL: vector_interleave_nxv5f64_nxv1f64:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: slli a1, a0, 2
; ZVBB-NEXT: add a0, a1, a0
; ZVBB-NEXT: sub sp, sp, a0
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
-; ZVBB-NEXT: vsetvli a2, zero, e16, m1, ta, ma
-; ZVBB-NEXT: vsseg3e16.v v8, (a0)
-; ZVBB-NEXT: vl1re16.v v8, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl1re16.v v9, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl1re16.v v10, (a0)
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e64, m1, ta, ma
+; ZVBB-NEXT: vsseg5e64.v v8, (a0)
+; ZVBB-NEXT: vl1re64.v v10, (a3)
+; ZVBB-NEXT: add a3, a3, a1
+; ZVBB-NEXT: vl1re64.v v11, (a3)
+; ZVBB-NEXT: vl1re64.v v8, (a0)
+; ZVBB-NEXT: vl1re64.v v9, (a2)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl1re64.v v12, (a1)
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 1
+; ZVBB-NEXT: slli a1, a0, 2
; ZVBB-NEXT: add a0, a1, a0
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 12 x bfloat> @llvm.vector.interleave3.nxv12bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2)
- ret <vscale x 12 x bfloat> %res
+ %res = call <vscale x 5 x double> @llvm.vector.interleave5.nxv5f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4)
+ ret <vscale x 5 x double> %res
}
-define <vscale x 24 x bfloat> @vector_interleave_nxv24bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv24bf16_nxv8bf16:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 6
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT: vsseg3e16.v v8, (a0)
-; CHECK-NEXT: vl2re16.v v8, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl2re16.v v10, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl2re16.v v12, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 6
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+define <vscale x 10 x double> @vector_interleave_nxv10f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4) nounwind {
+; RV32-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 28
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT: vmv2r.v v20, v16
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv2r.v v18, v12
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: slli a2, a1, 2
+; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv2r.v v16, v8
+; RV32-NEXT: vmv2r.v v22, v16
+; RV32-NEXT: vmv2r.v v24, v18
+; RV32-NEXT: vmv1r.v v26, v20
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v23, v10
+; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: add a5, a4, a2
+; RV32-NEXT: vmv1r.v v25, v14
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vmv1r.v v18, v11
+; RV32-NEXT: vsseg5e64.v v22, (a0)
+; RV32-NEXT: vmv1r.v v20, v15
+; RV32-NEXT: vsseg5e64.v v17, (a1)
+; RV32-NEXT: vl1re64.v v16, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v17, (a6)
+; RV32-NEXT: add a6, a3, a2
+; RV32-NEXT: vl1re64.v v10, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v11, (a6)
+; RV32-NEXT: vl1re64.v v8, (a0)
+; RV32-NEXT: vl1re64.v v9, (a3)
+; RV32-NEXT: vl1re64.v v14, (a4)
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a3, 10
+; RV32-NEXT: mul a0, a0, a3
+; RV32-NEXT: add a0, sp, a0
+; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re64.v v15, (a5)
+; RV32-NEXT: vl1re64.v v12, (a6)
+; RV32-NEXT: vl1re64.v v13, (a1)
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a0, a2
+; RV32-NEXT: vs2r.v v16, (a2)
+; RV32-NEXT: vs8r.v v8, (a0)
+; RV32-NEXT: vl8re64.v v16, (a2)
+; RV32-NEXT: vl8re64.v v8, (a0)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
+;
+; RV64-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 28
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv2r.v v20, v16
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv2r.v v18, v12
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: slli a2, a1, 2
+; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv2r.v v16, v8
+; RV64-NEXT: vmv2r.v v22, v16
+; RV64-NEXT: vmv2r.v v24, v18
+; RV64-NEXT: vmv1r.v v26, v20
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v23, v10
+; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: add a5, a4, a2
+; RV64-NEXT: vmv1r.v v25, v14
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vmv1r.v v18, v11
+; RV64-NEXT: vsseg5e64.v v22, (a0)
+; RV64-NEXT: vmv1r.v v20, v15
+; RV64-NEXT: vsseg5e64.v v17, (a1)
+; RV64-NEXT: vl1re64.v v16, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v17, (a6)
+; RV64-NEXT: add a6, a3, a2
+; RV64-NEXT: vl1re64.v v10, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v11, (a6)
+; RV64-NEXT: vl1re64.v v8, (a0)
+; RV64-NEXT: vl1re64.v v9, (a3)
+; RV64-NEXT: vl1re64.v v14, (a4)
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a3, 10
+; RV64-NEXT: mul a0, a0, a3
+; RV64-NEXT: add a0, sp, a0
+; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re64.v v15, (a5)
+; RV64-NEXT: vl1re64.v v12, (a6)
+; RV64-NEXT: vl1re64.v v13, (a1)
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a0, a2
+; RV64-NEXT: vs2r.v v16, (a2)
+; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: vl8re64.v v16, (a2)
+; RV64-NEXT: vl8re64.v v8, (a0)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a1, 28
+; ZVBB-RV32-NEXT: mul a0, a0, a1
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v20, v16
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv2r.v v18, v12
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: slli a2, a1, 2
+; ZVBB-RV32-NEXT: add a1, a2, a1
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv2r.v v16, v8
+; ZVBB-RV32-NEXT: vmv2r.v v22, v16
+; ZVBB-RV32-NEXT: vmv2r.v v24, v18
+; ZVBB-RV32-NEXT: vmv1r.v v26, v20
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v23, v10
+; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: add a5, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v25, v14
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vmv1r.v v18, v11
+; ZVBB-RV32-NEXT: vsseg5e64.v v22, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v20, v15
+; ZVBB-RV32-NEXT: vsseg5e64.v v17, (a1)
+; ZVBB-RV32-NEXT: vl1re64.v v16, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v17, (a6)
+; ZVBB-RV32-NEXT: add a6, a3, a2
+; ZVBB-RV32-NEXT: vl1re64.v v10, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v11, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re64.v v9, (a3)
+; ZVBB-RV32-NEXT: vl1re64.v v14, (a4)
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a3, 10
+; ZVBB-RV32-NEXT: mul a0, a0, a3
+; ZVBB-RV32-NEXT: add a0, sp, a0
+; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v15, (a5)
+; ZVBB-RV32-NEXT: vl1re64.v v12, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v13, (a1)
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a0, a2
+; ZVBB-RV32-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re64.v v8, (a0)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv24bf16_nxv8bf16:
-; ZVBB: # %bb.0:
-; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: li a1, 6
-; ZVBB-NEXT: mul a0, a0, a1
-; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: addi a0, sp, 16
-; ZVBB-NEXT: csrr a1, vlenb
-; ZVBB-NEXT: slli a1, a1, 1
-; ZVBB-NEXT: vsetvli a2, zero, e16, m2, ta, ma
-; ZVBB-NEXT: vsseg3e16.v v8, (a0)
-; ZVBB-NEXT: vl2re16.v v8, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl2re16.v v10, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl2re16.v v12, (a0)
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: li a1, 6
-; ZVBB-NEXT: mul a0, a0, a1
-; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: ret
- %res = call <vscale x 24 x bfloat> @llvm.vector.interleave3.nxv24bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2)
- ret <vscale x 24 x bfloat> %res
-}
-
-define <vscale x 3 x float> @vector_interleave_nxv3f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv3f32_nxv1f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: srli a2, a1, 1
-; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vsseg3e32.v v8, (a0)
-; CHECK-NEXT: add a3, a0, a2
-; CHECK-NEXT: vle32.v v9, (a3)
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: srli a1, a1, 3
-; CHECK-NEXT: add a0, a1, a1
-; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v9, a1
-; CHECK-NEXT: add a2, a3, a2
-; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v9, (a2)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+; ZVBB-RV64-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a1, 28
+; ZVBB-RV64-NEXT: mul a0, a0, a1
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v20, v16
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv2r.v v18, v12
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: slli a2, a1, 2
+; ZVBB-RV64-NEXT: add a1, a2, a1
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv2r.v v16, v8
+; ZVBB-RV64-NEXT: vmv2r.v v22, v16
+; ZVBB-RV64-NEXT: vmv2r.v v24, v18
+; ZVBB-RV64-NEXT: vmv1r.v v26, v20
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v23, v10
+; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: add a5, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v25, v14
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vmv1r.v v18, v11
+; ZVBB-RV64-NEXT: vsseg5e64.v v22, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v20, v15
+; ZVBB-RV64-NEXT: vsseg5e64.v v17, (a1)
+; ZVBB-RV64-NEXT: vl1re64.v v16, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v17, (a6)
+; ZVBB-RV64-NEXT: add a6, a3, a2
+; ZVBB-RV64-NEXT: vl1re64.v v10, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v11, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re64.v v9, (a3)
+; ZVBB-RV64-NEXT: vl1re64.v v14, (a4)
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a3, 10
+; ZVBB-RV64-NEXT: mul a0, a0, a3
+; ZVBB-RV64-NEXT: add a0, sp, a0
+; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v15, (a5)
+; ZVBB-RV64-NEXT: vl1re64.v v12, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v13, (a1)
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a0, a2
+; ZVBB-RV64-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re64.v v8, (a0)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv3f32_nxv1f32:
-; ZVBB: # %bb.0:
-; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a0, a0, 1
-; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: addi a0, sp, 16
-; ZVBB-NEXT: csrr a1, vlenb
-; ZVBB-NEXT: srli a2, a1, 1
-; ZVBB-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
-; ZVBB-NEXT: vsseg3e32.v v8, (a0)
-; ZVBB-NEXT: add a3, a0, a2
-; ZVBB-NEXT: vle32.v v9, (a3)
-; ZVBB-NEXT: vle32.v v8, (a0)
-; ZVBB-NEXT: srli a1, a1, 3
-; ZVBB-NEXT: add a0, a1, a1
-; ZVBB-NEXT: vsetvli zero, a0, e32, m1, ta, ma
-; ZVBB-NEXT: vslideup.vx v8, v9, a1
-; ZVBB-NEXT: add a2, a3, a2
-; ZVBB-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
-; ZVBB-NEXT: vle32.v v9, (a2)
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a0, a0, 1
-; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: ret
- %res = call <vscale x 3 x float> @llvm.vector.interleave3.nxv3f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2)
- ret <vscale x 3 x float> %res
+; ZIP-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; ZIP: # %bb.0:
+; ZIP-NEXT: addi sp, sp, -80
+; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT: addi s0, sp, 80
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: li a1, 28
+; ZIP-NEXT: mul a0, a0, a1
+; ZIP-NEXT: sub sp, sp, a0
+; ZIP-NEXT: andi sp, sp, -64
+; ZIP-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZIP-NEXT: vmv2r.v v20, v16
+; ZIP-NEXT: addi a0, sp, 64
+; ZIP-NEXT: vmv2r.v v18, v12
+; ZIP-NEXT: csrr a1, vlenb
+; ZIP-NEXT: slli a2, a1, 2
+; ZIP-NEXT: add a1, a2, a1
+; ZIP-NEXT: add a1, sp, a1
+; ZIP-NEXT: addi a1, a1, 64
+; ZIP-NEXT: csrr a2, vlenb
+; ZIP-NEXT: vmv2r.v v16, v8
+; ZIP-NEXT: vmv2r.v v22, v16
+; ZIP-NEXT: vmv2r.v v24, v18
+; ZIP-NEXT: vmv1r.v v26, v20
+; ZIP-NEXT: add a3, a0, a2
+; ZIP-NEXT: vmv1r.v v23, v10
+; ZIP-NEXT: add a4, a1, a2
+; ZIP-NEXT: add a5, a4, a2
+; ZIP-NEXT: vmv1r.v v25, v14
+; ZIP-NEXT: add a6, a5, a2
+; ZIP-NEXT: vmv1r.v v18, v11
+; ZIP-NEXT: vsseg5e64.v v22, (a0)
+; ZIP-NEXT: vmv1r.v v20, v15
+; ZIP-NEXT: vsseg5e64.v v17, (a1)
+; ZIP-NEXT: vl1re64.v v16, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re64.v v17, (a6)
+; ZIP-NEXT: add a6, a3, a2
+; ZIP-NEXT: vl1re64.v v10, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re64.v v11, (a6)
+; ZIP-NEXT: vl1re64.v v8, (a0)
+; ZIP-NEXT: vl1re64.v v9, (a3)
+; ZIP-NEXT: vl1re64.v v14, (a4)
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: li a3, 10
+; ZIP-NEXT: mul a0, a0, a3
+; ZIP-NEXT: add a0, sp, a0
+; ZIP-NEXT: addi a0, a0, 64
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re64.v v15, (a5)
+; ZIP-NEXT: vl1re64.v v12, (a6)
+; ZIP-NEXT: vl1re64.v v13, (a1)
+; ZIP-NEXT: slli a2, a2, 3
+; ZIP-NEXT: add a2, a0, a2
+; ZIP-NEXT: vs2r.v v16, (a2)
+; ZIP-NEXT: vs8r.v v8, (a0)
+; ZIP-NEXT: vl8re64.v v16, (a2)
+; ZIP-NEXT: vl8re64.v v8, (a0)
+; ZIP-NEXT: addi sp, s0, -80
+; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT: addi sp, sp, 80
+; ZIP-NEXT: ret
+ %res = call <vscale x 10 x double> @llvm.vector.interleave5.nxv10f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4)
+ ret <vscale x 10 x double> %res
}
-define <vscale x 6 x float> @vector_interleave_nxv6f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv6f32_nxv2f32:
+define <vscale x 12 x half> @vector_interleave_nxv12f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5) nounwind {
+; CHECK-LABEL: vector_interleave_nxv12f16_nxv2f16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
@@ -4757,13 +9785,30 @@ define <vscale x 6 x float> @vector_interleave_nxv6f32_nxv2f32(<vscale x 2 x flo
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT: vsseg3e32.v v8, (a0)
-; CHECK-NEXT: vl1re32.v v8, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl1re32.v v9, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl1re32.v v10, (a0)
+; CHECK-NEXT: srli a2, a1, 1
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: add a5, a4, a2
+; CHECK-NEXT: vsetvli a6, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsseg6e16.v v8, (a0)
+; CHECK-NEXT: add a6, a5, a2
+; CHECK-NEXT: add a2, a6, a2
+; CHECK-NEXT: vle16.v v10, (a6)
+; CHECK-NEXT: vle16.v v8, (a2)
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: vle16.v v11, (a5)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v10, v8, a1
+; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v9, (a4)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v11, a1
+; CHECK-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v11, (a3)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v11, a1
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a1, a0, 1
; CHECK-NEXT: add a0, a1, a0
@@ -4771,7 +9816,7 @@ define <vscale x 6 x float> @vector_interleave_nxv6f32_nxv2f32(<vscale x 2 x flo
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv6f32_nxv2f32:
+; ZVBB-LABEL: vector_interleave_nxv12f16_nxv2f16:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
@@ -4780,25 +9825,42 @@ define <vscale x 6 x float> @vector_interleave_nxv6f32_nxv2f32(<vscale x 2 x flo
; ZVBB-NEXT: sub sp, sp, a0
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
-; ZVBB-NEXT: vsetvli a2, zero, e32, m1, ta, ma
-; ZVBB-NEXT: vsseg3e32.v v8, (a0)
-; ZVBB-NEXT: vl1re32.v v8, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl1re32.v v9, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl1re32.v v10, (a0)
+; ZVBB-NEXT: srli a2, a1, 1
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: add a4, a3, a2
+; ZVBB-NEXT: add a5, a4, a2
+; ZVBB-NEXT: vsetvli a6, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vsseg6e16.v v8, (a0)
+; ZVBB-NEXT: add a6, a5, a2
+; ZVBB-NEXT: add a2, a6, a2
+; ZVBB-NEXT: vle16.v v10, (a6)
+; ZVBB-NEXT: vle16.v v8, (a2)
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a2, a1, a1
+; ZVBB-NEXT: vle16.v v11, (a5)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v10, v8, a1
+; ZVBB-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v9, (a4)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v9, v11, a1
+; ZVBB-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v11, (a3)
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v11, a1
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a1, a0, 1
; ZVBB-NEXT: add a0, a1, a0
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 6 x float> @llvm.vector.interleave3.nxv6f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2)
- ret <vscale x 6 x float> %res
+ %res = call <vscale x 12 x half> @llvm.vector.interleave6.nxv12f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5)
+ ret <vscale x 12 x half> %res
}
-define <vscale x 12 x float> @vector_interleave_nxv12f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv12f32_nxv4f32:
+define <vscale x 24 x half> @vector_interleave_nxv24f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5) nounwind {
+; CHECK-LABEL: vector_interleave_nxv24f16_nxv4f16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
@@ -4807,14 +9869,19 @@ define <vscale x 12 x float> @vector_interleave_nxv12f32_nxv4f32(<vscale x 4 x f
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; CHECK-NEXT: vsseg3e32.v v8, (a0)
-; CHECK-NEXT: vl2re32.v v8, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl2re32.v v10, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl2re32.v v12, (a0)
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsseg6e16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v10, (a3)
+; CHECK-NEXT: add a3, a3, a1
+; CHECK-NEXT: vl1re16.v v11, (a3)
+; CHECK-NEXT: add a3, a3, a1
+; CHECK-NEXT: vl1re16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v9, (a2)
+; CHECK-NEXT: vl1re16.v v12, (a3)
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vl1re16.v v13, (a1)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 6
; CHECK-NEXT: mul a0, a0, a1
@@ -4822,7 +9889,7 @@ define <vscale x 12 x float> @vector_interleave_nxv12f32_nxv4f32(<vscale x 4 x f
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv12f32_nxv4f32:
+; ZVBB-LABEL: vector_interleave_nxv24f16_nxv4f16:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
@@ -4831,128 +9898,400 @@ define <vscale x 12 x float> @vector_interleave_nxv12f32_nxv4f32(<vscale x 4 x f
; ZVBB-NEXT: sub sp, sp, a0
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
-; ZVBB-NEXT: slli a1, a1, 1
-; ZVBB-NEXT: vsetvli a2, zero, e32, m2, ta, ma
-; ZVBB-NEXT: vsseg3e32.v v8, (a0)
-; ZVBB-NEXT: vl2re32.v v8, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl2re32.v v10, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl2re32.v v12, (a0)
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vsseg6e16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v10, (a3)
+; ZVBB-NEXT: add a3, a3, a1
+; ZVBB-NEXT: vl1re16.v v11, (a3)
+; ZVBB-NEXT: add a3, a3, a1
+; ZVBB-NEXT: vl1re16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v9, (a2)
+; ZVBB-NEXT: vl1re16.v v12, (a3)
+; ZVBB-NEXT: add a1, a3, a1
+; ZVBB-NEXT: vl1re16.v v13, (a1)
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: li a1, 6
; ZVBB-NEXT: mul a0, a0, a1
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 12 x float> @llvm.vector.interleave3.nxv12f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2)
- ret <vscale x 12 x float> %res
+ %res = call <vscale x 24 x half> @llvm.vector.interleave6.nxv24f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5)
+ ret <vscale x 24 x half> %res
}
-define <vscale x 3 x double> @vector_interleave_nxv3f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv3f64_nxv1f64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 1
-; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT: vsseg3e64.v v8, (a0)
-; CHECK-NEXT: vl1re64.v v8, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl1re64.v v9, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl1re64.v v10, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 1
-; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+define <vscale x 48 x half> @vector_interleave_nxv48f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5) nounwind {
+; RV32-LABEL: vector_interleave_nxv48f16_nxv8f16:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -80
+; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 80
+; RV32-NEXT: csrr a0, vlenb
+; RV32-NEXT: li a1, 28
+; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: sub sp, sp, a0
+; RV32-NEXT: andi sp, sp, -64
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv2r.v v20, v14
+; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: vmv2r.v v24, v10
+; RV32-NEXT: csrr a1, vlenb
+; RV32-NEXT: li a0, 6
+; RV32-NEXT: mul a1, a1, a0
+; RV32-NEXT: add a1, sp, a1
+; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: vmv1r.v v10, v25
+; RV32-NEXT: vmv1r.v v11, v23
+; RV32-NEXT: vmv1r.v v12, v21
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv1r.v v13, v17
+; RV32-NEXT: csrr a2, vlenb
+; RV32-NEXT: vmv1r.v v14, v19
+; RV32-NEXT: vsseg6e16.v v9, (a1)
+; RV32-NEXT: vmv1r.v v9, v24
+; RV32-NEXT: add a5, a1, a2
+; RV32-NEXT: vmv1r.v v10, v22
+; RV32-NEXT: add a3, a0, a2
+; RV32-NEXT: vmv1r.v v11, v20
+; RV32-NEXT: add a4, a3, a2
+; RV32-NEXT: vmv1r.v v12, v16
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vmv1r.v v13, v18
+; RV32-NEXT: vsseg6e16.v v8, (a0)
+; RV32-NEXT: vl1re16.v v14, (a1)
+; RV32-NEXT: add a1, a6, a2
+; RV32-NEXT: vl1re16.v v15, (a5)
+; RV32-NEXT: add a5, a1, a2
+; RV32-NEXT: vl1re16.v v18, (a5)
+; RV32-NEXT: add a5, a5, a2
+; RV32-NEXT: vl1re16.v v19, (a5)
+; RV32-NEXT: add a5, a4, a2
+; RV32-NEXT: vl1re16.v v16, (a6)
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vl1re16.v v12, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v13, (a6)
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: li a7, 12
+; RV32-NEXT: mul a6, a6, a7
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 64
+; RV32-NEXT: vl1re16.v v17, (a1)
+; RV32-NEXT: vl1re16.v v10, (a4)
+; RV32-NEXT: vl1re16.v v11, (a5)
+; RV32-NEXT: vl1re16.v v8, (a0)
+; RV32-NEXT: vl1re16.v v9, (a3)
+; RV32-NEXT: slli a2, a2, 3
+; RV32-NEXT: add a2, a6, a2
+; RV32-NEXT: vs4r.v v16, (a2)
+; RV32-NEXT: vs8r.v v8, (a6)
+; RV32-NEXT: vl8re16.v v16, (a2)
+; RV32-NEXT: vl8re16.v v8, (a6)
+; RV32-NEXT: addi sp, s0, -80
+; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT: addi sp, sp, 80
+; RV32-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv3f64_nxv1f64:
-; ZVBB: # %bb.0:
-; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 1
-; ZVBB-NEXT: add a0, a1, a0
-; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: addi a0, sp, 16
-; ZVBB-NEXT: csrr a1, vlenb
-; ZVBB-NEXT: vsetvli a2, zero, e64, m1, ta, ma
-; ZVBB-NEXT: vsseg3e64.v v8, (a0)
-; ZVBB-NEXT: vl1re64.v v8, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl1re64.v v9, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl1re64.v v10, (a0)
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 1
-; ZVBB-NEXT: add a0, a1, a0
-; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: ret
- %res = call <vscale x 3 x double> @llvm.vector.interleave3.nxv3f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2)
- ret <vscale x 3 x double> %res
-}
-
-define <vscale x 6 x double> @vector_interleave_nxv6f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv6f64_nxv2f64:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 6
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: slli a1, a1, 1
-; CHECK-NEXT: vsetvli a2, zero, e64, m2, ta, ma
-; CHECK-NEXT: vsseg3e64.v v8, (a0)
-; CHECK-NEXT: vl2re64.v v8, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl2re64.v v10, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl2re64.v v12, (a0)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: li a1, 6
-; CHECK-NEXT: mul a0, a0, a1
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
+; RV64-LABEL: vector_interleave_nxv48f16_nxv8f16:
+; RV64: # %bb.0:
+; RV64-NEXT: addi sp, sp, -80
+; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT: addi s0, sp, 80
+; RV64-NEXT: csrr a0, vlenb
+; RV64-NEXT: li a1, 28
+; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: sub sp, sp, a0
+; RV64-NEXT: andi sp, sp, -64
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv2r.v v20, v14
+; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: vmv2r.v v24, v10
+; RV64-NEXT: csrr a1, vlenb
+; RV64-NEXT: li a0, 6
+; RV64-NEXT: mul a1, a1, a0
+; RV64-NEXT: add a1, sp, a1
+; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: vmv1r.v v10, v25
+; RV64-NEXT: vmv1r.v v11, v23
+; RV64-NEXT: vmv1r.v v12, v21
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv1r.v v13, v17
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv1r.v v14, v19
+; RV64-NEXT: vsseg6e16.v v9, (a1)
+; RV64-NEXT: vmv1r.v v9, v24
+; RV64-NEXT: add a5, a1, a2
+; RV64-NEXT: vmv1r.v v10, v22
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v11, v20
+; RV64-NEXT: add a4, a3, a2
+; RV64-NEXT: vmv1r.v v12, v16
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vmv1r.v v13, v18
+; RV64-NEXT: vsseg6e16.v v8, (a0)
+; RV64-NEXT: vl1re16.v v14, (a1)
+; RV64-NEXT: add a1, a6, a2
+; RV64-NEXT: vl1re16.v v15, (a5)
+; RV64-NEXT: add a5, a1, a2
+; RV64-NEXT: vl1re16.v v18, (a5)
+; RV64-NEXT: add a5, a5, a2
+; RV64-NEXT: vl1re16.v v19, (a5)
+; RV64-NEXT: add a5, a4, a2
+; RV64-NEXT: vl1re16.v v16, (a6)
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vl1re16.v v12, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v13, (a6)
+; RV64-NEXT: csrr a6, vlenb
+; RV64-NEXT: li a7, 12
+; RV64-NEXT: mul a6, a6, a7
+; RV64-NEXT: add a6, sp, a6
+; RV64-NEXT: addi a6, a6, 64
+; RV64-NEXT: vl1re16.v v17, (a1)
+; RV64-NEXT: vl1re16.v v10, (a4)
+; RV64-NEXT: vl1re16.v v11, (a5)
+; RV64-NEXT: vl1re16.v v8, (a0)
+; RV64-NEXT: vl1re16.v v9, (a3)
+; RV64-NEXT: slli a2, a2, 3
+; RV64-NEXT: add a2, a6, a2
+; RV64-NEXT: vs4r.v v16, (a2)
+; RV64-NEXT: vs8r.v v8, (a6)
+; RV64-NEXT: vl8re16.v v16, (a2)
+; RV64-NEXT: vl8re16.v v8, (a6)
+; RV64-NEXT: addi sp, s0, -80
+; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT: addi sp, sp, 80
+; RV64-NEXT: ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv48f16_nxv8f16:
+; ZVBB-RV32: # %bb.0:
+; ZVBB-RV32-NEXT: addi sp, sp, -80
+; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT: addi s0, sp, 80
+; ZVBB-RV32-NEXT: csrr a0, vlenb
+; ZVBB-RV32-NEXT: li a1, 28
+; ZVBB-RV32-NEXT: mul a0, a0, a1
+; ZVBB-RV32-NEXT: sub sp, sp, a0
+; ZVBB-RV32-NEXT: andi sp, sp, -64
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v20, v14
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: vmv2r.v v24, v10
+; ZVBB-RV32-NEXT: csrr a1, vlenb
+; ZVBB-RV32-NEXT: li a0, 6
+; ZVBB-RV32-NEXT: mul a1, a1, a0
+; ZVBB-RV32-NEXT: add a1, sp, a1
+; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: vmv1r.v v10, v25
+; ZVBB-RV32-NEXT: vmv1r.v v11, v23
+; ZVBB-RV32-NEXT: vmv1r.v v12, v21
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv1r.v v13, v17
+; ZVBB-RV32-NEXT: csrr a2, vlenb
+; ZVBB-RV32-NEXT: vmv1r.v v14, v19
+; ZVBB-RV32-NEXT: vsseg6e16.v v9, (a1)
+; ZVBB-RV32-NEXT: vmv1r.v v9, v24
+; ZVBB-RV32-NEXT: add a5, a1, a2
+; ZVBB-RV32-NEXT: vmv1r.v v10, v22
+; ZVBB-RV32-NEXT: add a3, a0, a2
+; ZVBB-RV32-NEXT: vmv1r.v v11, v20
+; ZVBB-RV32-NEXT: add a4, a3, a2
+; ZVBB-RV32-NEXT: vmv1r.v v12, v16
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vmv1r.v v13, v18
+; ZVBB-RV32-NEXT: vsseg6e16.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re16.v v14, (a1)
+; ZVBB-RV32-NEXT: add a1, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v15, (a5)
+; ZVBB-RV32-NEXT: add a5, a1, a2
+; ZVBB-RV32-NEXT: vl1re16.v v18, (a5)
+; ZVBB-RV32-NEXT: add a5, a5, a2
+; ZVBB-RV32-NEXT: vl1re16.v v19, (a5)
+; ZVBB-RV32-NEXT: add a5, a4, a2
+; ZVBB-RV32-NEXT: vl1re16.v v16, (a6)
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vl1re16.v v12, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v13, (a6)
+; ZVBB-RV32-NEXT: csrr a6, vlenb
+; ZVBB-RV32-NEXT: li a7, 12
+; ZVBB-RV32-NEXT: mul a6, a6, a7
+; ZVBB-RV32-NEXT: add a6, sp, a6
+; ZVBB-RV32-NEXT: addi a6, a6, 64
+; ZVBB-RV32-NEXT: vl1re16.v v17, (a1)
+; ZVBB-RV32-NEXT: vl1re16.v v10, (a4)
+; ZVBB-RV32-NEXT: vl1re16.v v11, (a5)
+; ZVBB-RV32-NEXT: vl1re16.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re16.v v9, (a3)
+; ZVBB-RV32-NEXT: slli a2, a2, 3
+; ZVBB-RV32-NEXT: add a2, a6, a2
+; ZVBB-RV32-NEXT: vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a6)
+; ZVBB-RV32-NEXT: vl8re16.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re16.v v8, (a6)
+; ZVBB-RV32-NEXT: addi sp, s0, -80
+; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT: addi sp, sp, 80
+; ZVBB-RV32-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv6f64_nxv2f64:
-; ZVBB: # %bb.0:
-; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: li a1, 6
-; ZVBB-NEXT: mul a0, a0, a1
-; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: addi a0, sp, 16
-; ZVBB-NEXT: csrr a1, vlenb
-; ZVBB-NEXT: slli a1, a1, 1
-; ZVBB-NEXT: vsetvli a2, zero, e64, m2, ta, ma
-; ZVBB-NEXT: vsseg3e64.v v8, (a0)
-; ZVBB-NEXT: vl2re64.v v8, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl2re64.v v10, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl2re64.v v12, (a0)
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: li a1, 6
-; ZVBB-NEXT: mul a0, a0, a1
-; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: ret
- %res = call <vscale x 6 x double> @llvm.vector.interleave3.nxv6f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2)
- ret <vscale x 6 x double> %res
+; ZVBB-RV64-LABEL: vector_interleave_nxv48f16_nxv8f16:
+; ZVBB-RV64: # %bb.0:
+; ZVBB-RV64-NEXT: addi sp, sp, -80
+; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT: addi s0, sp, 80
+; ZVBB-RV64-NEXT: csrr a0, vlenb
+; ZVBB-RV64-NEXT: li a1, 28
+; ZVBB-RV64-NEXT: mul a0, a0, a1
+; ZVBB-RV64-NEXT: sub sp, sp, a0
+; ZVBB-RV64-NEXT: andi sp, sp, -64
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v20, v14
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: vmv2r.v v24, v10
+; ZVBB-RV64-NEXT: csrr a1, vlenb
+; ZVBB-RV64-NEXT: li a0, 6
+; ZVBB-RV64-NEXT: mul a1, a1, a0
+; ZVBB-RV64-NEXT: add a1, sp, a1
+; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: vmv1r.v v10, v25
+; ZVBB-RV64-NEXT: vmv1r.v v11, v23
+; ZVBB-RV64-NEXT: vmv1r.v v12, v21
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv1r.v v13, v17
+; ZVBB-RV64-NEXT: csrr a2, vlenb
+; ZVBB-RV64-NEXT: vmv1r.v v14, v19
+; ZVBB-RV64-NEXT: vsseg6e16.v v9, (a1)
+; ZVBB-RV64-NEXT: vmv1r.v v9, v24
+; ZVBB-RV64-NEXT: add a5, a1, a2
+; ZVBB-RV64-NEXT: vmv1r.v v10, v22
+; ZVBB-RV64-NEXT: add a3, a0, a2
+; ZVBB-RV64-NEXT: vmv1r.v v11, v20
+; ZVBB-RV64-NEXT: add a4, a3, a2
+; ZVBB-RV64-NEXT: vmv1r.v v12, v16
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vmv1r.v v13, v18
+; ZVBB-RV64-NEXT: vsseg6e16.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re16.v v14, (a1)
+; ZVBB-RV64-NEXT: add a1, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v15, (a5)
+; ZVBB-RV64-NEXT: add a5, a1, a2
+; ZVBB-RV64-NEXT: vl1re16.v v18, (a5)
+; ZVBB-RV64-NEXT: add a5, a5, a2
+; ZVBB-RV64-NEXT: vl1re16.v v19, (a5)
+; ZVBB-RV64-NEXT: add a5, a4, a2
+; ZVBB-RV64-NEXT: vl1re16.v v16, (a6)
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vl1re16.v v12, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v13, (a6)
+; ZVBB-RV64-NEXT: csrr a6, vlenb
+; ZVBB-RV64-NEXT: li a7, 12
+; ZVBB-RV64-NEXT: mul a6, a6, a7
+; ZVBB-RV64-NEXT: add a6, sp, a6
+; ZVBB-RV64-NEXT: addi a6, a6, 64
+; ZVBB-RV64-NEXT: vl1re16.v v17, (a1)
+; ZVBB-RV64-NEXT: vl1re16.v v10, (a4)
+; ZVBB-RV64-NEXT: vl1re16.v v11, (a5)
+; ZVBB-RV64-NEXT: vl1re16.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re16.v v9, (a3)
+; ZVBB-RV64-NEXT: slli a2, a2, 3
+; ZVBB-RV64-NEXT: add a2, a6, a2
+; ZVBB-RV64-NEXT: vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a6)
+; ZVBB-RV64-NEXT: vl8re16.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re16.v v8, (a6)
+; ZVBB-RV64-NEXT: addi sp, s0, -80
+; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT: addi sp, sp, 80
+; ZVBB-RV64-NEXT: ret
+;
+; ZIP-LABEL: vector_interleave_nxv48f16_nxv8f16:
+; ZIP: # %bb.0:
+; ZIP-NEXT: addi sp, sp, -80
+; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT: addi s0, sp, 80
+; ZIP-NEXT: csrr a0, vlenb
+; ZIP-NEXT: li a1, 28
+; ZIP-NEXT: mul a0, a0, a1
+; ZIP-NEXT: sub sp, sp, a0
+; ZIP-NEXT: andi sp, sp, -64
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: vmv2r.v v20, v14
+; ZIP-NEXT: vmv2r.v v22, v12
+; ZIP-NEXT: vmv2r.v v24, v10
+; ZIP-NEXT: csrr a1, vlenb
+; ZIP-NEXT: li a0, 6
+; ZIP-NEXT: mul a1, a1, a0
+; ZIP-NEXT: add a1, sp, a1
+; ZIP-NEXT: addi a1, a1, 64
+; ZIP-NEXT: vmv1r.v v10, v25
+; ZIP-NEXT: vmv1r.v v11, v23
+; ZIP-NEXT: vmv1r.v v12, v21
+; ZIP-NEXT: addi a0, sp, 64
+; ZIP-NEXT: vmv1r.v v13, v17
+; ZIP-NEXT: csrr a2, vlenb
+; ZIP-NEXT: vmv1r.v v14, v19
+; ZIP-NEXT: vsseg6e16.v v9, (a1)
+; ZIP-NEXT: vmv1r.v v9, v24
+; ZIP-NEXT: add a5, a1, a2
+; ZIP-NEXT: vmv1r.v v10, v22
+; ZIP-NEXT: add a3, a0, a2
+; ZIP-NEXT: vmv1r.v v11, v20
+; ZIP-NEXT: add a4, a3, a2
+; ZIP-NEXT: vmv1r.v v12, v16
+; ZIP-NEXT: add a6, a5, a2
+; ZIP-NEXT: vmv1r.v v13, v18
+; ZIP-NEXT: vsseg6e16.v v8, (a0)
+; ZIP-NEXT: vl1re16.v v14, (a1)
+; ZIP-NEXT: add a1, a6, a2
+; ZIP-NEXT: vl1re16.v v15, (a5)
+; ZIP-NEXT: add a5, a1, a2
+; ZIP-NEXT: vl1re16.v v18, (a5)
+; ZIP-NEXT: add a5, a5, a2
+; ZIP-NEXT: vl1re16.v v19, (a5)
+; ZIP-NEXT: add a5, a4, a2
+; ZIP-NEXT: vl1re16.v v16, (a6)
+; ZIP-NEXT: add a6, a5, a2
+; ZIP-NEXT: vl1re16.v v12, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v13, (a6)
+; ZIP-NEXT: csrr a6, vlenb
+; ZIP-NEXT: li a7, 12
+; ZIP-NEXT: mul a6, a6, a7
+; ZIP-NEXT: add a6, sp, a6
+; ZIP-NEXT: addi a6, a6, 64
+; ZIP-NEXT: vl1re16.v v17, (a1)
+; ZIP-NEXT: vl1re16.v v10, (a4)
+; ZIP-NEXT: vl1re16.v v11, (a5)
+; ZIP-NEXT: vl1re16.v v8, (a0)
+; ZIP-NEXT: vl1re16.v v9, (a3)
+; ZIP-NEXT: slli a2, a2, 3
+; ZIP-NEXT: add a2, a6, a2
+; ZIP-NEXT: vs4r.v v16, (a2)
+; ZIP-NEXT: vs8r.v v8, (a6)
+; ZIP-NEXT: vl8re16.v v16, (a2)
+; ZIP-NEXT: vl8re16.v v8, (a6)
+; ZIP-NEXT: addi sp, s0, -80
+; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT: addi sp, sp, 80
+; ZIP-NEXT: ret
+ %res = call <vscale x 48 x half> @llvm.vector.interleave6.nxv48f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5)
+ ret <vscale x 48 x half> %res
}
-define <vscale x 10 x half> @vector_interleave_nxv10f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4) nounwind {
-; CHECK-LABEL: vector_interleave_nxv10f16_nxv2f16:
+define <vscale x 12 x bfloat> @vector_interleave_nxv12bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5) nounwind {
+; CHECK-LABEL: vector_interleave_nxv12bf16_nxv2bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
@@ -4964,23 +10303,27 @@ define <vscale x 10 x half> @vector_interleave_nxv10f16_nxv2f16(<vscale x 2 x ha
; CHECK-NEXT: srli a2, a1, 1
; CHECK-NEXT: add a3, a0, a2
; CHECK-NEXT: add a4, a3, a2
-; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vsseg5e16.v v8, (a0)
; CHECK-NEXT: add a5, a4, a2
-; CHECK-NEXT: vle16.v v8, (a5)
-; CHECK-NEXT: vle16.v v9, (a4)
+; CHECK-NEXT: vsetvli a6, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsseg6e16.v v8, (a0)
+; CHECK-NEXT: add a6, a5, a2
+; CHECK-NEXT: add a2, a6, a2
+; CHECK-NEXT: vle16.v v10, (a6)
+; CHECK-NEXT: vle16.v v8, (a2)
; CHECK-NEXT: srli a1, a1, 2
-; CHECK-NEXT: add a4, a1, a1
-; CHECK-NEXT: vle16.v v10, (a3)
-; CHECK-NEXT: vsetvli zero, a4, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v9, v8, a1
-; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: vle16.v v11, (a5)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v10, v8, a1
+; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v9, (a4)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v11, a1
+; CHECK-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v11, (a3)
; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vsetvli zero, a4, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v10, a1
-; CHECK-NEXT: add a2, a5, a2
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vle16.v v10, (a2)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v11, a1
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a1, a0, 1
; CHECK-NEXT: add a0, a1, a0
@@ -4988,7 +10331,7 @@ define <vscale x 10 x half> @vector_interleave_nxv10f16_nxv2f16(<vscale x 2 x ha
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv10f16_nxv2f16:
+; ZVBB-LABEL: vector_interleave_nxv12bf16_nxv2bf16:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
@@ -5000,93 +10343,101 @@ define <vscale x 10 x half> @vector_interleave_nxv10f16_nxv2f16(<vscale x 2 x ha
; ZVBB-NEXT: srli a2, a1, 1
; ZVBB-NEXT: add a3, a0, a2
; ZVBB-NEXT: add a4, a3, a2
-; ZVBB-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
-; ZVBB-NEXT: vsseg5e16.v v8, (a0)
; ZVBB-NEXT: add a5, a4, a2
-; ZVBB-NEXT: vle16.v v8, (a5)
-; ZVBB-NEXT: vle16.v v9, (a4)
+; ZVBB-NEXT: vsetvli a6, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vsseg6e16.v v8, (a0)
+; ZVBB-NEXT: add a6, a5, a2
+; ZVBB-NEXT: add a2, a6, a2
+; ZVBB-NEXT: vle16.v v10, (a6)
+; ZVBB-NEXT: vle16.v v8, (a2)
; ZVBB-NEXT: srli a1, a1, 2
-; ZVBB-NEXT: add a4, a1, a1
-; ZVBB-NEXT: vle16.v v10, (a3)
-; ZVBB-NEXT: vsetvli zero, a4, e16, m1, ta, ma
-; ZVBB-NEXT: vslideup.vx v9, v8, a1
-; ZVBB-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: add a2, a1, a1
+; ZVBB-NEXT: vle16.v v11, (a5)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v10, v8, a1
+; ZVBB-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v9, (a4)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v9, v11, a1
+; ZVBB-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v11, (a3)
; ZVBB-NEXT: vle16.v v8, (a0)
-; ZVBB-NEXT: vsetvli zero, a4, e16, m1, ta, ma
-; ZVBB-NEXT: vslideup.vx v8, v10, a1
-; ZVBB-NEXT: add a2, a5, a2
-; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; ZVBB-NEXT: vle16.v v10, (a2)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v11, a1
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a1, a0, 1
; ZVBB-NEXT: add a0, a1, a0
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 10 x half> @llvm.vector.interleave5.nxv10f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4)
- ret <vscale x 10 x half> %res
+ %res = call <vscale x 12 x bfloat> @llvm.vector.interleave6.nxv12bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5)
+ ret <vscale x 12 x bfloat> %res
}
-define <vscale x 20 x half> @vector_interleave_nxv20f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4) nounwind {
-; CHECK-LABEL: vector_interleave_nxv20f16_nxv4f16:
+define <vscale x 24 x bfloat> @vector_interleave_nxv24bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5) nounwind {
+; CHECK-LABEL: vector_interleave_nxv24bf16_nxv4bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 2
-; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: add a2, a0, a1
; CHECK-NEXT: add a3, a2, a1
; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma
-; CHECK-NEXT: vsseg5e16.v v8, (a0)
+; CHECK-NEXT: vsseg6e16.v v8, (a0)
; CHECK-NEXT: vl1re16.v v10, (a3)
; CHECK-NEXT: add a3, a3, a1
; CHECK-NEXT: vl1re16.v v11, (a3)
+; CHECK-NEXT: add a3, a3, a1
; CHECK-NEXT: vl1re16.v v8, (a0)
; CHECK-NEXT: vl1re16.v v9, (a2)
+; CHECK-NEXT: vl1re16.v v12, (a3)
; CHECK-NEXT: add a1, a3, a1
-; CHECK-NEXT: vl1re16.v v12, (a1)
+; CHECK-NEXT: vl1re16.v v13, (a1)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 2
-; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv20f16_nxv4f16:
+; ZVBB-LABEL: vector_interleave_nxv24bf16_nxv4bf16:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 2
-; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
; ZVBB-NEXT: sub sp, sp, a0
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: add a2, a0, a1
; ZVBB-NEXT: add a3, a2, a1
; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma
-; ZVBB-NEXT: vsseg5e16.v v8, (a0)
+; ZVBB-NEXT: vsseg6e16.v v8, (a0)
; ZVBB-NEXT: vl1re16.v v10, (a3)
; ZVBB-NEXT: add a3, a3, a1
; ZVBB-NEXT: vl1re16.v v11, (a3)
+; ZVBB-NEXT: add a3, a3, a1
; ZVBB-NEXT: vl1re16.v v8, (a0)
; ZVBB-NEXT: vl1re16.v v9, (a2)
+; ZVBB-NEXT: vl1re16.v v12, (a3)
; ZVBB-NEXT: add a1, a3, a1
-; ZVBB-NEXT: vl1re16.v v12, (a1)
+; ZVBB-NEXT: vl1re16.v v13, (a1)
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 2
-; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 20 x half> @llvm.vector.interleave5.nxv20f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4)
- ret <vscale x 20 x half> %res
+ %res = call <vscale x 24 x bfloat> @llvm.vector.interleave6.nxv24bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5)
+ ret <vscale x 24 x bfloat> %res
}
-define <vscale x 40 x half> @vector_interleave_nxv40f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4) nounwind {
-; RV32-LABEL: vector_interleave_nxv40f16_nxv8f16:
+define <vscale x 48 x bfloat> @vector_interleave_nxv48bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5) nounwind {
+; RV32-LABEL: vector_interleave_nxv48bf16_nxv8bf16:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -80
; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
@@ -5098,61 +10449,68 @@ define <vscale x 40 x half> @vector_interleave_nxv40f16_nxv8f16(<vscale x 8 x ha
; RV32-NEXT: sub sp, sp, a0
; RV32-NEXT: andi sp, sp, -64
; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; RV32-NEXT: vmv2r.v v20, v16
-; RV32-NEXT: addi a0, sp, 64
-; RV32-NEXT: vmv2r.v v18, v12
+; RV32-NEXT: vmv2r.v v20, v14
+; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: vmv2r.v v24, v10
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a2, a1, 2
-; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: li a0, 6
+; RV32-NEXT: mul a1, a1, a0
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: vmv1r.v v10, v25
+; RV32-NEXT: vmv1r.v v11, v23
+; RV32-NEXT: vmv1r.v v12, v21
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv1r.v v13, v17
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: vmv2r.v v16, v8
-; RV32-NEXT: vmv2r.v v22, v16
-; RV32-NEXT: vmv2r.v v24, v18
-; RV32-NEXT: vmv1r.v v26, v20
+; RV32-NEXT: vmv1r.v v14, v19
+; RV32-NEXT: vsseg6e16.v v9, (a1)
+; RV32-NEXT: vmv1r.v v9, v24
+; RV32-NEXT: add a5, a1, a2
+; RV32-NEXT: vmv1r.v v10, v22
; RV32-NEXT: add a3, a0, a2
-; RV32-NEXT: vmv1r.v v23, v10
-; RV32-NEXT: add a4, a1, a2
-; RV32-NEXT: add a5, a4, a2
-; RV32-NEXT: vmv1r.v v25, v14
+; RV32-NEXT: vmv1r.v v11, v20
+; RV32-NEXT: add a4, a3, a2
+; RV32-NEXT: vmv1r.v v12, v16
; RV32-NEXT: add a6, a5, a2
-; RV32-NEXT: vmv1r.v v18, v11
-; RV32-NEXT: vsseg5e16.v v22, (a0)
-; RV32-NEXT: vmv1r.v v20, v15
-; RV32-NEXT: vsseg5e16.v v17, (a1)
+; RV32-NEXT: vmv1r.v v13, v18
+; RV32-NEXT: vsseg6e16.v v8, (a0)
+; RV32-NEXT: vl1re16.v v14, (a1)
+; RV32-NEXT: add a1, a6, a2
+; RV32-NEXT: vl1re16.v v15, (a5)
+; RV32-NEXT: add a5, a1, a2
+; RV32-NEXT: vl1re16.v v18, (a5)
+; RV32-NEXT: add a5, a5, a2
+; RV32-NEXT: vl1re16.v v19, (a5)
+; RV32-NEXT: add a5, a4, a2
; RV32-NEXT: vl1re16.v v16, (a6)
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vl1re16.v v12, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v17, (a6)
-; RV32-NEXT: add a6, a3, a2
-; RV32-NEXT: vl1re16.v v10, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v11, (a6)
+; RV32-NEXT: vl1re16.v v13, (a6)
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: li a7, 12
+; RV32-NEXT: mul a6, a6, a7
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 64
+; RV32-NEXT: vl1re16.v v17, (a1)
+; RV32-NEXT: vl1re16.v v10, (a4)
+; RV32-NEXT: vl1re16.v v11, (a5)
; RV32-NEXT: vl1re16.v v8, (a0)
; RV32-NEXT: vl1re16.v v9, (a3)
-; RV32-NEXT: vl1re16.v v14, (a4)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a3, 10
-; RV32-NEXT: mul a0, a0, a3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 64
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v15, (a5)
-; RV32-NEXT: vl1re16.v v12, (a6)
-; RV32-NEXT: vl1re16.v v13, (a1)
; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a0, a2
-; RV32-NEXT: vs2r.v v16, (a2)
-; RV32-NEXT: vs8r.v v8, (a0)
+; RV32-NEXT: add a2, a6, a2
+; RV32-NEXT: vs4r.v v16, (a2)
+; RV32-NEXT: vs8r.v v8, (a6)
; RV32-NEXT: vl8re16.v v16, (a2)
-; RV32-NEXT: vl8re16.v v8, (a0)
+; RV32-NEXT: vl8re16.v v8, (a6)
; RV32-NEXT: addi sp, s0, -80
; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 80
; RV32-NEXT: ret
;
-; RV64-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; RV64-LABEL: vector_interleave_nxv48bf16_nxv8bf16:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -80
; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -5164,61 +10522,68 @@ define <vscale x 40 x half> @vector_interleave_nxv40f16_nxv8f16(<vscale x 8 x ha
; RV64-NEXT: sub sp, sp, a0
; RV64-NEXT: andi sp, sp, -64
; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; RV64-NEXT: vmv2r.v v20, v16
-; RV64-NEXT: addi a0, sp, 64
-; RV64-NEXT: vmv2r.v v18, v12
+; RV64-NEXT: vmv2r.v v20, v14
+; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: vmv2r.v v24, v10
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a2, a1, 2
-; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: li a0, 6
+; RV64-NEXT: mul a1, a1, a0
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: vmv1r.v v10, v25
+; RV64-NEXT: vmv1r.v v11, v23
+; RV64-NEXT: vmv1r.v v12, v21
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv1r.v v13, v17
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: vmv2r.v v16, v8
-; RV64-NEXT: vmv2r.v v22, v16
-; RV64-NEXT: vmv2r.v v24, v18
-; RV64-NEXT: vmv1r.v v26, v20
+; RV64-NEXT: vmv1r.v v14, v19
+; RV64-NEXT: vsseg6e16.v v9, (a1)
+; RV64-NEXT: vmv1r.v v9, v24
+; RV64-NEXT: add a5, a1, a2
+; RV64-NEXT: vmv1r.v v10, v22
; RV64-NEXT: add a3, a0, a2
-; RV64-NEXT: vmv1r.v v23, v10
-; RV64-NEXT: add a4, a1, a2
-; RV64-NEXT: add a5, a4, a2
-; RV64-NEXT: vmv1r.v v25, v14
+; RV64-NEXT: vmv1r.v v11, v20
+; RV64-NEXT: add a4, a3, a2
+; RV64-NEXT: vmv1r.v v12, v16
; RV64-NEXT: add a6, a5, a2
-; RV64-NEXT: vmv1r.v v18, v11
-; RV64-NEXT: vsseg5e16.v v22, (a0)
-; RV64-NEXT: vmv1r.v v20, v15
-; RV64-NEXT: vsseg5e16.v v17, (a1)
+; RV64-NEXT: vmv1r.v v13, v18
+; RV64-NEXT: vsseg6e16.v v8, (a0)
+; RV64-NEXT: vl1re16.v v14, (a1)
+; RV64-NEXT: add a1, a6, a2
+; RV64-NEXT: vl1re16.v v15, (a5)
+; RV64-NEXT: add a5, a1, a2
+; RV64-NEXT: vl1re16.v v18, (a5)
+; RV64-NEXT: add a5, a5, a2
+; RV64-NEXT: vl1re16.v v19, (a5)
+; RV64-NEXT: add a5, a4, a2
; RV64-NEXT: vl1re16.v v16, (a6)
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vl1re16.v v12, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v17, (a6)
-; RV64-NEXT: add a6, a3, a2
-; RV64-NEXT: vl1re16.v v10, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v11, (a6)
+; RV64-NEXT: vl1re16.v v13, (a6)
+; RV64-NEXT: csrr a6, vlenb
+; RV64-NEXT: li a7, 12
+; RV64-NEXT: mul a6, a6, a7
+; RV64-NEXT: add a6, sp, a6
+; RV64-NEXT: addi a6, a6, 64
+; RV64-NEXT: vl1re16.v v17, (a1)
+; RV64-NEXT: vl1re16.v v10, (a4)
+; RV64-NEXT: vl1re16.v v11, (a5)
; RV64-NEXT: vl1re16.v v8, (a0)
; RV64-NEXT: vl1re16.v v9, (a3)
-; RV64-NEXT: vl1re16.v v14, (a4)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: li a3, 10
-; RV64-NEXT: mul a0, a0, a3
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 64
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v15, (a5)
-; RV64-NEXT: vl1re16.v v12, (a6)
-; RV64-NEXT: vl1re16.v v13, (a1)
; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a0, a2
-; RV64-NEXT: vs2r.v v16, (a2)
-; RV64-NEXT: vs8r.v v8, (a0)
+; RV64-NEXT: add a2, a6, a2
+; RV64-NEXT: vs4r.v v16, (a2)
+; RV64-NEXT: vs8r.v v8, (a6)
; RV64-NEXT: vl8re16.v v16, (a2)
-; RV64-NEXT: vl8re16.v v8, (a0)
+; RV64-NEXT: vl8re16.v v8, (a6)
; RV64-NEXT: addi sp, s0, -80
; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: ret
;
-; ZVBB-RV32-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; ZVBB-RV32-LABEL: vector_interleave_nxv48bf16_nxv8bf16:
; ZVBB-RV32: # %bb.0:
; ZVBB-RV32-NEXT: addi sp, sp, -80
; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
@@ -5230,61 +10595,68 @@ define <vscale x 40 x half> @vector_interleave_nxv40f16_nxv8f16(<vscale x 8 x ha
; ZVBB-RV32-NEXT: sub sp, sp, a0
; ZVBB-RV32-NEXT: andi sp, sp, -64
; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-RV32-NEXT: vmv2r.v v20, v16
-; ZVBB-RV32-NEXT: addi a0, sp, 64
-; ZVBB-RV32-NEXT: vmv2r.v v18, v12
+; ZVBB-RV32-NEXT: vmv2r.v v20, v14
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: vmv2r.v v24, v10
; ZVBB-RV32-NEXT: csrr a1, vlenb
-; ZVBB-RV32-NEXT: slli a2, a1, 2
-; ZVBB-RV32-NEXT: add a1, a2, a1
+; ZVBB-RV32-NEXT: li a0, 6
+; ZVBB-RV32-NEXT: mul a1, a1, a0
; ZVBB-RV32-NEXT: add a1, sp, a1
; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: vmv1r.v v10, v25
+; ZVBB-RV32-NEXT: vmv1r.v v11, v23
+; ZVBB-RV32-NEXT: vmv1r.v v12, v21
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv1r.v v13, v17
; ZVBB-RV32-NEXT: csrr a2, vlenb
-; ZVBB-RV32-NEXT: vmv2r.v v16, v8
-; ZVBB-RV32-NEXT: vmv2r.v v22, v16
-; ZVBB-RV32-NEXT: vmv2r.v v24, v18
-; ZVBB-RV32-NEXT: vmv1r.v v26, v20
+; ZVBB-RV32-NEXT: vmv1r.v v14, v19
+; ZVBB-RV32-NEXT: vsseg6e16.v v9, (a1)
+; ZVBB-RV32-NEXT: vmv1r.v v9, v24
+; ZVBB-RV32-NEXT: add a5, a1, a2
+; ZVBB-RV32-NEXT: vmv1r.v v10, v22
; ZVBB-RV32-NEXT: add a3, a0, a2
-; ZVBB-RV32-NEXT: vmv1r.v v23, v10
-; ZVBB-RV32-NEXT: add a4, a1, a2
-; ZVBB-RV32-NEXT: add a5, a4, a2
-; ZVBB-RV32-NEXT: vmv1r.v v25, v14
+; ZVBB-RV32-NEXT: vmv1r.v v11, v20
+; ZVBB-RV32-NEXT: add a4, a3, a2
+; ZVBB-RV32-NEXT: vmv1r.v v12, v16
; ZVBB-RV32-NEXT: add a6, a5, a2
-; ZVBB-RV32-NEXT: vmv1r.v v18, v11
-; ZVBB-RV32-NEXT: vsseg5e16.v v22, (a0)
-; ZVBB-RV32-NEXT: vmv1r.v v20, v15
-; ZVBB-RV32-NEXT: vsseg5e16.v v17, (a1)
+; ZVBB-RV32-NEXT: vmv1r.v v13, v18
+; ZVBB-RV32-NEXT: vsseg6e16.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re16.v v14, (a1)
+; ZVBB-RV32-NEXT: add a1, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v15, (a5)
+; ZVBB-RV32-NEXT: add a5, a1, a2
+; ZVBB-RV32-NEXT: vl1re16.v v18, (a5)
+; ZVBB-RV32-NEXT: add a5, a5, a2
+; ZVBB-RV32-NEXT: vl1re16.v v19, (a5)
+; ZVBB-RV32-NEXT: add a5, a4, a2
; ZVBB-RV32-NEXT: vl1re16.v v16, (a6)
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vl1re16.v v12, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v17, (a6)
-; ZVBB-RV32-NEXT: add a6, a3, a2
-; ZVBB-RV32-NEXT: vl1re16.v v10, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v11, (a6)
+; ZVBB-RV32-NEXT: vl1re16.v v13, (a6)
+; ZVBB-RV32-NEXT: csrr a6, vlenb
+; ZVBB-RV32-NEXT: li a7, 12
+; ZVBB-RV32-NEXT: mul a6, a6, a7
+; ZVBB-RV32-NEXT: add a6, sp, a6
+; ZVBB-RV32-NEXT: addi a6, a6, 64
+; ZVBB-RV32-NEXT: vl1re16.v v17, (a1)
+; ZVBB-RV32-NEXT: vl1re16.v v10, (a4)
+; ZVBB-RV32-NEXT: vl1re16.v v11, (a5)
; ZVBB-RV32-NEXT: vl1re16.v v8, (a0)
; ZVBB-RV32-NEXT: vl1re16.v v9, (a3)
-; ZVBB-RV32-NEXT: vl1re16.v v14, (a4)
-; ZVBB-RV32-NEXT: csrr a0, vlenb
-; ZVBB-RV32-NEXT: li a3, 10
-; ZVBB-RV32-NEXT: mul a0, a0, a3
-; ZVBB-RV32-NEXT: add a0, sp, a0
-; ZVBB-RV32-NEXT: addi a0, a0, 64
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v15, (a5)
-; ZVBB-RV32-NEXT: vl1re16.v v12, (a6)
-; ZVBB-RV32-NEXT: vl1re16.v v13, (a1)
; ZVBB-RV32-NEXT: slli a2, a2, 3
-; ZVBB-RV32-NEXT: add a2, a0, a2
-; ZVBB-RV32-NEXT: vs2r.v v16, (a2)
-; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT: add a2, a6, a2
+; ZVBB-RV32-NEXT: vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a6)
; ZVBB-RV32-NEXT: vl8re16.v v16, (a2)
-; ZVBB-RV32-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8re16.v v8, (a6)
; ZVBB-RV32-NEXT: addi sp, s0, -80
; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: addi sp, sp, 80
; ZVBB-RV32-NEXT: ret
;
-; ZVBB-RV64-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; ZVBB-RV64-LABEL: vector_interleave_nxv48bf16_nxv8bf16:
; ZVBB-RV64: # %bb.0:
; ZVBB-RV64-NEXT: addi sp, sp, -80
; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -5296,61 +10668,68 @@ define <vscale x 40 x half> @vector_interleave_nxv40f16_nxv8f16(<vscale x 8 x ha
; ZVBB-RV64-NEXT: sub sp, sp, a0
; ZVBB-RV64-NEXT: andi sp, sp, -64
; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-RV64-NEXT: vmv2r.v v20, v16
-; ZVBB-RV64-NEXT: addi a0, sp, 64
-; ZVBB-RV64-NEXT: vmv2r.v v18, v12
+; ZVBB-RV64-NEXT: vmv2r.v v20, v14
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: vmv2r.v v24, v10
; ZVBB-RV64-NEXT: csrr a1, vlenb
-; ZVBB-RV64-NEXT: slli a2, a1, 2
-; ZVBB-RV64-NEXT: add a1, a2, a1
+; ZVBB-RV64-NEXT: li a0, 6
+; ZVBB-RV64-NEXT: mul a1, a1, a0
; ZVBB-RV64-NEXT: add a1, sp, a1
; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: vmv1r.v v10, v25
+; ZVBB-RV64-NEXT: vmv1r.v v11, v23
+; ZVBB-RV64-NEXT: vmv1r.v v12, v21
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv1r.v v13, v17
; ZVBB-RV64-NEXT: csrr a2, vlenb
-; ZVBB-RV64-NEXT: vmv2r.v v16, v8
-; ZVBB-RV64-NEXT: vmv2r.v v22, v16
-; ZVBB-RV64-NEXT: vmv2r.v v24, v18
-; ZVBB-RV64-NEXT: vmv1r.v v26, v20
+; ZVBB-RV64-NEXT: vmv1r.v v14, v19
+; ZVBB-RV64-NEXT: vsseg6e16.v v9, (a1)
+; ZVBB-RV64-NEXT: vmv1r.v v9, v24
+; ZVBB-RV64-NEXT: add a5, a1, a2
+; ZVBB-RV64-NEXT: vmv1r.v v10, v22
; ZVBB-RV64-NEXT: add a3, a0, a2
-; ZVBB-RV64-NEXT: vmv1r.v v23, v10
-; ZVBB-RV64-NEXT: add a4, a1, a2
-; ZVBB-RV64-NEXT: add a5, a4, a2
-; ZVBB-RV64-NEXT: vmv1r.v v25, v14
+; ZVBB-RV64-NEXT: vmv1r.v v11, v20
+; ZVBB-RV64-NEXT: add a4, a3, a2
+; ZVBB-RV64-NEXT: vmv1r.v v12, v16
; ZVBB-RV64-NEXT: add a6, a5, a2
-; ZVBB-RV64-NEXT: vmv1r.v v18, v11
-; ZVBB-RV64-NEXT: vsseg5e16.v v22, (a0)
-; ZVBB-RV64-NEXT: vmv1r.v v20, v15
-; ZVBB-RV64-NEXT: vsseg5e16.v v17, (a1)
+; ZVBB-RV64-NEXT: vmv1r.v v13, v18
+; ZVBB-RV64-NEXT: vsseg6e16.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re16.v v14, (a1)
+; ZVBB-RV64-NEXT: add a1, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v15, (a5)
+; ZVBB-RV64-NEXT: add a5, a1, a2
+; ZVBB-RV64-NEXT: vl1re16.v v18, (a5)
+; ZVBB-RV64-NEXT: add a5, a5, a2
+; ZVBB-RV64-NEXT: vl1re16.v v19, (a5)
+; ZVBB-RV64-NEXT: add a5, a4, a2
; ZVBB-RV64-NEXT: vl1re16.v v16, (a6)
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vl1re16.v v12, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v17, (a6)
-; ZVBB-RV64-NEXT: add a6, a3, a2
-; ZVBB-RV64-NEXT: vl1re16.v v10, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v11, (a6)
+; ZVBB-RV64-NEXT: vl1re16.v v13, (a6)
+; ZVBB-RV64-NEXT: csrr a6, vlenb
+; ZVBB-RV64-NEXT: li a7, 12
+; ZVBB-RV64-NEXT: mul a6, a6, a7
+; ZVBB-RV64-NEXT: add a6, sp, a6
+; ZVBB-RV64-NEXT: addi a6, a6, 64
+; ZVBB-RV64-NEXT: vl1re16.v v17, (a1)
+; ZVBB-RV64-NEXT: vl1re16.v v10, (a4)
+; ZVBB-RV64-NEXT: vl1re16.v v11, (a5)
; ZVBB-RV64-NEXT: vl1re16.v v8, (a0)
; ZVBB-RV64-NEXT: vl1re16.v v9, (a3)
-; ZVBB-RV64-NEXT: vl1re16.v v14, (a4)
-; ZVBB-RV64-NEXT: csrr a0, vlenb
-; ZVBB-RV64-NEXT: li a3, 10
-; ZVBB-RV64-NEXT: mul a0, a0, a3
-; ZVBB-RV64-NEXT: add a0, sp, a0
-; ZVBB-RV64-NEXT: addi a0, a0, 64
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v15, (a5)
-; ZVBB-RV64-NEXT: vl1re16.v v12, (a6)
-; ZVBB-RV64-NEXT: vl1re16.v v13, (a1)
; ZVBB-RV64-NEXT: slli a2, a2, 3
-; ZVBB-RV64-NEXT: add a2, a0, a2
-; ZVBB-RV64-NEXT: vs2r.v v16, (a2)
-; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT: add a2, a6, a2
+; ZVBB-RV64-NEXT: vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a6)
; ZVBB-RV64-NEXT: vl8re16.v v16, (a2)
-; ZVBB-RV64-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8re16.v v8, (a6)
; ZVBB-RV64-NEXT: addi sp, s0, -80
; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: addi sp, sp, 80
; ZVBB-RV64-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; ZIP-LABEL: vector_interleave_nxv48bf16_nxv8bf16:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -80
; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -5362,65 +10741,72 @@ define <vscale x 40 x half> @vector_interleave_nxv40f16_nxv8f16(<vscale x 8 x ha
; ZIP-NEXT: sub sp, sp, a0
; ZIP-NEXT: andi sp, sp, -64
; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZIP-NEXT: vmv2r.v v20, v16
-; ZIP-NEXT: addi a0, sp, 64
-; ZIP-NEXT: vmv2r.v v18, v12
+; ZIP-NEXT: vmv2r.v v20, v14
+; ZIP-NEXT: vmv2r.v v22, v12
+; ZIP-NEXT: vmv2r.v v24, v10
; ZIP-NEXT: csrr a1, vlenb
-; ZIP-NEXT: slli a2, a1, 2
-; ZIP-NEXT: add a1, a2, a1
+; ZIP-NEXT: li a0, 6
+; ZIP-NEXT: mul a1, a1, a0
; ZIP-NEXT: add a1, sp, a1
; ZIP-NEXT: addi a1, a1, 64
+; ZIP-NEXT: vmv1r.v v10, v25
+; ZIP-NEXT: vmv1r.v v11, v23
+; ZIP-NEXT: vmv1r.v v12, v21
+; ZIP-NEXT: addi a0, sp, 64
+; ZIP-NEXT: vmv1r.v v13, v17
; ZIP-NEXT: csrr a2, vlenb
-; ZIP-NEXT: vmv2r.v v16, v8
-; ZIP-NEXT: vmv2r.v v22, v16
-; ZIP-NEXT: vmv2r.v v24, v18
-; ZIP-NEXT: vmv1r.v v26, v20
+; ZIP-NEXT: vmv1r.v v14, v19
+; ZIP-NEXT: vsseg6e16.v v9, (a1)
+; ZIP-NEXT: vmv1r.v v9, v24
+; ZIP-NEXT: add a5, a1, a2
+; ZIP-NEXT: vmv1r.v v10, v22
; ZIP-NEXT: add a3, a0, a2
-; ZIP-NEXT: vmv1r.v v23, v10
-; ZIP-NEXT: add a4, a1, a2
-; ZIP-NEXT: add a5, a4, a2
-; ZIP-NEXT: vmv1r.v v25, v14
+; ZIP-NEXT: vmv1r.v v11, v20
+; ZIP-NEXT: add a4, a3, a2
+; ZIP-NEXT: vmv1r.v v12, v16
; ZIP-NEXT: add a6, a5, a2
-; ZIP-NEXT: vmv1r.v v18, v11
-; ZIP-NEXT: vsseg5e16.v v22, (a0)
-; ZIP-NEXT: vmv1r.v v20, v15
-; ZIP-NEXT: vsseg5e16.v v17, (a1)
+; ZIP-NEXT: vmv1r.v v13, v18
+; ZIP-NEXT: vsseg6e16.v v8, (a0)
+; ZIP-NEXT: vl1re16.v v14, (a1)
+; ZIP-NEXT: add a1, a6, a2
+; ZIP-NEXT: vl1re16.v v15, (a5)
+; ZIP-NEXT: add a5, a1, a2
+; ZIP-NEXT: vl1re16.v v18, (a5)
+; ZIP-NEXT: add a5, a5, a2
+; ZIP-NEXT: vl1re16.v v19, (a5)
+; ZIP-NEXT: add a5, a4, a2
; ZIP-NEXT: vl1re16.v v16, (a6)
+; ZIP-NEXT: add a6, a5, a2
+; ZIP-NEXT: vl1re16.v v12, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v17, (a6)
-; ZIP-NEXT: add a6, a3, a2
-; ZIP-NEXT: vl1re16.v v10, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v11, (a6)
+; ZIP-NEXT: vl1re16.v v13, (a6)
+; ZIP-NEXT: csrr a6, vlenb
+; ZIP-NEXT: li a7, 12
+; ZIP-NEXT: mul a6, a6, a7
+; ZIP-NEXT: add a6, sp, a6
+; ZIP-NEXT: addi a6, a6, 64
+; ZIP-NEXT: vl1re16.v v17, (a1)
+; ZIP-NEXT: vl1re16.v v10, (a4)
+; ZIP-NEXT: vl1re16.v v11, (a5)
; ZIP-NEXT: vl1re16.v v8, (a0)
; ZIP-NEXT: vl1re16.v v9, (a3)
-; ZIP-NEXT: vl1re16.v v14, (a4)
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: li a3, 10
-; ZIP-NEXT: mul a0, a0, a3
-; ZIP-NEXT: add a0, sp, a0
-; ZIP-NEXT: addi a0, a0, 64
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v15, (a5)
-; ZIP-NEXT: vl1re16.v v12, (a6)
-; ZIP-NEXT: vl1re16.v v13, (a1)
; ZIP-NEXT: slli a2, a2, 3
-; ZIP-NEXT: add a2, a0, a2
-; ZIP-NEXT: vs2r.v v16, (a2)
-; ZIP-NEXT: vs8r.v v8, (a0)
+; ZIP-NEXT: add a2, a6, a2
+; ZIP-NEXT: vs4r.v v16, (a2)
+; ZIP-NEXT: vs8r.v v8, (a6)
; ZIP-NEXT: vl8re16.v v16, (a2)
-; ZIP-NEXT: vl8re16.v v8, (a0)
+; ZIP-NEXT: vl8re16.v v8, (a6)
; ZIP-NEXT: addi sp, s0, -80
; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZIP-NEXT: addi sp, sp, 80
; ZIP-NEXT: ret
- %res = call <vscale x 40 x half> @llvm.vector.interleave5.nxv40f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4)
- ret <vscale x 40 x half> %res
+ %res = call <vscale x 48 x bfloat> @llvm.vector.interleave6.nxv48bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5)
+ ret <vscale x 48 x bfloat> %res
}
-define <vscale x 10 x bfloat> @vector_interleave_nxv10bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4) nounwind {
-; CHECK-LABEL: vector_interleave_nxv10bf16_nxv2bf16:
+define <vscale x 6 x float> @vector_interleave_nxv6f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5) nounwind {
+; CHECK-LABEL: vector_interleave_nxv6f32_nxv1f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
@@ -5432,23 +10818,27 @@ define <vscale x 10 x bfloat> @vector_interleave_nxv10bf16_nxv2bf16(<vscale x 2
; CHECK-NEXT: srli a2, a1, 1
; CHECK-NEXT: add a3, a0, a2
; CHECK-NEXT: add a4, a3, a2
-; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vsseg5e16.v v8, (a0)
; CHECK-NEXT: add a5, a4, a2
-; CHECK-NEXT: vle16.v v8, (a5)
-; CHECK-NEXT: vle16.v v9, (a4)
-; CHECK-NEXT: srli a1, a1, 2
-; CHECK-NEXT: add a4, a1, a1
-; CHECK-NEXT: vle16.v v10, (a3)
-; CHECK-NEXT: vsetvli zero, a4, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v9, v8, a1
-; CHECK-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vsetvli zero, a4, e16, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v10, a1
-; CHECK-NEXT: add a2, a5, a2
-; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vle16.v v10, (a2)
+; CHECK-NEXT: vsetvli a6, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsseg6e32.v v8, (a0)
+; CHECK-NEXT: add a6, a5, a2
+; CHECK-NEXT: add a2, a6, a2
+; CHECK-NEXT: vle32.v v10, (a6)
+; CHECK-NEXT: vle32.v v8, (a2)
+; CHECK-NEXT: srli a1, a1, 3
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: vle32.v v11, (a5)
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v10, v8, a1
+; CHECK-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vle32.v v9, (a4)
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v11, a1
+; CHECK-NEXT: vsetvli a4, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vle32.v v11, (a3)
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v11, a1
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a1, a0, 1
; CHECK-NEXT: add a0, a1, a0
@@ -5456,7 +10846,7 @@ define <vscale x 10 x bfloat> @vector_interleave_nxv10bf16_nxv2bf16(<vscale x 2
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv10bf16_nxv2bf16:
+; ZVBB-LABEL: vector_interleave_nxv6f32_nxv1f32:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
@@ -5468,93 +10858,101 @@ define <vscale x 10 x bfloat> @vector_interleave_nxv10bf16_nxv2bf16(<vscale x 2
; ZVBB-NEXT: srli a2, a1, 1
; ZVBB-NEXT: add a3, a0, a2
; ZVBB-NEXT: add a4, a3, a2
-; ZVBB-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
-; ZVBB-NEXT: vsseg5e16.v v8, (a0)
; ZVBB-NEXT: add a5, a4, a2
-; ZVBB-NEXT: vle16.v v8, (a5)
-; ZVBB-NEXT: vle16.v v9, (a4)
-; ZVBB-NEXT: srli a1, a1, 2
-; ZVBB-NEXT: add a4, a1, a1
-; ZVBB-NEXT: vle16.v v10, (a3)
-; ZVBB-NEXT: vsetvli zero, a4, e16, m1, ta, ma
-; ZVBB-NEXT: vslideup.vx v9, v8, a1
-; ZVBB-NEXT: vsetvli a3, zero, e16, mf2, ta, ma
-; ZVBB-NEXT: vle16.v v8, (a0)
-; ZVBB-NEXT: vsetvli zero, a4, e16, m1, ta, ma
-; ZVBB-NEXT: vslideup.vx v8, v10, a1
-; ZVBB-NEXT: add a2, a5, a2
-; ZVBB-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
-; ZVBB-NEXT: vle16.v v10, (a2)
+; ZVBB-NEXT: vsetvli a6, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vsseg6e32.v v8, (a0)
+; ZVBB-NEXT: add a6, a5, a2
+; ZVBB-NEXT: add a2, a6, a2
+; ZVBB-NEXT: vle32.v v10, (a6)
+; ZVBB-NEXT: vle32.v v8, (a2)
+; ZVBB-NEXT: srli a1, a1, 3
+; ZVBB-NEXT: add a2, a1, a1
+; ZVBB-NEXT: vle32.v v11, (a5)
+; ZVBB-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v10, v8, a1
+; ZVBB-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vle32.v v9, (a4)
+; ZVBB-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v9, v11, a1
+; ZVBB-NEXT: vsetvli a4, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vle32.v v11, (a3)
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v11, a1
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a1, a0, 1
; ZVBB-NEXT: add a0, a1, a0
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 10 x bfloat> @llvm.vector.interleave5.nxv10bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4)
- ret <vscale x 10 x bfloat> %res
+ %res = call <vscale x 6 x float> @llvm.vector.interleave6.nxv6f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5)
+ ret <vscale x 6 x float> %res
}
-define <vscale x 20 x bfloat> @vector_interleave_nxv20bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4) nounwind {
-; CHECK-LABEL: vector_interleave_nxv20bf16_nxv4bf16:
+define <vscale x 12 x float> @vector_interleave_nxv12f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5) nounwind {
+; CHECK-LABEL: vector_interleave_nxv12f32_nxv2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 2
-; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: add a2, a0, a1
; CHECK-NEXT: add a3, a2, a1
-; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma
-; CHECK-NEXT: vsseg5e16.v v8, (a0)
-; CHECK-NEXT: vl1re16.v v10, (a3)
+; CHECK-NEXT: vsetvli a4, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg6e32.v v8, (a0)
+; CHECK-NEXT: vl1re32.v v10, (a3)
; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: vl1re16.v v11, (a3)
-; CHECK-NEXT: vl1re16.v v8, (a0)
-; CHECK-NEXT: vl1re16.v v9, (a2)
+; CHECK-NEXT: vl1re32.v v11, (a3)
+; CHECK-NEXT: add a3, a3, a1
+; CHECK-NEXT: vl1re32.v v8, (a0)
+; CHECK-NEXT: vl1re32.v v9, (a2)
+; CHECK-NEXT: vl1re32.v v12, (a3)
; CHECK-NEXT: add a1, a3, a1
-; CHECK-NEXT: vl1re16.v v12, (a1)
+; CHECK-NEXT: vl1re32.v v13, (a1)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 2
-; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv20bf16_nxv4bf16:
+; ZVBB-LABEL: vector_interleave_nxv12f32_nxv2f32:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 2
-; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
; ZVBB-NEXT: sub sp, sp, a0
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: add a2, a0, a1
; ZVBB-NEXT: add a3, a2, a1
-; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma
-; ZVBB-NEXT: vsseg5e16.v v8, (a0)
-; ZVBB-NEXT: vl1re16.v v10, (a3)
+; ZVBB-NEXT: vsetvli a4, zero, e32, m1, ta, ma
+; ZVBB-NEXT: vsseg6e32.v v8, (a0)
+; ZVBB-NEXT: vl1re32.v v10, (a3)
; ZVBB-NEXT: add a3, a3, a1
-; ZVBB-NEXT: vl1re16.v v11, (a3)
-; ZVBB-NEXT: vl1re16.v v8, (a0)
-; ZVBB-NEXT: vl1re16.v v9, (a2)
+; ZVBB-NEXT: vl1re32.v v11, (a3)
+; ZVBB-NEXT: add a3, a3, a1
+; ZVBB-NEXT: vl1re32.v v8, (a0)
+; ZVBB-NEXT: vl1re32.v v9, (a2)
+; ZVBB-NEXT: vl1re32.v v12, (a3)
; ZVBB-NEXT: add a1, a3, a1
-; ZVBB-NEXT: vl1re16.v v12, (a1)
+; ZVBB-NEXT: vl1re32.v v13, (a1)
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 2
-; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 20 x bfloat> @llvm.vector.interleave5.nxv20bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4)
- ret <vscale x 20 x bfloat> %res
+ %res = call <vscale x 12 x float> @llvm.vector.interleave6.nxv12f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5)
+ ret <vscale x 12 x float> %res
}
-define <vscale x 40 x bfloat> @vector_interleave_nxv40bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4) nounwind {
-; RV32-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+define <vscale x 24 x float> @vector_interleave_nxv24f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5) nounwind {
+; RV32-LABEL: vector_interleave_nxv24f32_nxv4f32:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -80
; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
@@ -5565,62 +10963,69 @@ define <vscale x 40 x bfloat> @vector_interleave_nxv40bf16_nxv8bf16(<vscale x 8
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: sub sp, sp, a0
; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; RV32-NEXT: vmv2r.v v20, v16
-; RV32-NEXT: addi a0, sp, 64
-; RV32-NEXT: vmv2r.v v18, v12
+; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT: vmv2r.v v20, v14
+; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: vmv2r.v v24, v10
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a2, a1, 2
-; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: li a0, 6
+; RV32-NEXT: mul a1, a1, a0
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: vmv1r.v v10, v25
+; RV32-NEXT: vmv1r.v v11, v23
+; RV32-NEXT: vmv1r.v v12, v21
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv1r.v v13, v17
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: vmv2r.v v16, v8
-; RV32-NEXT: vmv2r.v v22, v16
-; RV32-NEXT: vmv2r.v v24, v18
-; RV32-NEXT: vmv1r.v v26, v20
+; RV32-NEXT: vmv1r.v v14, v19
+; RV32-NEXT: vsseg6e32.v v9, (a1)
+; RV32-NEXT: vmv1r.v v9, v24
+; RV32-NEXT: add a5, a1, a2
+; RV32-NEXT: vmv1r.v v10, v22
; RV32-NEXT: add a3, a0, a2
-; RV32-NEXT: vmv1r.v v23, v10
-; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: vmv1r.v v11, v20
+; RV32-NEXT: add a4, a3, a2
+; RV32-NEXT: vmv1r.v v12, v16
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vmv1r.v v13, v18
+; RV32-NEXT: vsseg6e32.v v8, (a0)
+; RV32-NEXT: vl1re32.v v14, (a1)
+; RV32-NEXT: add a1, a6, a2
+; RV32-NEXT: vl1re32.v v15, (a5)
+; RV32-NEXT: add a5, a1, a2
+; RV32-NEXT: vl1re32.v v18, (a5)
+; RV32-NEXT: add a5, a5, a2
+; RV32-NEXT: vl1re32.v v19, (a5)
; RV32-NEXT: add a5, a4, a2
-; RV32-NEXT: vmv1r.v v25, v14
+; RV32-NEXT: vl1re32.v v16, (a6)
; RV32-NEXT: add a6, a5, a2
-; RV32-NEXT: vmv1r.v v18, v11
-; RV32-NEXT: vsseg5e16.v v22, (a0)
-; RV32-NEXT: vmv1r.v v20, v15
-; RV32-NEXT: vsseg5e16.v v17, (a1)
-; RV32-NEXT: vl1re16.v v16, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v17, (a6)
-; RV32-NEXT: add a6, a3, a2
-; RV32-NEXT: vl1re16.v v10, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v11, (a6)
-; RV32-NEXT: vl1re16.v v8, (a0)
-; RV32-NEXT: vl1re16.v v9, (a3)
-; RV32-NEXT: vl1re16.v v14, (a4)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a3, 10
-; RV32-NEXT: mul a0, a0, a3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: vl1re32.v v12, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v15, (a5)
-; RV32-NEXT: vl1re16.v v12, (a6)
-; RV32-NEXT: vl1re16.v v13, (a1)
+; RV32-NEXT: vl1re32.v v13, (a6)
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: li a7, 12
+; RV32-NEXT: mul a6, a6, a7
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 64
+; RV32-NEXT: vl1re32.v v17, (a1)
+; RV32-NEXT: vl1re32.v v10, (a4)
+; RV32-NEXT: vl1re32.v v11, (a5)
+; RV32-NEXT: vl1re32.v v8, (a0)
+; RV32-NEXT: vl1re32.v v9, (a3)
; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a0, a2
-; RV32-NEXT: vs2r.v v16, (a2)
-; RV32-NEXT: vs8r.v v8, (a0)
-; RV32-NEXT: vl8re16.v v16, (a2)
-; RV32-NEXT: vl8re16.v v8, (a0)
+; RV32-NEXT: add a2, a6, a2
+; RV32-NEXT: vs4r.v v16, (a2)
+; RV32-NEXT: vs8r.v v8, (a6)
+; RV32-NEXT: vl8re32.v v16, (a2)
+; RV32-NEXT: vl8re32.v v8, (a6)
; RV32-NEXT: addi sp, s0, -80
; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 80
; RV32-NEXT: ret
;
-; RV64-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; RV64-LABEL: vector_interleave_nxv24f32_nxv4f32:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -80
; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -5631,62 +11036,69 @@ define <vscale x 40 x bfloat> @vector_interleave_nxv40bf16_nxv8bf16(<vscale x 8
; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: sub sp, sp, a0
; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; RV64-NEXT: vmv2r.v v20, v16
-; RV64-NEXT: addi a0, sp, 64
-; RV64-NEXT: vmv2r.v v18, v12
+; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; RV64-NEXT: vmv2r.v v20, v14
+; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: vmv2r.v v24, v10
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a2, a1, 2
-; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: li a0, 6
+; RV64-NEXT: mul a1, a1, a0
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: vmv1r.v v10, v25
+; RV64-NEXT: vmv1r.v v11, v23
+; RV64-NEXT: vmv1r.v v12, v21
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv1r.v v13, v17
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: vmv2r.v v16, v8
-; RV64-NEXT: vmv2r.v v22, v16
-; RV64-NEXT: vmv2r.v v24, v18
-; RV64-NEXT: vmv1r.v v26, v20
+; RV64-NEXT: vmv1r.v v14, v19
+; RV64-NEXT: vsseg6e32.v v9, (a1)
+; RV64-NEXT: vmv1r.v v9, v24
+; RV64-NEXT: add a5, a1, a2
+; RV64-NEXT: vmv1r.v v10, v22
; RV64-NEXT: add a3, a0, a2
-; RV64-NEXT: vmv1r.v v23, v10
-; RV64-NEXT: add a4, a1, a2
+; RV64-NEXT: vmv1r.v v11, v20
+; RV64-NEXT: add a4, a3, a2
+; RV64-NEXT: vmv1r.v v12, v16
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vmv1r.v v13, v18
+; RV64-NEXT: vsseg6e32.v v8, (a0)
+; RV64-NEXT: vl1re32.v v14, (a1)
+; RV64-NEXT: add a1, a6, a2
+; RV64-NEXT: vl1re32.v v15, (a5)
+; RV64-NEXT: add a5, a1, a2
+; RV64-NEXT: vl1re32.v v18, (a5)
+; RV64-NEXT: add a5, a5, a2
+; RV64-NEXT: vl1re32.v v19, (a5)
; RV64-NEXT: add a5, a4, a2
-; RV64-NEXT: vmv1r.v v25, v14
+; RV64-NEXT: vl1re32.v v16, (a6)
; RV64-NEXT: add a6, a5, a2
-; RV64-NEXT: vmv1r.v v18, v11
-; RV64-NEXT: vsseg5e16.v v22, (a0)
-; RV64-NEXT: vmv1r.v v20, v15
-; RV64-NEXT: vsseg5e16.v v17, (a1)
-; RV64-NEXT: vl1re16.v v16, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v17, (a6)
-; RV64-NEXT: add a6, a3, a2
-; RV64-NEXT: vl1re16.v v10, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v11, (a6)
-; RV64-NEXT: vl1re16.v v8, (a0)
-; RV64-NEXT: vl1re16.v v9, (a3)
-; RV64-NEXT: vl1re16.v v14, (a4)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: li a3, 10
-; RV64-NEXT: mul a0, a0, a3
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: vl1re32.v v12, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v15, (a5)
-; RV64-NEXT: vl1re16.v v12, (a6)
-; RV64-NEXT: vl1re16.v v13, (a1)
+; RV64-NEXT: vl1re32.v v13, (a6)
+; RV64-NEXT: csrr a6, vlenb
+; RV64-NEXT: li a7, 12
+; RV64-NEXT: mul a6, a6, a7
+; RV64-NEXT: add a6, sp, a6
+; RV64-NEXT: addi a6, a6, 64
+; RV64-NEXT: vl1re32.v v17, (a1)
+; RV64-NEXT: vl1re32.v v10, (a4)
+; RV64-NEXT: vl1re32.v v11, (a5)
+; RV64-NEXT: vl1re32.v v8, (a0)
+; RV64-NEXT: vl1re32.v v9, (a3)
; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a0, a2
-; RV64-NEXT: vs2r.v v16, (a2)
-; RV64-NEXT: vs8r.v v8, (a0)
-; RV64-NEXT: vl8re16.v v16, (a2)
-; RV64-NEXT: vl8re16.v v8, (a0)
+; RV64-NEXT: add a2, a6, a2
+; RV64-NEXT: vs4r.v v16, (a2)
+; RV64-NEXT: vs8r.v v8, (a6)
+; RV64-NEXT: vl8re32.v v16, (a2)
+; RV64-NEXT: vl8re32.v v8, (a6)
; RV64-NEXT: addi sp, s0, -80
; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: ret
;
-; ZVBB-RV32-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; ZVBB-RV32-LABEL: vector_interleave_nxv24f32_nxv4f32:
; ZVBB-RV32: # %bb.0:
; ZVBB-RV32-NEXT: addi sp, sp, -80
; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
@@ -5697,62 +11109,69 @@ define <vscale x 40 x bfloat> @vector_interleave_nxv40bf16_nxv8bf16(<vscale x 8
; ZVBB-RV32-NEXT: mul a0, a0, a1
; ZVBB-RV32-NEXT: sub sp, sp, a0
; ZVBB-RV32-NEXT: andi sp, sp, -64
-; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-RV32-NEXT: vmv2r.v v20, v16
-; ZVBB-RV32-NEXT: addi a0, sp, 64
-; ZVBB-RV32-NEXT: vmv2r.v v18, v12
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v20, v14
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: vmv2r.v v24, v10
; ZVBB-RV32-NEXT: csrr a1, vlenb
-; ZVBB-RV32-NEXT: slli a2, a1, 2
-; ZVBB-RV32-NEXT: add a1, a2, a1
+; ZVBB-RV32-NEXT: li a0, 6
+; ZVBB-RV32-NEXT: mul a1, a1, a0
; ZVBB-RV32-NEXT: add a1, sp, a1
; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: vmv1r.v v10, v25
+; ZVBB-RV32-NEXT: vmv1r.v v11, v23
+; ZVBB-RV32-NEXT: vmv1r.v v12, v21
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv1r.v v13, v17
; ZVBB-RV32-NEXT: csrr a2, vlenb
-; ZVBB-RV32-NEXT: vmv2r.v v16, v8
-; ZVBB-RV32-NEXT: vmv2r.v v22, v16
-; ZVBB-RV32-NEXT: vmv2r.v v24, v18
-; ZVBB-RV32-NEXT: vmv1r.v v26, v20
+; ZVBB-RV32-NEXT: vmv1r.v v14, v19
+; ZVBB-RV32-NEXT: vsseg6e32.v v9, (a1)
+; ZVBB-RV32-NEXT: vmv1r.v v9, v24
+; ZVBB-RV32-NEXT: add a5, a1, a2
+; ZVBB-RV32-NEXT: vmv1r.v v10, v22
; ZVBB-RV32-NEXT: add a3, a0, a2
-; ZVBB-RV32-NEXT: vmv1r.v v23, v10
-; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: vmv1r.v v11, v20
+; ZVBB-RV32-NEXT: add a4, a3, a2
+; ZVBB-RV32-NEXT: vmv1r.v v12, v16
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vmv1r.v v13, v18
+; ZVBB-RV32-NEXT: vsseg6e32.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re32.v v14, (a1)
+; ZVBB-RV32-NEXT: add a1, a6, a2
+; ZVBB-RV32-NEXT: vl1re32.v v15, (a5)
+; ZVBB-RV32-NEXT: add a5, a1, a2
+; ZVBB-RV32-NEXT: vl1re32.v v18, (a5)
+; ZVBB-RV32-NEXT: add a5, a5, a2
+; ZVBB-RV32-NEXT: vl1re32.v v19, (a5)
; ZVBB-RV32-NEXT: add a5, a4, a2
-; ZVBB-RV32-NEXT: vmv1r.v v25, v14
+; ZVBB-RV32-NEXT: vl1re32.v v16, (a6)
; ZVBB-RV32-NEXT: add a6, a5, a2
-; ZVBB-RV32-NEXT: vmv1r.v v18, v11
-; ZVBB-RV32-NEXT: vsseg5e16.v v22, (a0)
-; ZVBB-RV32-NEXT: vmv1r.v v20, v15
-; ZVBB-RV32-NEXT: vsseg5e16.v v17, (a1)
-; ZVBB-RV32-NEXT: vl1re16.v v16, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v17, (a6)
-; ZVBB-RV32-NEXT: add a6, a3, a2
-; ZVBB-RV32-NEXT: vl1re16.v v10, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v11, (a6)
-; ZVBB-RV32-NEXT: vl1re16.v v8, (a0)
-; ZVBB-RV32-NEXT: vl1re16.v v9, (a3)
-; ZVBB-RV32-NEXT: vl1re16.v v14, (a4)
-; ZVBB-RV32-NEXT: csrr a0, vlenb
-; ZVBB-RV32-NEXT: li a3, 10
-; ZVBB-RV32-NEXT: mul a0, a0, a3
-; ZVBB-RV32-NEXT: add a0, sp, a0
-; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: vl1re32.v v12, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v15, (a5)
-; ZVBB-RV32-NEXT: vl1re16.v v12, (a6)
-; ZVBB-RV32-NEXT: vl1re16.v v13, (a1)
+; ZVBB-RV32-NEXT: vl1re32.v v13, (a6)
+; ZVBB-RV32-NEXT: csrr a6, vlenb
+; ZVBB-RV32-NEXT: li a7, 12
+; ZVBB-RV32-NEXT: mul a6, a6, a7
+; ZVBB-RV32-NEXT: add a6, sp, a6
+; ZVBB-RV32-NEXT: addi a6, a6, 64
+; ZVBB-RV32-NEXT: vl1re32.v v17, (a1)
+; ZVBB-RV32-NEXT: vl1re32.v v10, (a4)
+; ZVBB-RV32-NEXT: vl1re32.v v11, (a5)
+; ZVBB-RV32-NEXT: vl1re32.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re32.v v9, (a3)
; ZVBB-RV32-NEXT: slli a2, a2, 3
-; ZVBB-RV32-NEXT: add a2, a0, a2
-; ZVBB-RV32-NEXT: vs2r.v v16, (a2)
-; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT: vl8re16.v v16, (a2)
-; ZVBB-RV32-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT: add a2, a6, a2
+; ZVBB-RV32-NEXT: vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a6)
+; ZVBB-RV32-NEXT: vl8re32.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re32.v v8, (a6)
; ZVBB-RV32-NEXT: addi sp, s0, -80
; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: addi sp, sp, 80
; ZVBB-RV32-NEXT: ret
;
-; ZVBB-RV64-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; ZVBB-RV64-LABEL: vector_interleave_nxv24f32_nxv4f32:
; ZVBB-RV64: # %bb.0:
; ZVBB-RV64-NEXT: addi sp, sp, -80
; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -5763,62 +11182,69 @@ define <vscale x 40 x bfloat> @vector_interleave_nxv40bf16_nxv8bf16(<vscale x 8
; ZVBB-RV64-NEXT: mul a0, a0, a1
; ZVBB-RV64-NEXT: sub sp, sp, a0
; ZVBB-RV64-NEXT: andi sp, sp, -64
-; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-RV64-NEXT: vmv2r.v v20, v16
-; ZVBB-RV64-NEXT: addi a0, sp, 64
-; ZVBB-RV64-NEXT: vmv2r.v v18, v12
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v20, v14
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: vmv2r.v v24, v10
; ZVBB-RV64-NEXT: csrr a1, vlenb
-; ZVBB-RV64-NEXT: slli a2, a1, 2
-; ZVBB-RV64-NEXT: add a1, a2, a1
+; ZVBB-RV64-NEXT: li a0, 6
+; ZVBB-RV64-NEXT: mul a1, a1, a0
; ZVBB-RV64-NEXT: add a1, sp, a1
; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: vmv1r.v v10, v25
+; ZVBB-RV64-NEXT: vmv1r.v v11, v23
+; ZVBB-RV64-NEXT: vmv1r.v v12, v21
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv1r.v v13, v17
; ZVBB-RV64-NEXT: csrr a2, vlenb
-; ZVBB-RV64-NEXT: vmv2r.v v16, v8
-; ZVBB-RV64-NEXT: vmv2r.v v22, v16
-; ZVBB-RV64-NEXT: vmv2r.v v24, v18
-; ZVBB-RV64-NEXT: vmv1r.v v26, v20
+; ZVBB-RV64-NEXT: vmv1r.v v14, v19
+; ZVBB-RV64-NEXT: vsseg6e32.v v9, (a1)
+; ZVBB-RV64-NEXT: vmv1r.v v9, v24
+; ZVBB-RV64-NEXT: add a5, a1, a2
+; ZVBB-RV64-NEXT: vmv1r.v v10, v22
; ZVBB-RV64-NEXT: add a3, a0, a2
-; ZVBB-RV64-NEXT: vmv1r.v v23, v10
-; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: vmv1r.v v11, v20
+; ZVBB-RV64-NEXT: add a4, a3, a2
+; ZVBB-RV64-NEXT: vmv1r.v v12, v16
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vmv1r.v v13, v18
+; ZVBB-RV64-NEXT: vsseg6e32.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re32.v v14, (a1)
+; ZVBB-RV64-NEXT: add a1, a6, a2
+; ZVBB-RV64-NEXT: vl1re32.v v15, (a5)
+; ZVBB-RV64-NEXT: add a5, a1, a2
+; ZVBB-RV64-NEXT: vl1re32.v v18, (a5)
+; ZVBB-RV64-NEXT: add a5, a5, a2
+; ZVBB-RV64-NEXT: vl1re32.v v19, (a5)
; ZVBB-RV64-NEXT: add a5, a4, a2
-; ZVBB-RV64-NEXT: vmv1r.v v25, v14
+; ZVBB-RV64-NEXT: vl1re32.v v16, (a6)
; ZVBB-RV64-NEXT: add a6, a5, a2
-; ZVBB-RV64-NEXT: vmv1r.v v18, v11
-; ZVBB-RV64-NEXT: vsseg5e16.v v22, (a0)
-; ZVBB-RV64-NEXT: vmv1r.v v20, v15
-; ZVBB-RV64-NEXT: vsseg5e16.v v17, (a1)
-; ZVBB-RV64-NEXT: vl1re16.v v16, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v17, (a6)
-; ZVBB-RV64-NEXT: add a6, a3, a2
-; ZVBB-RV64-NEXT: vl1re16.v v10, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v11, (a6)
-; ZVBB-RV64-NEXT: vl1re16.v v8, (a0)
-; ZVBB-RV64-NEXT: vl1re16.v v9, (a3)
-; ZVBB-RV64-NEXT: vl1re16.v v14, (a4)
-; ZVBB-RV64-NEXT: csrr a0, vlenb
-; ZVBB-RV64-NEXT: li a3, 10
-; ZVBB-RV64-NEXT: mul a0, a0, a3
-; ZVBB-RV64-NEXT: add a0, sp, a0
-; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: vl1re32.v v12, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v15, (a5)
-; ZVBB-RV64-NEXT: vl1re16.v v12, (a6)
-; ZVBB-RV64-NEXT: vl1re16.v v13, (a1)
+; ZVBB-RV64-NEXT: vl1re32.v v13, (a6)
+; ZVBB-RV64-NEXT: csrr a6, vlenb
+; ZVBB-RV64-NEXT: li a7, 12
+; ZVBB-RV64-NEXT: mul a6, a6, a7
+; ZVBB-RV64-NEXT: add a6, sp, a6
+; ZVBB-RV64-NEXT: addi a6, a6, 64
+; ZVBB-RV64-NEXT: vl1re32.v v17, (a1)
+; ZVBB-RV64-NEXT: vl1re32.v v10, (a4)
+; ZVBB-RV64-NEXT: vl1re32.v v11, (a5)
+; ZVBB-RV64-NEXT: vl1re32.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re32.v v9, (a3)
; ZVBB-RV64-NEXT: slli a2, a2, 3
-; ZVBB-RV64-NEXT: add a2, a0, a2
-; ZVBB-RV64-NEXT: vs2r.v v16, (a2)
-; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT: vl8re16.v v16, (a2)
-; ZVBB-RV64-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT: add a2, a6, a2
+; ZVBB-RV64-NEXT: vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a6)
+; ZVBB-RV64-NEXT: vl8re32.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re32.v v8, (a6)
; ZVBB-RV64-NEXT: addi sp, s0, -80
; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: addi sp, sp, 80
; ZVBB-RV64-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; ZIP-LABEL: vector_interleave_nxv24f32_nxv4f32:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -80
; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -5829,200 +11255,135 @@ define <vscale x 40 x bfloat> @vector_interleave_nxv40bf16_nxv8bf16(<vscale x 8
; ZIP-NEXT: mul a0, a0, a1
; ZIP-NEXT: sub sp, sp, a0
; ZIP-NEXT: andi sp, sp, -64
-; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
-; ZIP-NEXT: vmv2r.v v20, v16
-; ZIP-NEXT: addi a0, sp, 64
-; ZIP-NEXT: vmv2r.v v18, v12
+; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZIP-NEXT: vmv2r.v v20, v14
+; ZIP-NEXT: vmv2r.v v22, v12
+; ZIP-NEXT: vmv2r.v v24, v10
; ZIP-NEXT: csrr a1, vlenb
-; ZIP-NEXT: slli a2, a1, 2
-; ZIP-NEXT: add a1, a2, a1
+; ZIP-NEXT: li a0, 6
+; ZIP-NEXT: mul a1, a1, a0
; ZIP-NEXT: add a1, sp, a1
; ZIP-NEXT: addi a1, a1, 64
+; ZIP-NEXT: vmv1r.v v10, v25
+; ZIP-NEXT: vmv1r.v v11, v23
+; ZIP-NEXT: vmv1r.v v12, v21
+; ZIP-NEXT: addi a0, sp, 64
+; ZIP-NEXT: vmv1r.v v13, v17
; ZIP-NEXT: csrr a2, vlenb
-; ZIP-NEXT: vmv2r.v v16, v8
-; ZIP-NEXT: vmv2r.v v22, v16
-; ZIP-NEXT: vmv2r.v v24, v18
-; ZIP-NEXT: vmv1r.v v26, v20
+; ZIP-NEXT: vmv1r.v v14, v19
+; ZIP-NEXT: vsseg6e32.v v9, (a1)
+; ZIP-NEXT: vmv1r.v v9, v24
+; ZIP-NEXT: add a5, a1, a2
+; ZIP-NEXT: vmv1r.v v10, v22
; ZIP-NEXT: add a3, a0, a2
-; ZIP-NEXT: vmv1r.v v23, v10
-; ZIP-NEXT: add a4, a1, a2
+; ZIP-NEXT: vmv1r.v v11, v20
+; ZIP-NEXT: add a4, a3, a2
+; ZIP-NEXT: vmv1r.v v12, v16
+; ZIP-NEXT: add a6, a5, a2
+; ZIP-NEXT: vmv1r.v v13, v18
+; ZIP-NEXT: vsseg6e32.v v8, (a0)
+; ZIP-NEXT: vl1re32.v v14, (a1)
+; ZIP-NEXT: add a1, a6, a2
+; ZIP-NEXT: vl1re32.v v15, (a5)
+; ZIP-NEXT: add a5, a1, a2
+; ZIP-NEXT: vl1re32.v v18, (a5)
+; ZIP-NEXT: add a5, a5, a2
+; ZIP-NEXT: vl1re32.v v19, (a5)
; ZIP-NEXT: add a5, a4, a2
-; ZIP-NEXT: vmv1r.v v25, v14
+; ZIP-NEXT: vl1re32.v v16, (a6)
; ZIP-NEXT: add a6, a5, a2
-; ZIP-NEXT: vmv1r.v v18, v11
-; ZIP-NEXT: vsseg5e16.v v22, (a0)
-; ZIP-NEXT: vmv1r.v v20, v15
-; ZIP-NEXT: vsseg5e16.v v17, (a1)
-; ZIP-NEXT: vl1re16.v v16, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v17, (a6)
-; ZIP-NEXT: add a6, a3, a2
-; ZIP-NEXT: vl1re16.v v10, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v11, (a6)
-; ZIP-NEXT: vl1re16.v v8, (a0)
-; ZIP-NEXT: vl1re16.v v9, (a3)
-; ZIP-NEXT: vl1re16.v v14, (a4)
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: li a3, 10
-; ZIP-NEXT: mul a0, a0, a3
-; ZIP-NEXT: add a0, sp, a0
-; ZIP-NEXT: addi a0, a0, 64
+; ZIP-NEXT: vl1re32.v v12, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v15, (a5)
-; ZIP-NEXT: vl1re16.v v12, (a6)
-; ZIP-NEXT: vl1re16.v v13, (a1)
+; ZIP-NEXT: vl1re32.v v13, (a6)
+; ZIP-NEXT: csrr a6, vlenb
+; ZIP-NEXT: li a7, 12
+; ZIP-NEXT: mul a6, a6, a7
+; ZIP-NEXT: add a6, sp, a6
+; ZIP-NEXT: addi a6, a6, 64
+; ZIP-NEXT: vl1re32.v v17, (a1)
+; ZIP-NEXT: vl1re32.v v10, (a4)
+; ZIP-NEXT: vl1re32.v v11, (a5)
+; ZIP-NEXT: vl1re32.v v8, (a0)
+; ZIP-NEXT: vl1re32.v v9, (a3)
; ZIP-NEXT: slli a2, a2, 3
-; ZIP-NEXT: add a2, a0, a2
-; ZIP-NEXT: vs2r.v v16, (a2)
-; ZIP-NEXT: vs8r.v v8, (a0)
-; ZIP-NEXT: vl8re16.v v16, (a2)
-; ZIP-NEXT: vl8re16.v v8, (a0)
+; ZIP-NEXT: add a2, a6, a2
+; ZIP-NEXT: vs4r.v v16, (a2)
+; ZIP-NEXT: vs8r.v v8, (a6)
+; ZIP-NEXT: vl8re32.v v16, (a2)
+; ZIP-NEXT: vl8re32.v v8, (a6)
; ZIP-NEXT: addi sp, s0, -80
; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZIP-NEXT: addi sp, sp, 80
; ZIP-NEXT: ret
- %res = call <vscale x 40 x bfloat> @llvm.vector.interleave5.nxv40bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4)
- ret <vscale x 40 x bfloat> %res
-}
-
-define <vscale x 5 x float> @vector_interleave_nxv5f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4) nounwind {
-; CHECK-LABEL: vector_interleave_nxv5f32_nxv1f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 1
-; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: srli a2, a1, 1
-; CHECK-NEXT: add a3, a0, a2
-; CHECK-NEXT: add a4, a3, a2
-; CHECK-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vsseg5e32.v v8, (a0)
-; CHECK-NEXT: add a5, a4, a2
-; CHECK-NEXT: vle32.v v8, (a5)
-; CHECK-NEXT: vle32.v v9, (a4)
-; CHECK-NEXT: srli a1, a1, 3
-; CHECK-NEXT: add a4, a1, a1
-; CHECK-NEXT: vle32.v v10, (a3)
-; CHECK-NEXT: vsetvli zero, a4, e32, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v9, v8, a1
-; CHECK-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vsetvli zero, a4, e32, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v10, a1
-; CHECK-NEXT: add a2, a5, a2
-; CHECK-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v10, (a2)
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 1
-; CHECK-NEXT: add a0, a1, a0
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-;
-; ZVBB-LABEL: vector_interleave_nxv5f32_nxv1f32:
-; ZVBB: # %bb.0:
-; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 1
-; ZVBB-NEXT: add a0, a1, a0
-; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: addi a0, sp, 16
-; ZVBB-NEXT: csrr a1, vlenb
-; ZVBB-NEXT: srli a2, a1, 1
-; ZVBB-NEXT: add a3, a0, a2
-; ZVBB-NEXT: add a4, a3, a2
-; ZVBB-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
-; ZVBB-NEXT: vsseg5e32.v v8, (a0)
-; ZVBB-NEXT: add a5, a4, a2
-; ZVBB-NEXT: vle32.v v8, (a5)
-; ZVBB-NEXT: vle32.v v9, (a4)
-; ZVBB-NEXT: srli a1, a1, 3
-; ZVBB-NEXT: add a4, a1, a1
-; ZVBB-NEXT: vle32.v v10, (a3)
-; ZVBB-NEXT: vsetvli zero, a4, e32, m1, ta, ma
-; ZVBB-NEXT: vslideup.vx v9, v8, a1
-; ZVBB-NEXT: vsetvli a3, zero, e32, mf2, ta, ma
-; ZVBB-NEXT: vle32.v v8, (a0)
-; ZVBB-NEXT: vsetvli zero, a4, e32, m1, ta, ma
-; ZVBB-NEXT: vslideup.vx v8, v10, a1
-; ZVBB-NEXT: add a2, a5, a2
-; ZVBB-NEXT: vsetvli a0, zero, e32, mf2, ta, ma
-; ZVBB-NEXT: vle32.v v10, (a2)
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 1
-; ZVBB-NEXT: add a0, a1, a0
-; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: ret
- %res = call <vscale x 5 x float> @llvm.vector.interleave5.nxv5f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4)
- ret <vscale x 5 x float> %res
+ %res = call <vscale x 24 x float> @llvm.vector.interleave6.nxv24f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5)
+ ret <vscale x 24 x float> %res
}
-define <vscale x 10 x float> @vector_interleave_nxv10f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4) nounwind {
-; CHECK-LABEL: vector_interleave_nxv10f32_nxv2f32:
+define <vscale x 6 x double> @vector_interleave_nxv6f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5) nounwind {
+; CHECK-LABEL: vector_interleave_nxv6f64_nxv1f64:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 2
-; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: add a2, a0, a1
; CHECK-NEXT: add a3, a2, a1
-; CHECK-NEXT: vsetvli a4, zero, e32, m1, ta, ma
-; CHECK-NEXT: vsseg5e32.v v8, (a0)
-; CHECK-NEXT: vl1re32.v v10, (a3)
+; CHECK-NEXT: vsetvli a4, zero, e64, m1, ta, ma
+; CHECK-NEXT: vsseg6e64.v v8, (a0)
+; CHECK-NEXT: vl1re64.v v10, (a3)
; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: vl1re32.v v11, (a3)
-; CHECK-NEXT: vl1re32.v v8, (a0)
-; CHECK-NEXT: vl1re32.v v9, (a2)
+; CHECK-NEXT: vl1re64.v v11, (a3)
+; CHECK-NEXT: add a3, a3, a1
+; CHECK-NEXT: vl1re64.v v8, (a0)
+; CHECK-NEXT: vl1re64.v v9, (a2)
+; CHECK-NEXT: vl1re64.v v12, (a3)
; CHECK-NEXT: add a1, a3, a1
-; CHECK-NEXT: vl1re32.v v12, (a1)
+; CHECK-NEXT: vl1re64.v v13, (a1)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 2
-; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: li a1, 6
+; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv10f32_nxv2f32:
+; ZVBB-LABEL: vector_interleave_nxv6f64_nxv1f64:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 2
-; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
; ZVBB-NEXT: sub sp, sp, a0
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: add a2, a0, a1
; ZVBB-NEXT: add a3, a2, a1
-; ZVBB-NEXT: vsetvli a4, zero, e32, m1, ta, ma
-; ZVBB-NEXT: vsseg5e32.v v8, (a0)
-; ZVBB-NEXT: vl1re32.v v10, (a3)
+; ZVBB-NEXT: vsetvli a4, zero, e64, m1, ta, ma
+; ZVBB-NEXT: vsseg6e64.v v8, (a0)
+; ZVBB-NEXT: vl1re64.v v10, (a3)
; ZVBB-NEXT: add a3, a3, a1
-; ZVBB-NEXT: vl1re32.v v11, (a3)
-; ZVBB-NEXT: vl1re32.v v8, (a0)
-; ZVBB-NEXT: vl1re32.v v9, (a2)
+; ZVBB-NEXT: vl1re64.v v11, (a3)
+; ZVBB-NEXT: add a3, a3, a1
+; ZVBB-NEXT: vl1re64.v v8, (a0)
+; ZVBB-NEXT: vl1re64.v v9, (a2)
+; ZVBB-NEXT: vl1re64.v v12, (a3)
; ZVBB-NEXT: add a1, a3, a1
-; ZVBB-NEXT: vl1re32.v v12, (a1)
+; ZVBB-NEXT: vl1re64.v v13, (a1)
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 2
-; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: li a1, 6
+; ZVBB-NEXT: mul a0, a0, a1
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 10 x float> @llvm.vector.interleave5.nxv10f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4)
- ret <vscale x 10 x float> %res
+ %res = call <vscale x 6 x double> @llvm.vector.interleave6.nxv6f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5)
+ ret <vscale x 6 x double> %res
}
-define <vscale x 20 x float> @vector_interleave_nxv20f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4) nounwind {
-; RV32-LABEL: vector_interleave_nxv20f32_nxv4f32:
+define <vscale x 12 x double> @vector_interleave_nxv12f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5) nounwind {
+; RV32-LABEL: vector_interleave_nxv12f64_nxv2f64:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -80
; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
@@ -6033,62 +11394,69 @@ define <vscale x 20 x float> @vector_interleave_nxv20f32_nxv4f32(<vscale x 4 x f
; RV32-NEXT: mul a0, a0, a1
; RV32-NEXT: sub sp, sp, a0
; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; RV32-NEXT: vmv2r.v v20, v16
-; RV32-NEXT: addi a0, sp, 64
-; RV32-NEXT: vmv2r.v v18, v12
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT: vmv2r.v v20, v14
+; RV32-NEXT: vmv2r.v v22, v12
+; RV32-NEXT: vmv2r.v v24, v10
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a2, a1, 2
-; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: li a0, 6
+; RV32-NEXT: mul a1, a1, a0
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: vmv1r.v v10, v25
+; RV32-NEXT: vmv1r.v v11, v23
+; RV32-NEXT: vmv1r.v v12, v21
+; RV32-NEXT: addi a0, sp, 64
+; RV32-NEXT: vmv1r.v v13, v17
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: vmv2r.v v16, v8
-; RV32-NEXT: vmv2r.v v22, v16
-; RV32-NEXT: vmv2r.v v24, v18
-; RV32-NEXT: vmv1r.v v26, v20
+; RV32-NEXT: vmv1r.v v14, v19
+; RV32-NEXT: vsseg6e64.v v9, (a1)
+; RV32-NEXT: vmv1r.v v9, v24
+; RV32-NEXT: add a5, a1, a2
+; RV32-NEXT: vmv1r.v v10, v22
; RV32-NEXT: add a3, a0, a2
-; RV32-NEXT: vmv1r.v v23, v10
-; RV32-NEXT: add a4, a1, a2
+; RV32-NEXT: vmv1r.v v11, v20
+; RV32-NEXT: add a4, a3, a2
+; RV32-NEXT: vmv1r.v v12, v16
+; RV32-NEXT: add a6, a5, a2
+; RV32-NEXT: vmv1r.v v13, v18
+; RV32-NEXT: vsseg6e64.v v8, (a0)
+; RV32-NEXT: vl1re64.v v14, (a1)
+; RV32-NEXT: add a1, a6, a2
+; RV32-NEXT: vl1re64.v v15, (a5)
+; RV32-NEXT: add a5, a1, a2
+; RV32-NEXT: vl1re64.v v18, (a5)
+; RV32-NEXT: add a5, a5, a2
+; RV32-NEXT: vl1re64.v v19, (a5)
; RV32-NEXT: add a5, a4, a2
-; RV32-NEXT: vmv1r.v v25, v14
+; RV32-NEXT: vl1re64.v v16, (a6)
; RV32-NEXT: add a6, a5, a2
-; RV32-NEXT: vmv1r.v v18, v11
-; RV32-NEXT: vsseg5e32.v v22, (a0)
-; RV32-NEXT: vmv1r.v v20, v15
-; RV32-NEXT: vsseg5e32.v v17, (a1)
-; RV32-NEXT: vl1re32.v v16, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re32.v v17, (a6)
-; RV32-NEXT: add a6, a3, a2
-; RV32-NEXT: vl1re32.v v10, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re32.v v11, (a6)
-; RV32-NEXT: vl1re32.v v8, (a0)
-; RV32-NEXT: vl1re32.v v9, (a3)
-; RV32-NEXT: vl1re32.v v14, (a4)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a3, 10
-; RV32-NEXT: mul a0, a0, a3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 64
+; RV32-NEXT: vl1re64.v v12, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re32.v v15, (a5)
-; RV32-NEXT: vl1re32.v v12, (a6)
-; RV32-NEXT: vl1re32.v v13, (a1)
+; RV32-NEXT: vl1re64.v v13, (a6)
+; RV32-NEXT: csrr a6, vlenb
+; RV32-NEXT: li a7, 12
+; RV32-NEXT: mul a6, a6, a7
+; RV32-NEXT: add a6, sp, a6
+; RV32-NEXT: addi a6, a6, 64
+; RV32-NEXT: vl1re64.v v17, (a1)
+; RV32-NEXT: vl1re64.v v10, (a4)
+; RV32-NEXT: vl1re64.v v11, (a5)
+; RV32-NEXT: vl1re64.v v8, (a0)
+; RV32-NEXT: vl1re64.v v9, (a3)
; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a0, a2
-; RV32-NEXT: vs2r.v v16, (a2)
-; RV32-NEXT: vs8r.v v8, (a0)
-; RV32-NEXT: vl8re32.v v16, (a2)
-; RV32-NEXT: vl8re32.v v8, (a0)
+; RV32-NEXT: add a2, a6, a2
+; RV32-NEXT: vs4r.v v16, (a2)
+; RV32-NEXT: vs8r.v v8, (a6)
+; RV32-NEXT: vl8re64.v v16, (a2)
+; RV32-NEXT: vl8re64.v v8, (a6)
; RV32-NEXT: addi sp, s0, -80
; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 80
; RV32-NEXT: ret
;
-; RV64-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; RV64-LABEL: vector_interleave_nxv12f64_nxv2f64:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -80
; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -6099,62 +11467,69 @@ define <vscale x 20 x float> @vector_interleave_nxv20f32_nxv4f32(<vscale x 4 x f
; RV64-NEXT: mul a0, a0, a1
; RV64-NEXT: sub sp, sp, a0
; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; RV64-NEXT: vmv2r.v v20, v16
-; RV64-NEXT: addi a0, sp, 64
-; RV64-NEXT: vmv2r.v v18, v12
+; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; RV64-NEXT: vmv2r.v v20, v14
+; RV64-NEXT: vmv2r.v v22, v12
+; RV64-NEXT: vmv2r.v v24, v10
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a2, a1, 2
-; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: li a0, 6
+; RV64-NEXT: mul a1, a1, a0
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 64
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: vmv2r.v v16, v8
-; RV64-NEXT: vmv2r.v v22, v16
-; RV64-NEXT: vmv2r.v v24, v18
-; RV64-NEXT: vmv1r.v v26, v20
-; RV64-NEXT: add a3, a0, a2
-; RV64-NEXT: vmv1r.v v23, v10
-; RV64-NEXT: add a4, a1, a2
-; RV64-NEXT: add a5, a4, a2
-; RV64-NEXT: vmv1r.v v25, v14
-; RV64-NEXT: add a6, a5, a2
-; RV64-NEXT: vmv1r.v v18, v11
-; RV64-NEXT: vsseg5e32.v v22, (a0)
-; RV64-NEXT: vmv1r.v v20, v15
-; RV64-NEXT: vsseg5e32.v v17, (a1)
-; RV64-NEXT: vl1re32.v v16, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re32.v v17, (a6)
-; RV64-NEXT: add a6, a3, a2
-; RV64-NEXT: vl1re32.v v10, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re32.v v11, (a6)
-; RV64-NEXT: vl1re32.v v8, (a0)
-; RV64-NEXT: vl1re32.v v9, (a3)
-; RV64-NEXT: vl1re32.v v14, (a4)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: li a3, 10
-; RV64-NEXT: mul a0, a0, a3
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 64
+; RV64-NEXT: vmv1r.v v10, v25
+; RV64-NEXT: vmv1r.v v11, v23
+; RV64-NEXT: vmv1r.v v12, v21
+; RV64-NEXT: addi a0, sp, 64
+; RV64-NEXT: vmv1r.v v13, v17
+; RV64-NEXT: csrr a2, vlenb
+; RV64-NEXT: vmv1r.v v14, v19
+; RV64-NEXT: vsseg6e64.v v9, (a1)
+; RV64-NEXT: vmv1r.v v9, v24
+; RV64-NEXT: add a5, a1, a2
+; RV64-NEXT: vmv1r.v v10, v22
+; RV64-NEXT: add a3, a0, a2
+; RV64-NEXT: vmv1r.v v11, v20
+; RV64-NEXT: add a4, a3, a2
+; RV64-NEXT: vmv1r.v v12, v16
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vmv1r.v v13, v18
+; RV64-NEXT: vsseg6e64.v v8, (a0)
+; RV64-NEXT: vl1re64.v v14, (a1)
+; RV64-NEXT: add a1, a6, a2
+; RV64-NEXT: vl1re64.v v15, (a5)
+; RV64-NEXT: add a5, a1, a2
+; RV64-NEXT: vl1re64.v v18, (a5)
+; RV64-NEXT: add a5, a5, a2
+; RV64-NEXT: vl1re64.v v19, (a5)
+; RV64-NEXT: add a5, a4, a2
+; RV64-NEXT: vl1re64.v v16, (a6)
+; RV64-NEXT: add a6, a5, a2
+; RV64-NEXT: vl1re64.v v12, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re32.v v15, (a5)
-; RV64-NEXT: vl1re32.v v12, (a6)
-; RV64-NEXT: vl1re32.v v13, (a1)
+; RV64-NEXT: vl1re64.v v13, (a6)
+; RV64-NEXT: csrr a6, vlenb
+; RV64-NEXT: li a7, 12
+; RV64-NEXT: mul a6, a6, a7
+; RV64-NEXT: add a6, sp, a6
+; RV64-NEXT: addi a6, a6, 64
+; RV64-NEXT: vl1re64.v v17, (a1)
+; RV64-NEXT: vl1re64.v v10, (a4)
+; RV64-NEXT: vl1re64.v v11, (a5)
+; RV64-NEXT: vl1re64.v v8, (a0)
+; RV64-NEXT: vl1re64.v v9, (a3)
; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a0, a2
-; RV64-NEXT: vs2r.v v16, (a2)
-; RV64-NEXT: vs8r.v v8, (a0)
-; RV64-NEXT: vl8re32.v v16, (a2)
-; RV64-NEXT: vl8re32.v v8, (a0)
+; RV64-NEXT: add a2, a6, a2
+; RV64-NEXT: vs4r.v v16, (a2)
+; RV64-NEXT: vs8r.v v8, (a6)
+; RV64-NEXT: vl8re64.v v16, (a2)
+; RV64-NEXT: vl8re64.v v8, (a6)
; RV64-NEXT: addi sp, s0, -80
; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: ret
;
-; ZVBB-RV32-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; ZVBB-RV32-LABEL: vector_interleave_nxv12f64_nxv2f64:
; ZVBB-RV32: # %bb.0:
; ZVBB-RV32-NEXT: addi sp, sp, -80
; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
@@ -6165,62 +11540,69 @@ define <vscale x 20 x float> @vector_interleave_nxv20f32_nxv4f32(<vscale x 4 x f
; ZVBB-RV32-NEXT: mul a0, a0, a1
; ZVBB-RV32-NEXT: sub sp, sp, a0
; ZVBB-RV32-NEXT: andi sp, sp, -64
-; ZVBB-RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; ZVBB-RV32-NEXT: vmv2r.v v20, v16
-; ZVBB-RV32-NEXT: addi a0, sp, 64
-; ZVBB-RV32-NEXT: vmv2r.v v18, v12
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v20, v14
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
+; ZVBB-RV32-NEXT: vmv2r.v v24, v10
; ZVBB-RV32-NEXT: csrr a1, vlenb
-; ZVBB-RV32-NEXT: slli a2, a1, 2
-; ZVBB-RV32-NEXT: add a1, a2, a1
+; ZVBB-RV32-NEXT: li a0, 6
+; ZVBB-RV32-NEXT: mul a1, a1, a0
; ZVBB-RV32-NEXT: add a1, sp, a1
; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: vmv1r.v v10, v25
+; ZVBB-RV32-NEXT: vmv1r.v v11, v23
+; ZVBB-RV32-NEXT: vmv1r.v v12, v21
+; ZVBB-RV32-NEXT: addi a0, sp, 64
+; ZVBB-RV32-NEXT: vmv1r.v v13, v17
; ZVBB-RV32-NEXT: csrr a2, vlenb
-; ZVBB-RV32-NEXT: vmv2r.v v16, v8
-; ZVBB-RV32-NEXT: vmv2r.v v22, v16
-; ZVBB-RV32-NEXT: vmv2r.v v24, v18
-; ZVBB-RV32-NEXT: vmv1r.v v26, v20
+; ZVBB-RV32-NEXT: vmv1r.v v14, v19
+; ZVBB-RV32-NEXT: vsseg6e64.v v9, (a1)
+; ZVBB-RV32-NEXT: vmv1r.v v9, v24
+; ZVBB-RV32-NEXT: add a5, a1, a2
+; ZVBB-RV32-NEXT: vmv1r.v v10, v22
; ZVBB-RV32-NEXT: add a3, a0, a2
-; ZVBB-RV32-NEXT: vmv1r.v v23, v10
-; ZVBB-RV32-NEXT: add a4, a1, a2
+; ZVBB-RV32-NEXT: vmv1r.v v11, v20
+; ZVBB-RV32-NEXT: add a4, a3, a2
+; ZVBB-RV32-NEXT: vmv1r.v v12, v16
+; ZVBB-RV32-NEXT: add a6, a5, a2
+; ZVBB-RV32-NEXT: vmv1r.v v13, v18
+; ZVBB-RV32-NEXT: vsseg6e64.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re64.v v14, (a1)
+; ZVBB-RV32-NEXT: add a1, a6, a2
+; ZVBB-RV32-NEXT: vl1re64.v v15, (a5)
+; ZVBB-RV32-NEXT: add a5, a1, a2
+; ZVBB-RV32-NEXT: vl1re64.v v18, (a5)
+; ZVBB-RV32-NEXT: add a5, a5, a2
+; ZVBB-RV32-NEXT: vl1re64.v v19, (a5)
; ZVBB-RV32-NEXT: add a5, a4, a2
-; ZVBB-RV32-NEXT: vmv1r.v v25, v14
+; ZVBB-RV32-NEXT: vl1re64.v v16, (a6)
; ZVBB-RV32-NEXT: add a6, a5, a2
-; ZVBB-RV32-NEXT: vmv1r.v v18, v11
-; ZVBB-RV32-NEXT: vsseg5e32.v v22, (a0)
-; ZVBB-RV32-NEXT: vmv1r.v v20, v15
-; ZVBB-RV32-NEXT: vsseg5e32.v v17, (a1)
-; ZVBB-RV32-NEXT: vl1re32.v v16, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re32.v v17, (a6)
-; ZVBB-RV32-NEXT: add a6, a3, a2
-; ZVBB-RV32-NEXT: vl1re32.v v10, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re32.v v11, (a6)
-; ZVBB-RV32-NEXT: vl1re32.v v8, (a0)
-; ZVBB-RV32-NEXT: vl1re32.v v9, (a3)
-; ZVBB-RV32-NEXT: vl1re32.v v14, (a4)
-; ZVBB-RV32-NEXT: csrr a0, vlenb
-; ZVBB-RV32-NEXT: li a3, 10
-; ZVBB-RV32-NEXT: mul a0, a0, a3
-; ZVBB-RV32-NEXT: add a0, sp, a0
-; ZVBB-RV32-NEXT: addi a0, a0, 64
+; ZVBB-RV32-NEXT: vl1re64.v v12, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re32.v v15, (a5)
-; ZVBB-RV32-NEXT: vl1re32.v v12, (a6)
-; ZVBB-RV32-NEXT: vl1re32.v v13, (a1)
+; ZVBB-RV32-NEXT: vl1re64.v v13, (a6)
+; ZVBB-RV32-NEXT: csrr a6, vlenb
+; ZVBB-RV32-NEXT: li a7, 12
+; ZVBB-RV32-NEXT: mul a6, a6, a7
+; ZVBB-RV32-NEXT: add a6, sp, a6
+; ZVBB-RV32-NEXT: addi a6, a6, 64
+; ZVBB-RV32-NEXT: vl1re64.v v17, (a1)
+; ZVBB-RV32-NEXT: vl1re64.v v10, (a4)
+; ZVBB-RV32-NEXT: vl1re64.v v11, (a5)
+; ZVBB-RV32-NEXT: vl1re64.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re64.v v9, (a3)
; ZVBB-RV32-NEXT: slli a2, a2, 3
-; ZVBB-RV32-NEXT: add a2, a0, a2
-; ZVBB-RV32-NEXT: vs2r.v v16, (a2)
-; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT: vl8re32.v v16, (a2)
-; ZVBB-RV32-NEXT: vl8re32.v v8, (a0)
+; ZVBB-RV32-NEXT: add a2, a6, a2
+; ZVBB-RV32-NEXT: vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT: vs8r.v v8, (a6)
+; ZVBB-RV32-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re64.v v8, (a6)
; ZVBB-RV32-NEXT: addi sp, s0, -80
; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: addi sp, sp, 80
; ZVBB-RV32-NEXT: ret
;
-; ZVBB-RV64-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; ZVBB-RV64-LABEL: vector_interleave_nxv12f64_nxv2f64:
; ZVBB-RV64: # %bb.0:
; ZVBB-RV64-NEXT: addi sp, sp, -80
; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -6231,62 +11613,69 @@ define <vscale x 20 x float> @vector_interleave_nxv20f32_nxv4f32(<vscale x 4 x f
; ZVBB-RV64-NEXT: mul a0, a0, a1
; ZVBB-RV64-NEXT: sub sp, sp, a0
; ZVBB-RV64-NEXT: andi sp, sp, -64
-; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; ZVBB-RV64-NEXT: vmv2r.v v20, v16
-; ZVBB-RV64-NEXT: addi a0, sp, 64
-; ZVBB-RV64-NEXT: vmv2r.v v18, v12
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v20, v14
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
+; ZVBB-RV64-NEXT: vmv2r.v v24, v10
; ZVBB-RV64-NEXT: csrr a1, vlenb
-; ZVBB-RV64-NEXT: slli a2, a1, 2
-; ZVBB-RV64-NEXT: add a1, a2, a1
+; ZVBB-RV64-NEXT: li a0, 6
+; ZVBB-RV64-NEXT: mul a1, a1, a0
; ZVBB-RV64-NEXT: add a1, sp, a1
; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: vmv1r.v v10, v25
+; ZVBB-RV64-NEXT: vmv1r.v v11, v23
+; ZVBB-RV64-NEXT: vmv1r.v v12, v21
+; ZVBB-RV64-NEXT: addi a0, sp, 64
+; ZVBB-RV64-NEXT: vmv1r.v v13, v17
; ZVBB-RV64-NEXT: csrr a2, vlenb
-; ZVBB-RV64-NEXT: vmv2r.v v16, v8
-; ZVBB-RV64-NEXT: vmv2r.v v22, v16
-; ZVBB-RV64-NEXT: vmv2r.v v24, v18
-; ZVBB-RV64-NEXT: vmv1r.v v26, v20
+; ZVBB-RV64-NEXT: vmv1r.v v14, v19
+; ZVBB-RV64-NEXT: vsseg6e64.v v9, (a1)
+; ZVBB-RV64-NEXT: vmv1r.v v9, v24
+; ZVBB-RV64-NEXT: add a5, a1, a2
+; ZVBB-RV64-NEXT: vmv1r.v v10, v22
; ZVBB-RV64-NEXT: add a3, a0, a2
-; ZVBB-RV64-NEXT: vmv1r.v v23, v10
-; ZVBB-RV64-NEXT: add a4, a1, a2
+; ZVBB-RV64-NEXT: vmv1r.v v11, v20
+; ZVBB-RV64-NEXT: add a4, a3, a2
+; ZVBB-RV64-NEXT: vmv1r.v v12, v16
+; ZVBB-RV64-NEXT: add a6, a5, a2
+; ZVBB-RV64-NEXT: vmv1r.v v13, v18
+; ZVBB-RV64-NEXT: vsseg6e64.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re64.v v14, (a1)
+; ZVBB-RV64-NEXT: add a1, a6, a2
+; ZVBB-RV64-NEXT: vl1re64.v v15, (a5)
+; ZVBB-RV64-NEXT: add a5, a1, a2
+; ZVBB-RV64-NEXT: vl1re64.v v18, (a5)
+; ZVBB-RV64-NEXT: add a5, a5, a2
+; ZVBB-RV64-NEXT: vl1re64.v v19, (a5)
; ZVBB-RV64-NEXT: add a5, a4, a2
-; ZVBB-RV64-NEXT: vmv1r.v v25, v14
+; ZVBB-RV64-NEXT: vl1re64.v v16, (a6)
; ZVBB-RV64-NEXT: add a6, a5, a2
-; ZVBB-RV64-NEXT: vmv1r.v v18, v11
-; ZVBB-RV64-NEXT: vsseg5e32.v v22, (a0)
-; ZVBB-RV64-NEXT: vmv1r.v v20, v15
-; ZVBB-RV64-NEXT: vsseg5e32.v v17, (a1)
-; ZVBB-RV64-NEXT: vl1re32.v v16, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re32.v v17, (a6)
-; ZVBB-RV64-NEXT: add a6, a3, a2
-; ZVBB-RV64-NEXT: vl1re32.v v10, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re32.v v11, (a6)
-; ZVBB-RV64-NEXT: vl1re32.v v8, (a0)
-; ZVBB-RV64-NEXT: vl1re32.v v9, (a3)
-; ZVBB-RV64-NEXT: vl1re32.v v14, (a4)
-; ZVBB-RV64-NEXT: csrr a0, vlenb
-; ZVBB-RV64-NEXT: li a3, 10
-; ZVBB-RV64-NEXT: mul a0, a0, a3
-; ZVBB-RV64-NEXT: add a0, sp, a0
-; ZVBB-RV64-NEXT: addi a0, a0, 64
+; ZVBB-RV64-NEXT: vl1re64.v v12, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re32.v v15, (a5)
-; ZVBB-RV64-NEXT: vl1re32.v v12, (a6)
-; ZVBB-RV64-NEXT: vl1re32.v v13, (a1)
+; ZVBB-RV64-NEXT: vl1re64.v v13, (a6)
+; ZVBB-RV64-NEXT: csrr a6, vlenb
+; ZVBB-RV64-NEXT: li a7, 12
+; ZVBB-RV64-NEXT: mul a6, a6, a7
+; ZVBB-RV64-NEXT: add a6, sp, a6
+; ZVBB-RV64-NEXT: addi a6, a6, 64
+; ZVBB-RV64-NEXT: vl1re64.v v17, (a1)
+; ZVBB-RV64-NEXT: vl1re64.v v10, (a4)
+; ZVBB-RV64-NEXT: vl1re64.v v11, (a5)
+; ZVBB-RV64-NEXT: vl1re64.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re64.v v9, (a3)
; ZVBB-RV64-NEXT: slli a2, a2, 3
-; ZVBB-RV64-NEXT: add a2, a0, a2
-; ZVBB-RV64-NEXT: vs2r.v v16, (a2)
-; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT: vl8re32.v v16, (a2)
-; ZVBB-RV64-NEXT: vl8re32.v v8, (a0)
+; ZVBB-RV64-NEXT: add a2, a6, a2
+; ZVBB-RV64-NEXT: vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT: vs8r.v v8, (a6)
+; ZVBB-RV64-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re64.v v8, (a6)
; ZVBB-RV64-NEXT: addi sp, s0, -80
; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: addi sp, sp, 80
; ZVBB-RV64-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; ZIP-LABEL: vector_interleave_nxv12f64_nxv2f64:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -80
; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -6297,458 +11686,637 @@ define <vscale x 20 x float> @vector_interleave_nxv20f32_nxv4f32(<vscale x 4 x f
; ZIP-NEXT: mul a0, a0, a1
; ZIP-NEXT: sub sp, sp, a0
; ZIP-NEXT: andi sp, sp, -64
-; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma
-; ZIP-NEXT: vmv2r.v v20, v16
-; ZIP-NEXT: addi a0, sp, 64
-; ZIP-NEXT: vmv2r.v v18, v12
+; ZIP-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZIP-NEXT: vmv2r.v v20, v14
+; ZIP-NEXT: vmv2r.v v22, v12
+; ZIP-NEXT: vmv2r.v v24, v10
; ZIP-NEXT: csrr a1, vlenb
-; ZIP-NEXT: slli a2, a1, 2
-; ZIP-NEXT: add a1, a2, a1
+; ZIP-NEXT: li a0, 6
+; ZIP-NEXT: mul a1, a1, a0
; ZIP-NEXT: add a1, sp, a1
; ZIP-NEXT: addi a1, a1, 64
+; ZIP-NEXT: vmv1r.v v10, v25
+; ZIP-NEXT: vmv1r.v v11, v23
+; ZIP-NEXT: vmv1r.v v12, v21
+; ZIP-NEXT: addi a0, sp, 64
+; ZIP-NEXT: vmv1r.v v13, v17
; ZIP-NEXT: csrr a2, vlenb
-; ZIP-NEXT: vmv2r.v v16, v8
-; ZIP-NEXT: vmv2r.v v22, v16
-; ZIP-NEXT: vmv2r.v v24, v18
-; ZIP-NEXT: vmv1r.v v26, v20
+; ZIP-NEXT: vmv1r.v v14, v19
+; ZIP-NEXT: vsseg6e64.v v9, (a1)
+; ZIP-NEXT: vmv1r.v v9, v24
+; ZIP-NEXT: add a5, a1, a2
+; ZIP-NEXT: vmv1r.v v10, v22
; ZIP-NEXT: add a3, a0, a2
-; ZIP-NEXT: vmv1r.v v23, v10
-; ZIP-NEXT: add a4, a1, a2
+; ZIP-NEXT: vmv1r.v v11, v20
+; ZIP-NEXT: add a4, a3, a2
+; ZIP-NEXT: vmv1r.v v12, v16
+; ZIP-NEXT: add a6, a5, a2
+; ZIP-NEXT: vmv1r.v v13, v18
+; ZIP-NEXT: vsseg6e64.v v8, (a0)
+; ZIP-NEXT: vl1re64.v v14, (a1)
+; ZIP-NEXT: add a1, a6, a2
+; ZIP-NEXT: vl1re64.v v15, (a5)
+; ZIP-NEXT: add a5, a1, a2
+; ZIP-NEXT: vl1re64.v v18, (a5)
+; ZIP-NEXT: add a5, a5, a2
+; ZIP-NEXT: vl1re64.v v19, (a5)
; ZIP-NEXT: add a5, a4, a2
-; ZIP-NEXT: vmv1r.v v25, v14
+; ZIP-NEXT: vl1re64.v v16, (a6)
; ZIP-NEXT: add a6, a5, a2
-; ZIP-NEXT: vmv1r.v v18, v11
-; ZIP-NEXT: vsseg5e32.v v22, (a0)
-; ZIP-NEXT: vmv1r.v v20, v15
-; ZIP-NEXT: vsseg5e32.v v17, (a1)
-; ZIP-NEXT: vl1re32.v v16, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re32.v v17, (a6)
-; ZIP-NEXT: add a6, a3, a2
-; ZIP-NEXT: vl1re32.v v10, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re32.v v11, (a6)
-; ZIP-NEXT: vl1re32.v v8, (a0)
-; ZIP-NEXT: vl1re32.v v9, (a3)
-; ZIP-NEXT: vl1re32.v v14, (a4)
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: li a3, 10
-; ZIP-NEXT: mul a0, a0, a3
-; ZIP-NEXT: add a0, sp, a0
-; ZIP-NEXT: addi a0, a0, 64
+; ZIP-NEXT: vl1re64.v v12, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re32.v v15, (a5)
-; ZIP-NEXT: vl1re32.v v12, (a6)
-; ZIP-NEXT: vl1re32.v v13, (a1)
+; ZIP-NEXT: vl1re64.v v13, (a6)
+; ZIP-NEXT: csrr a6, vlenb
+; ZIP-NEXT: li a7, 12
+; ZIP-NEXT: mul a6, a6, a7
+; ZIP-NEXT: add a6, sp, a6
+; ZIP-NEXT: addi a6, a6, 64
+; ZIP-NEXT: vl1re64.v v17, (a1)
+; ZIP-NEXT: vl1re64.v v10, (a4)
+; ZIP-NEXT: vl1re64.v v11, (a5)
+; ZIP-NEXT: vl1re64.v v8, (a0)
+; ZIP-NEXT: vl1re64.v v9, (a3)
; ZIP-NEXT: slli a2, a2, 3
-; ZIP-NEXT: add a2, a0, a2
-; ZIP-NEXT: vs2r.v v16, (a2)
-; ZIP-NEXT: vs8r.v v8, (a0)
-; ZIP-NEXT: vl8re32.v v16, (a2)
-; ZIP-NEXT: vl8re32.v v8, (a0)
+; ZIP-NEXT: add a2, a6, a2
+; ZIP-NEXT: vs4r.v v16, (a2)
+; ZIP-NEXT: vs8r.v v8, (a6)
+; ZIP-NEXT: vl8re64.v v16, (a2)
+; ZIP-NEXT: vl8re64.v v8, (a6)
; ZIP-NEXT: addi sp, s0, -80
; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZIP-NEXT: addi sp, sp, 80
; ZIP-NEXT: ret
- %res = call <vscale x 20 x float> @llvm.vector.interleave5.nxv20f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4)
- ret <vscale x 20 x float> %res
+ %res = call <vscale x 12 x double> @llvm.vector.interleave6.nxv12f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5)
+ ret <vscale x 12 x double> %res
+}
+
+define <vscale x 14 x half> @vector_interleave_nxv14f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5, <vscale x 2 x half> %v6) nounwind {
+; CHECK-LABEL: vector_interleave_nxv14f16_nxv2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 1
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: add a5, a4, a2
+; CHECK-NEXT: add a6, a5, a2
+; CHECK-NEXT: vsetvli a7, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsseg7e16.v v8, (a0)
+; CHECK-NEXT: add a7, a6, a2
+; CHECK-NEXT: vle16.v v8, (a7)
+; CHECK-NEXT: vle16.v v10, (a6)
+; CHECK-NEXT: add a6, a1, a1
+; CHECK-NEXT: add a2, a7, a2
+; CHECK-NEXT: vle16.v v12, (a5)
+; CHECK-NEXT: vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v10, v8, a1
+; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v11, (a2)
+; CHECK-NEXT: vle16.v v9, (a4)
+; CHECK-NEXT: vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v12, a1
+; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v12, (a3)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v12, a1
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv14f16_nxv2f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a2, a1, 1
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: add a4, a3, a2
+; ZVBB-NEXT: add a5, a4, a2
+; ZVBB-NEXT: add a6, a5, a2
+; ZVBB-NEXT: vsetvli a7, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vsseg7e16.v v8, (a0)
+; ZVBB-NEXT: add a7, a6, a2
+; ZVBB-NEXT: vle16.v v8, (a7)
+; ZVBB-NEXT: vle16.v v10, (a6)
+; ZVBB-NEXT: add a6, a1, a1
+; ZVBB-NEXT: add a2, a7, a2
+; ZVBB-NEXT: vle16.v v12, (a5)
+; ZVBB-NEXT: vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v10, v8, a1
+; ZVBB-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v11, (a2)
+; ZVBB-NEXT: vle16.v v9, (a4)
+; ZVBB-NEXT: vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v9, v12, a1
+; ZVBB-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v12, (a3)
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v12, a1
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 14 x half> @llvm.vector.interleave7.nxv14f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5, <vscale x 2 x half> %v6)
+ ret <vscale x 14 x half> %res
}
-define <vscale x 5 x double> @vector_interleave_nxv5f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4) nounwind {
-; CHECK-LABEL: vector_interleave_nxv5f64_nxv1f64:
+define <vscale x 28 x half> @vector_interleave_nxv28f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5, <vscale x 4 x half> %v6) nounwind {
+; CHECK-LABEL: vector_interleave_nxv28f16_nxv4f16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 2
-; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: slli a1, a0, 3
+; CHECK-NEXT: sub a0, a1, a0
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: add a2, a0, a1
; CHECK-NEXT: add a3, a2, a1
-; CHECK-NEXT: vsetvli a4, zero, e64, m1, ta, ma
-; CHECK-NEXT: vsseg5e64.v v8, (a0)
-; CHECK-NEXT: vl1re64.v v10, (a3)
+; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsseg7e16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v10, (a3)
; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: vl1re64.v v11, (a3)
-; CHECK-NEXT: vl1re64.v v8, (a0)
-; CHECK-NEXT: vl1re64.v v9, (a2)
-; CHECK-NEXT: add a1, a3, a1
-; CHECK-NEXT: vl1re64.v v12, (a1)
+; CHECK-NEXT: vl1re16.v v11, (a3)
+; CHECK-NEXT: add a3, a3, a1
+; CHECK-NEXT: vl1re16.v v8, (a0)
+; CHECK-NEXT: add a0, a3, a1
+; CHECK-NEXT: vl1re16.v v9, (a2)
+; CHECK-NEXT: vl1re16.v v12, (a3)
+; CHECK-NEXT: vl1re16.v v13, (a0)
+; CHECK-NEXT: add a0, a0, a1
+; CHECK-NEXT: vl1re16.v v14, (a0)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 2
-; CHECK-NEXT: add a0, a1, a0
+; CHECK-NEXT: slli a1, a0, 3
+; CHECK-NEXT: sub a0, a1, a0
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv5f64_nxv1f64:
+; ZVBB-LABEL: vector_interleave_nxv28f16_nxv4f16:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 2
-; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: slli a1, a0, 3
+; ZVBB-NEXT: sub a0, a1, a0
; ZVBB-NEXT: sub sp, sp, a0
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: add a2, a0, a1
; ZVBB-NEXT: add a3, a2, a1
-; ZVBB-NEXT: vsetvli a4, zero, e64, m1, ta, ma
-; ZVBB-NEXT: vsseg5e64.v v8, (a0)
-; ZVBB-NEXT: vl1re64.v v10, (a3)
+; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vsseg7e16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v10, (a3)
; ZVBB-NEXT: add a3, a3, a1
-; ZVBB-NEXT: vl1re64.v v11, (a3)
-; ZVBB-NEXT: vl1re64.v v8, (a0)
-; ZVBB-NEXT: vl1re64.v v9, (a2)
-; ZVBB-NEXT: add a1, a3, a1
-; ZVBB-NEXT: vl1re64.v v12, (a1)
+; ZVBB-NEXT: vl1re16.v v11, (a3)
+; ZVBB-NEXT: add a3, a3, a1
+; ZVBB-NEXT: vl1re16.v v8, (a0)
+; ZVBB-NEXT: add a0, a3, a1
+; ZVBB-NEXT: vl1re16.v v9, (a2)
+; ZVBB-NEXT: vl1re16.v v12, (a3)
+; ZVBB-NEXT: vl1re16.v v13, (a0)
+; ZVBB-NEXT: add a0, a0, a1
+; ZVBB-NEXT: vl1re16.v v14, (a0)
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 2
-; ZVBB-NEXT: add a0, a1, a0
+; ZVBB-NEXT: slli a1, a0, 3
+; ZVBB-NEXT: sub a0, a1, a0
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 5 x double> @llvm.vector.interleave5.nxv5f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4)
- ret <vscale x 5 x double> %res
+ %res = call <vscale x 28 x half> @llvm.vector.interleave7.nxv28f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5, <vscale x 4 x half> %v6)
+ ret <vscale x 28 x half> %res
}
-define <vscale x 10 x double> @vector_interleave_nxv10f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4) nounwind {
-; RV32-LABEL: vector_interleave_nxv10f64_nxv2f64:
+define <vscale x 56 x half> @vector_interleave_nxv56f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5, <vscale x 8 x half> %v6) nounwind {
+; RV32-LABEL: vector_interleave_nxv56f16_nxv8f16:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -80
; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
; RV32-NEXT: addi s0, sp, 80
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a1, 28
-; RV32-NEXT: mul a0, a0, a1
+; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: sub sp, sp, a0
; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT: vmv2r.v v20, v16
+; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vmv2r.v v26, v20
; RV32-NEXT: addi a0, sp, 64
-; RV32-NEXT: vmv2r.v v18, v12
+; RV32-NEXT: vmv2r.v v24, v16
; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a2, a1, 2
-; RV32-NEXT: add a1, a2, a1
+; RV32-NEXT: slli a2, a1, 3
+; RV32-NEXT: sub a1, a2, a1
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 64
+; RV32-NEXT: vmv2r.v v22, v12
; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: vmv2r.v v16, v8
-; RV32-NEXT: vmv2r.v v22, v16
-; RV32-NEXT: vmv2r.v v24, v18
-; RV32-NEXT: vmv1r.v v26, v20
+; RV32-NEXT: vmv2r.v v20, v8
+; RV32-NEXT: vmv1r.v v1, v20
+; RV32-NEXT: vmv1r.v v3, v22
+; RV32-NEXT: vmv1r.v v5, v24
+; RV32-NEXT: vmv1r.v v7, v26
; RV32-NEXT: add a3, a0, a2
-; RV32-NEXT: vmv1r.v v23, v10
+; RV32-NEXT: vmv1r.v v2, v10
; RV32-NEXT: add a4, a1, a2
-; RV32-NEXT: add a5, a4, a2
-; RV32-NEXT: vmv1r.v v25, v14
-; RV32-NEXT: add a6, a5, a2
-; RV32-NEXT: vmv1r.v v18, v11
-; RV32-NEXT: vsseg5e64.v v22, (a0)
-; RV32-NEXT: vmv1r.v v20, v15
-; RV32-NEXT: vsseg5e64.v v17, (a1)
-; RV32-NEXT: vl1re64.v v16, (a6)
+; RV32-NEXT: slli a5, a2, 2
+; RV32-NEXT: vmv1r.v v4, v14
+; RV32-NEXT: slli a6, a2, 4
+; RV32-NEXT: add a7, a4, a2
+; RV32-NEXT: vmv1r.v v6, v18
+; RV32-NEXT: sub a5, a6, a5
+; RV32-NEXT: vmv1r.v v22, v11
+; RV32-NEXT: add a6, a7, a2
+; RV32-NEXT: vmv1r.v v24, v15
+; RV32-NEXT: vsseg7e16.v v1, (a0)
+; RV32-NEXT: vmv1r.v v26, v19
+; RV32-NEXT: vsseg7e16.v v21, (a1)
+; RV32-NEXT: vl1re16.v v18, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re64.v v17, (a6)
+; RV32-NEXT: vl1re16.v v19, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v20, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v21, (a6)
; RV32-NEXT: add a6, a3, a2
-; RV32-NEXT: vl1re64.v v10, (a6)
+; RV32-NEXT: vl1re16.v v10, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re64.v v11, (a6)
-; RV32-NEXT: vl1re64.v v8, (a0)
-; RV32-NEXT: vl1re64.v v9, (a3)
-; RV32-NEXT: vl1re64.v v14, (a4)
+; RV32-NEXT: vl1re16.v v11, (a6)
+; RV32-NEXT: vl1re16.v v8, (a0)
+; RV32-NEXT: vl1re16.v v16, (a4)
+; RV32-NEXT: vl1re16.v v9, (a3)
+; RV32-NEXT: vl1re16.v v17, (a7)
; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a3, 10
+; RV32-NEXT: li a3, 14
; RV32-NEXT: mul a0, a0, a3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 64
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re64.v v15, (a5)
-; RV32-NEXT: vl1re64.v v12, (a6)
-; RV32-NEXT: vl1re64.v v13, (a1)
+; RV32-NEXT: vl1re16.v v12, (a6)
+; RV32-NEXT: add a6, a6, a2
+; RV32-NEXT: vl1re16.v v13, (a6)
+; RV32-NEXT: add a6, a6, a2
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, a0, a2
-; RV32-NEXT: vs2r.v v16, (a2)
+; RV32-NEXT: vl1re16.v v14, (a6)
+; RV32-NEXT: vl1re16.v v15, (a1)
+; RV32-NEXT: add a5, a0, a5
+; RV32-NEXT: vs2r.v v20, (a5)
+; RV32-NEXT: vs4r.v v16, (a2)
; RV32-NEXT: vs8r.v v8, (a0)
-; RV32-NEXT: vl8re64.v v16, (a2)
-; RV32-NEXT: vl8re64.v v8, (a0)
+; RV32-NEXT: vl8re16.v v16, (a2)
+; RV32-NEXT: vl8re16.v v8, (a0)
; RV32-NEXT: addi sp, s0, -80
; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 80
; RV32-NEXT: ret
;
-; RV64-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; RV64-LABEL: vector_interleave_nxv56f16_nxv8f16:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -80
; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; RV64-NEXT: addi s0, sp, 80
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: li a1, 28
-; RV64-NEXT: mul a0, a0, a1
+; RV64-NEXT: slli a0, a0, 5
; RV64-NEXT: sub sp, sp, a0
; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT: vmv2r.v v20, v16
+; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vmv2r.v v26, v20
; RV64-NEXT: addi a0, sp, 64
-; RV64-NEXT: vmv2r.v v18, v12
+; RV64-NEXT: vmv2r.v v24, v16
; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a2, a1, 2
-; RV64-NEXT: add a1, a2, a1
+; RV64-NEXT: slli a2, a1, 3
+; RV64-NEXT: sub a1, a2, a1
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 64
+; RV64-NEXT: vmv2r.v v22, v12
; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: vmv2r.v v16, v8
-; RV64-NEXT: vmv2r.v v22, v16
-; RV64-NEXT: vmv2r.v v24, v18
-; RV64-NEXT: vmv1r.v v26, v20
+; RV64-NEXT: vmv2r.v v20, v8
+; RV64-NEXT: vmv1r.v v1, v20
+; RV64-NEXT: vmv1r.v v3, v22
+; RV64-NEXT: vmv1r.v v5, v24
+; RV64-NEXT: vmv1r.v v7, v26
; RV64-NEXT: add a3, a0, a2
-; RV64-NEXT: vmv1r.v v23, v10
+; RV64-NEXT: vmv1r.v v2, v10
; RV64-NEXT: add a4, a1, a2
-; RV64-NEXT: add a5, a4, a2
-; RV64-NEXT: vmv1r.v v25, v14
-; RV64-NEXT: add a6, a5, a2
-; RV64-NEXT: vmv1r.v v18, v11
-; RV64-NEXT: vsseg5e64.v v22, (a0)
-; RV64-NEXT: vmv1r.v v20, v15
-; RV64-NEXT: vsseg5e64.v v17, (a1)
-; RV64-NEXT: vl1re64.v v16, (a6)
+; RV64-NEXT: slli a5, a2, 2
+; RV64-NEXT: vmv1r.v v4, v14
+; RV64-NEXT: slli a6, a2, 4
+; RV64-NEXT: add a7, a4, a2
+; RV64-NEXT: vmv1r.v v6, v18
+; RV64-NEXT: sub a5, a6, a5
+; RV64-NEXT: vmv1r.v v22, v11
+; RV64-NEXT: add a6, a7, a2
+; RV64-NEXT: vmv1r.v v24, v15
+; RV64-NEXT: vsseg7e16.v v1, (a0)
+; RV64-NEXT: vmv1r.v v26, v19
+; RV64-NEXT: vsseg7e16.v v21, (a1)
+; RV64-NEXT: vl1re16.v v18, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re64.v v17, (a6)
+; RV64-NEXT: vl1re16.v v19, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v20, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v21, (a6)
; RV64-NEXT: add a6, a3, a2
-; RV64-NEXT: vl1re64.v v10, (a6)
+; RV64-NEXT: vl1re16.v v10, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re64.v v11, (a6)
-; RV64-NEXT: vl1re64.v v8, (a0)
-; RV64-NEXT: vl1re64.v v9, (a3)
-; RV64-NEXT: vl1re64.v v14, (a4)
+; RV64-NEXT: vl1re16.v v11, (a6)
+; RV64-NEXT: vl1re16.v v8, (a0)
+; RV64-NEXT: vl1re16.v v16, (a4)
+; RV64-NEXT: vl1re16.v v9, (a3)
+; RV64-NEXT: vl1re16.v v17, (a7)
; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: li a3, 10
+; RV64-NEXT: li a3, 14
; RV64-NEXT: mul a0, a0, a3
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 64
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re64.v v15, (a5)
-; RV64-NEXT: vl1re64.v v12, (a6)
-; RV64-NEXT: vl1re64.v v13, (a1)
+; RV64-NEXT: vl1re16.v v12, (a6)
+; RV64-NEXT: add a6, a6, a2
+; RV64-NEXT: vl1re16.v v13, (a6)
+; RV64-NEXT: add a6, a6, a2
; RV64-NEXT: slli a2, a2, 3
; RV64-NEXT: add a2, a0, a2
-; RV64-NEXT: vs2r.v v16, (a2)
+; RV64-NEXT: vl1re16.v v14, (a6)
+; RV64-NEXT: vl1re16.v v15, (a1)
+; RV64-NEXT: add a5, a0, a5
+; RV64-NEXT: vs2r.v v20, (a5)
+; RV64-NEXT: vs4r.v v16, (a2)
; RV64-NEXT: vs8r.v v8, (a0)
-; RV64-NEXT: vl8re64.v v16, (a2)
-; RV64-NEXT: vl8re64.v v8, (a0)
+; RV64-NEXT: vl8re16.v v16, (a2)
+; RV64-NEXT: vl8re16.v v8, (a0)
; RV64-NEXT: addi sp, s0, -80
; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: ret
;
-; ZVBB-RV32-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; ZVBB-RV32-LABEL: vector_interleave_nxv56f16_nxv8f16:
; ZVBB-RV32: # %bb.0:
; ZVBB-RV32-NEXT: addi sp, sp, -80
; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
; ZVBB-RV32-NEXT: addi s0, sp, 80
; ZVBB-RV32-NEXT: csrr a0, vlenb
-; ZVBB-RV32-NEXT: li a1, 28
-; ZVBB-RV32-NEXT: mul a0, a0, a1
+; ZVBB-RV32-NEXT: slli a0, a0, 5
; ZVBB-RV32-NEXT: sub sp, sp, a0
; ZVBB-RV32-NEXT: andi sp, sp, -64
-; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; ZVBB-RV32-NEXT: vmv2r.v v20, v16
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV32-NEXT: vmv2r.v v26, v20
; ZVBB-RV32-NEXT: addi a0, sp, 64
-; ZVBB-RV32-NEXT: vmv2r.v v18, v12
+; ZVBB-RV32-NEXT: vmv2r.v v24, v16
; ZVBB-RV32-NEXT: csrr a1, vlenb
-; ZVBB-RV32-NEXT: slli a2, a1, 2
-; ZVBB-RV32-NEXT: add a1, a2, a1
+; ZVBB-RV32-NEXT: slli a2, a1, 3
+; ZVBB-RV32-NEXT: sub a1, a2, a1
; ZVBB-RV32-NEXT: add a1, sp, a1
; ZVBB-RV32-NEXT: addi a1, a1, 64
+; ZVBB-RV32-NEXT: vmv2r.v v22, v12
; ZVBB-RV32-NEXT: csrr a2, vlenb
-; ZVBB-RV32-NEXT: vmv2r.v v16, v8
-; ZVBB-RV32-NEXT: vmv2r.v v22, v16
-; ZVBB-RV32-NEXT: vmv2r.v v24, v18
-; ZVBB-RV32-NEXT: vmv1r.v v26, v20
+; ZVBB-RV32-NEXT: vmv2r.v v20, v8
+; ZVBB-RV32-NEXT: vmv1r.v v1, v20
+; ZVBB-RV32-NEXT: vmv1r.v v3, v22
+; ZVBB-RV32-NEXT: vmv1r.v v5, v24
+; ZVBB-RV32-NEXT: vmv1r.v v7, v26
; ZVBB-RV32-NEXT: add a3, a0, a2
-; ZVBB-RV32-NEXT: vmv1r.v v23, v10
+; ZVBB-RV32-NEXT: vmv1r.v v2, v10
; ZVBB-RV32-NEXT: add a4, a1, a2
-; ZVBB-RV32-NEXT: add a5, a4, a2
-; ZVBB-RV32-NEXT: vmv1r.v v25, v14
-; ZVBB-RV32-NEXT: add a6, a5, a2
-; ZVBB-RV32-NEXT: vmv1r.v v18, v11
-; ZVBB-RV32-NEXT: vsseg5e64.v v22, (a0)
-; ZVBB-RV32-NEXT: vmv1r.v v20, v15
-; ZVBB-RV32-NEXT: vsseg5e64.v v17, (a1)
-; ZVBB-RV32-NEXT: vl1re64.v v16, (a6)
+; ZVBB-RV32-NEXT: slli a5, a2, 2
+; ZVBB-RV32-NEXT: vmv1r.v v4, v14
+; ZVBB-RV32-NEXT: slli a6, a2, 4
+; ZVBB-RV32-NEXT: add a7, a4, a2
+; ZVBB-RV32-NEXT: vmv1r.v v6, v18
+; ZVBB-RV32-NEXT: sub a5, a6, a5
+; ZVBB-RV32-NEXT: vmv1r.v v22, v11
+; ZVBB-RV32-NEXT: add a6, a7, a2
+; ZVBB-RV32-NEXT: vmv1r.v v24, v15
+; ZVBB-RV32-NEXT: vsseg7e16.v v1, (a0)
+; ZVBB-RV32-NEXT: vmv1r.v v26, v19
+; ZVBB-RV32-NEXT: vsseg7e16.v v21, (a1)
+; ZVBB-RV32-NEXT: vl1re16.v v18, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re64.v v17, (a6)
+; ZVBB-RV32-NEXT: vl1re16.v v19, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v20, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v21, (a6)
; ZVBB-RV32-NEXT: add a6, a3, a2
-; ZVBB-RV32-NEXT: vl1re64.v v10, (a6)
+; ZVBB-RV32-NEXT: vl1re16.v v10, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re64.v v11, (a6)
-; ZVBB-RV32-NEXT: vl1re64.v v8, (a0)
-; ZVBB-RV32-NEXT: vl1re64.v v9, (a3)
-; ZVBB-RV32-NEXT: vl1re64.v v14, (a4)
+; ZVBB-RV32-NEXT: vl1re16.v v11, (a6)
+; ZVBB-RV32-NEXT: vl1re16.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re16.v v16, (a4)
+; ZVBB-RV32-NEXT: vl1re16.v v9, (a3)
+; ZVBB-RV32-NEXT: vl1re16.v v17, (a7)
; ZVBB-RV32-NEXT: csrr a0, vlenb
-; ZVBB-RV32-NEXT: li a3, 10
+; ZVBB-RV32-NEXT: li a3, 14
; ZVBB-RV32-NEXT: mul a0, a0, a3
; ZVBB-RV32-NEXT: add a0, sp, a0
; ZVBB-RV32-NEXT: addi a0, a0, 64
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re64.v v15, (a5)
-; ZVBB-RV32-NEXT: vl1re64.v v12, (a6)
-; ZVBB-RV32-NEXT: vl1re64.v v13, (a1)
+; ZVBB-RV32-NEXT: vl1re16.v v12, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
+; ZVBB-RV32-NEXT: vl1re16.v v13, (a6)
+; ZVBB-RV32-NEXT: add a6, a6, a2
; ZVBB-RV32-NEXT: slli a2, a2, 3
; ZVBB-RV32-NEXT: add a2, a0, a2
-; ZVBB-RV32-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV32-NEXT: vl1re16.v v14, (a6)
+; ZVBB-RV32-NEXT: vl1re16.v v15, (a1)
+; ZVBB-RV32-NEXT: add a5, a0, a5
+; ZVBB-RV32-NEXT: vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT: vs4r.v v16, (a2)
; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT: vl8re64.v v16, (a2)
-; ZVBB-RV32-NEXT: vl8re64.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8re16.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re16.v v8, (a0)
; ZVBB-RV32-NEXT: addi sp, s0, -80
; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: addi sp, sp, 80
; ZVBB-RV32-NEXT: ret
;
-; ZVBB-RV64-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; ZVBB-RV64-LABEL: vector_interleave_nxv56f16_nxv8f16:
; ZVBB-RV64: # %bb.0:
; ZVBB-RV64-NEXT: addi sp, sp, -80
; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; ZVBB-RV64-NEXT: addi s0, sp, 80
; ZVBB-RV64-NEXT: csrr a0, vlenb
-; ZVBB-RV64-NEXT: li a1, 28
-; ZVBB-RV64-NEXT: mul a0, a0, a1
+; ZVBB-RV64-NEXT: slli a0, a0, 5
; ZVBB-RV64-NEXT: sub sp, sp, a0
; ZVBB-RV64-NEXT: andi sp, sp, -64
-; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; ZVBB-RV64-NEXT: vmv2r.v v20, v16
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV64-NEXT: vmv2r.v v26, v20
; ZVBB-RV64-NEXT: addi a0, sp, 64
-; ZVBB-RV64-NEXT: vmv2r.v v18, v12
+; ZVBB-RV64-NEXT: vmv2r.v v24, v16
; ZVBB-RV64-NEXT: csrr a1, vlenb
-; ZVBB-RV64-NEXT: slli a2, a1, 2
-; ZVBB-RV64-NEXT: add a1, a2, a1
+; ZVBB-RV64-NEXT: slli a2, a1, 3
+; ZVBB-RV64-NEXT: sub a1, a2, a1
; ZVBB-RV64-NEXT: add a1, sp, a1
; ZVBB-RV64-NEXT: addi a1, a1, 64
+; ZVBB-RV64-NEXT: vmv2r.v v22, v12
; ZVBB-RV64-NEXT: csrr a2, vlenb
-; ZVBB-RV64-NEXT: vmv2r.v v16, v8
-; ZVBB-RV64-NEXT: vmv2r.v v22, v16
-; ZVBB-RV64-NEXT: vmv2r.v v24, v18
-; ZVBB-RV64-NEXT: vmv1r.v v26, v20
+; ZVBB-RV64-NEXT: vmv2r.v v20, v8
+; ZVBB-RV64-NEXT: vmv1r.v v1, v20
+; ZVBB-RV64-NEXT: vmv1r.v v3, v22
+; ZVBB-RV64-NEXT: vmv1r.v v5, v24
+; ZVBB-RV64-NEXT: vmv1r.v v7, v26
; ZVBB-RV64-NEXT: add a3, a0, a2
-; ZVBB-RV64-NEXT: vmv1r.v v23, v10
+; ZVBB-RV64-NEXT: vmv1r.v v2, v10
; ZVBB-RV64-NEXT: add a4, a1, a2
-; ZVBB-RV64-NEXT: add a5, a4, a2
-; ZVBB-RV64-NEXT: vmv1r.v v25, v14
-; ZVBB-RV64-NEXT: add a6, a5, a2
-; ZVBB-RV64-NEXT: vmv1r.v v18, v11
-; ZVBB-RV64-NEXT: vsseg5e64.v v22, (a0)
-; ZVBB-RV64-NEXT: vmv1r.v v20, v15
-; ZVBB-RV64-NEXT: vsseg5e64.v v17, (a1)
-; ZVBB-RV64-NEXT: vl1re64.v v16, (a6)
+; ZVBB-RV64-NEXT: slli a5, a2, 2
+; ZVBB-RV64-NEXT: vmv1r.v v4, v14
+; ZVBB-RV64-NEXT: slli a6, a2, 4
+; ZVBB-RV64-NEXT: add a7, a4, a2
+; ZVBB-RV64-NEXT: vmv1r.v v6, v18
+; ZVBB-RV64-NEXT: sub a5, a6, a5
+; ZVBB-RV64-NEXT: vmv1r.v v22, v11
+; ZVBB-RV64-NEXT: add a6, a7, a2
+; ZVBB-RV64-NEXT: vmv1r.v v24, v15
+; ZVBB-RV64-NEXT: vsseg7e16.v v1, (a0)
+; ZVBB-RV64-NEXT: vmv1r.v v26, v19
+; ZVBB-RV64-NEXT: vsseg7e16.v v21, (a1)
+; ZVBB-RV64-NEXT: vl1re16.v v18, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re64.v v17, (a6)
+; ZVBB-RV64-NEXT: vl1re16.v v19, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v20, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v21, (a6)
; ZVBB-RV64-NEXT: add a6, a3, a2
-; ZVBB-RV64-NEXT: vl1re64.v v10, (a6)
+; ZVBB-RV64-NEXT: vl1re16.v v10, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re64.v v11, (a6)
-; ZVBB-RV64-NEXT: vl1re64.v v8, (a0)
-; ZVBB-RV64-NEXT: vl1re64.v v9, (a3)
-; ZVBB-RV64-NEXT: vl1re64.v v14, (a4)
+; ZVBB-RV64-NEXT: vl1re16.v v11, (a6)
+; ZVBB-RV64-NEXT: vl1re16.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re16.v v16, (a4)
+; ZVBB-RV64-NEXT: vl1re16.v v9, (a3)
+; ZVBB-RV64-NEXT: vl1re16.v v17, (a7)
; ZVBB-RV64-NEXT: csrr a0, vlenb
-; ZVBB-RV64-NEXT: li a3, 10
+; ZVBB-RV64-NEXT: li a3, 14
; ZVBB-RV64-NEXT: mul a0, a0, a3
; ZVBB-RV64-NEXT: add a0, sp, a0
; ZVBB-RV64-NEXT: addi a0, a0, 64
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re64.v v15, (a5)
-; ZVBB-RV64-NEXT: vl1re64.v v12, (a6)
-; ZVBB-RV64-NEXT: vl1re64.v v13, (a1)
+; ZVBB-RV64-NEXT: vl1re16.v v12, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
+; ZVBB-RV64-NEXT: vl1re16.v v13, (a6)
+; ZVBB-RV64-NEXT: add a6, a6, a2
; ZVBB-RV64-NEXT: slli a2, a2, 3
; ZVBB-RV64-NEXT: add a2, a0, a2
-; ZVBB-RV64-NEXT: vs2r.v v16, (a2)
+; ZVBB-RV64-NEXT: vl1re16.v v14, (a6)
+; ZVBB-RV64-NEXT: vl1re16.v v15, (a1)
+; ZVBB-RV64-NEXT: add a5, a0, a5
+; ZVBB-RV64-NEXT: vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT: vs4r.v v16, (a2)
; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT: vl8re64.v v16, (a2)
-; ZVBB-RV64-NEXT: vl8re64.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8re16.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re16.v v8, (a0)
; ZVBB-RV64-NEXT: addi sp, s0, -80
; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: addi sp, sp, 80
; ZVBB-RV64-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; ZIP-LABEL: vector_interleave_nxv56f16_nxv8f16:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -80
; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
; ZIP-NEXT: addi s0, sp, 80
; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: li a1, 28
-; ZIP-NEXT: mul a0, a0, a1
+; ZIP-NEXT: slli a0, a0, 5
; ZIP-NEXT: sub sp, sp, a0
; ZIP-NEXT: andi sp, sp, -64
-; ZIP-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; ZIP-NEXT: vmv2r.v v20, v16
+; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: vmv2r.v v26, v20
; ZIP-NEXT: addi a0, sp, 64
-; ZIP-NEXT: vmv2r.v v18, v12
+; ZIP-NEXT: vmv2r.v v24, v16
; ZIP-NEXT: csrr a1, vlenb
-; ZIP-NEXT: slli a2, a1, 2
-; ZIP-NEXT: add a1, a2, a1
+; ZIP-NEXT: slli a2, a1, 3
+; ZIP-NEXT: sub a1, a2, a1
; ZIP-NEXT: add a1, sp, a1
; ZIP-NEXT: addi a1, a1, 64
+; ZIP-NEXT: vmv2r.v v22, v12
; ZIP-NEXT: csrr a2, vlenb
-; ZIP-NEXT: vmv2r.v v16, v8
-; ZIP-NEXT: vmv2r.v v22, v16
-; ZIP-NEXT: vmv2r.v v24, v18
-; ZIP-NEXT: vmv1r.v v26, v20
+; ZIP-NEXT: vmv2r.v v20, v8
+; ZIP-NEXT: vmv1r.v v1, v20
+; ZIP-NEXT: vmv1r.v v3, v22
+; ZIP-NEXT: vmv1r.v v5, v24
+; ZIP-NEXT: vmv1r.v v7, v26
; ZIP-NEXT: add a3, a0, a2
-; ZIP-NEXT: vmv1r.v v23, v10
+; ZIP-NEXT: vmv1r.v v2, v10
; ZIP-NEXT: add a4, a1, a2
-; ZIP-NEXT: add a5, a4, a2
-; ZIP-NEXT: vmv1r.v v25, v14
-; ZIP-NEXT: add a6, a5, a2
-; ZIP-NEXT: vmv1r.v v18, v11
-; ZIP-NEXT: vsseg5e64.v v22, (a0)
-; ZIP-NEXT: vmv1r.v v20, v15
-; ZIP-NEXT: vsseg5e64.v v17, (a1)
-; ZIP-NEXT: vl1re64.v v16, (a6)
+; ZIP-NEXT: slli a5, a2, 2
+; ZIP-NEXT: vmv1r.v v4, v14
+; ZIP-NEXT: slli a6, a2, 4
+; ZIP-NEXT: add a7, a4, a2
+; ZIP-NEXT: vmv1r.v v6, v18
+; ZIP-NEXT: sub a5, a6, a5
+; ZIP-NEXT: vmv1r.v v22, v11
+; ZIP-NEXT: add a6, a7, a2
+; ZIP-NEXT: vmv1r.v v24, v15
+; ZIP-NEXT: vsseg7e16.v v1, (a0)
+; ZIP-NEXT: vmv1r.v v26, v19
+; ZIP-NEXT: vsseg7e16.v v21, (a1)
+; ZIP-NEXT: vl1re16.v v18, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re64.v v17, (a6)
+; ZIP-NEXT: vl1re16.v v19, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v20, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v21, (a6)
; ZIP-NEXT: add a6, a3, a2
-; ZIP-NEXT: vl1re64.v v10, (a6)
+; ZIP-NEXT: vl1re16.v v10, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re64.v v11, (a6)
-; ZIP-NEXT: vl1re64.v v8, (a0)
-; ZIP-NEXT: vl1re64.v v9, (a3)
-; ZIP-NEXT: vl1re64.v v14, (a4)
+; ZIP-NEXT: vl1re16.v v11, (a6)
+; ZIP-NEXT: vl1re16.v v8, (a0)
+; ZIP-NEXT: vl1re16.v v16, (a4)
+; ZIP-NEXT: vl1re16.v v9, (a3)
+; ZIP-NEXT: vl1re16.v v17, (a7)
; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: li a3, 10
+; ZIP-NEXT: li a3, 14
; ZIP-NEXT: mul a0, a0, a3
; ZIP-NEXT: add a0, sp, a0
; ZIP-NEXT: addi a0, a0, 64
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re64.v v15, (a5)
-; ZIP-NEXT: vl1re64.v v12, (a6)
-; ZIP-NEXT: vl1re64.v v13, (a1)
+; ZIP-NEXT: vl1re16.v v12, (a6)
+; ZIP-NEXT: add a6, a6, a2
+; ZIP-NEXT: vl1re16.v v13, (a6)
+; ZIP-NEXT: add a6, a6, a2
; ZIP-NEXT: slli a2, a2, 3
; ZIP-NEXT: add a2, a0, a2
-; ZIP-NEXT: vs2r.v v16, (a2)
+; ZIP-NEXT: vl1re16.v v14, (a6)
+; ZIP-NEXT: vl1re16.v v15, (a1)
+; ZIP-NEXT: add a5, a0, a5
+; ZIP-NEXT: vs2r.v v20, (a5)
+; ZIP-NEXT: vs4r.v v16, (a2)
; ZIP-NEXT: vs8r.v v8, (a0)
-; ZIP-NEXT: vl8re64.v v16, (a2)
-; ZIP-NEXT: vl8re64.v v8, (a0)
+; ZIP-NEXT: vl8re16.v v16, (a2)
+; ZIP-NEXT: vl8re16.v v8, (a0)
; ZIP-NEXT: addi sp, s0, -80
; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZIP-NEXT: addi sp, sp, 80
; ZIP-NEXT: ret
- %res = call <vscale x 10 x double> @llvm.vector.interleave5.nxv10f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4)
- ret <vscale x 10 x double> %res
+ %res = call <vscale x 56 x half> @llvm.vector.interleave7.nxv56f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5, <vscale x 8 x half> %v6)
+ ret <vscale x 56 x half> %res
}
-define <vscale x 14 x half> @vector_interleave_nxv14f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5, <vscale x 2 x half> %v6) nounwind {
-; CHECK-LABEL: vector_interleave_nxv14f16_nxv2f16:
+define <vscale x 14 x bfloat> @vector_interleave_nxv14bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5, <vscale x 2 x bfloat> %v6) nounwind {
+; CHECK-LABEL: vector_interleave_nxv14bf16_nxv2bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
@@ -6788,7 +12356,7 @@ define <vscale x 14 x half> @vector_interleave_nxv14f16_nxv2f16(<vscale x 2 x ha
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv14f16_nxv2f16:
+; ZVBB-LABEL: vector_interleave_nxv14bf16_nxv2bf16:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
@@ -6826,13 +12394,13 @@ define <vscale x 14 x half> @vector_interleave_nxv14f16_nxv2f16(<vscale x 2 x ha
; ZVBB-NEXT: slli a0, a0, 2
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: ret
- %res = call <vscale x 14 x half> @llvm.vector.interleave7.nxv14f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5, <vscale x 2 x half> %v6)
- ret <vscale x 14 x half> %res
+; ZVBB-NEXT: ret
+ %res = call <vscale x 14 x bfloat> @llvm.vector.interleave7.nxv14bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5, <vscale x 2 x bfloat> %v6)
+ ret <vscale x 14 x bfloat> %res
}
-define <vscale x 28 x half> @vector_interleave_nxv28f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5, <vscale x 4 x half> %v6) nounwind {
-; CHECK-LABEL: vector_interleave_nxv28f16_nxv4f16:
+define <vscale x 28 x bfloat> @vector_interleave_nxv28bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5, <vscale x 4 x bfloat> %v6) nounwind {
+; CHECK-LABEL: vector_interleave_nxv28bf16_nxv4bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
@@ -6863,7 +12431,7 @@ define <vscale x 28 x half> @vector_interleave_nxv28f16_nxv4f16(<vscale x 4 x ha
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv28f16_nxv4f16:
+; ZVBB-LABEL: vector_interleave_nxv28bf16_nxv4bf16:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
@@ -6893,12 +12461,12 @@ define <vscale x 28 x half> @vector_interleave_nxv28f16_nxv4f16(<vscale x 4 x ha
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 28 x half> @llvm.vector.interleave7.nxv28f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5, <vscale x 4 x half> %v6)
- ret <vscale x 28 x half> %res
+ %res = call <vscale x 28 x bfloat> @llvm.vector.interleave7.nxv28bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5, <vscale x 4 x bfloat> %v6)
+ ret <vscale x 28 x bfloat> %res
}
-define <vscale x 56 x half> @vector_interleave_nxv56f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5, <vscale x 8 x half> %v6) nounwind {
-; RV32-LABEL: vector_interleave_nxv56f16_nxv8f16:
+define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5, <vscale x 8 x bfloat> %v6) nounwind {
+; RV32-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -80
; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
@@ -6980,7 +12548,7 @@ define <vscale x 56 x half> @vector_interleave_nxv56f16_nxv8f16(<vscale x 8 x ha
; RV32-NEXT: addi sp, sp, 80
; RV32-NEXT: ret
;
-; RV64-LABEL: vector_interleave_nxv56f16_nxv8f16:
+; RV64-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -80
; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -7062,7 +12630,7 @@ define <vscale x 56 x half> @vector_interleave_nxv56f16_nxv8f16(<vscale x 8 x ha
; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: ret
;
-; ZVBB-RV32-LABEL: vector_interleave_nxv56f16_nxv8f16:
+; ZVBB-RV32-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
; ZVBB-RV32: # %bb.0:
; ZVBB-RV32-NEXT: addi sp, sp, -80
; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
@@ -7144,7 +12712,7 @@ define <vscale x 56 x half> @vector_interleave_nxv56f16_nxv8f16(<vscale x 8 x ha
; ZVBB-RV32-NEXT: addi sp, sp, 80
; ZVBB-RV32-NEXT: ret
;
-; ZVBB-RV64-LABEL: vector_interleave_nxv56f16_nxv8f16:
+; ZVBB-RV64-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
; ZVBB-RV64: # %bb.0:
; ZVBB-RV64-NEXT: addi sp, sp, -80
; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -7226,7 +12794,7 @@ define <vscale x 56 x half> @vector_interleave_nxv56f16_nxv8f16(<vscale x 8 x ha
; ZVBB-RV64-NEXT: addi sp, sp, 80
; ZVBB-RV64-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv56f16_nxv8f16:
+; ZIP-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -80
; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -7307,12 +12875,12 @@ define <vscale x 56 x half> @vector_interleave_nxv56f16_nxv8f16(<vscale x 8 x ha
; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZIP-NEXT: addi sp, sp, 80
; ZIP-NEXT: ret
- %res = call <vscale x 56 x half> @llvm.vector.interleave7.nxv56f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5, <vscale x 8 x half> %v6)
- ret <vscale x 56 x half> %res
+ %res = call <vscale x 56 x bfloat> @llvm.vector.interleave7.nxv56bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5, <vscale x 8 x bfloat> %v6)
+ ret <vscale x 56 x bfloat> %res
}
-define <vscale x 14 x bfloat> @vector_interleave_nxv14bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5, <vscale x 2 x bfloat> %v6) nounwind {
-; CHECK-LABEL: vector_interleave_nxv14bf16_nxv2bf16:
+define <vscale x 7 x float> @vector_interleave_nxv7f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5, <vscale x 1 x float> %v6) nounwind {
+; CHECK-LABEL: vector_interleave_nxv7f32_nxv1f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
@@ -7321,30 +12889,30 @@ define <vscale x 14 x bfloat> @vector_interleave_nxv14bf16_nxv2bf16(<vscale x 2
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a2, a1, 1
-; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: srli a1, a1, 3
; CHECK-NEXT: add a3, a0, a2
; CHECK-NEXT: add a4, a3, a2
; CHECK-NEXT: add a5, a4, a2
; CHECK-NEXT: add a6, a5, a2
-; CHECK-NEXT: vsetvli a7, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vsseg7e16.v v8, (a0)
+; CHECK-NEXT: vsetvli a7, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsseg7e32.v v8, (a0)
; CHECK-NEXT: add a7, a6, a2
-; CHECK-NEXT: vle16.v v8, (a7)
-; CHECK-NEXT: vle16.v v10, (a6)
+; CHECK-NEXT: vle32.v v8, (a7)
+; CHECK-NEXT: vle32.v v10, (a6)
; CHECK-NEXT: add a6, a1, a1
; CHECK-NEXT: add a2, a7, a2
-; CHECK-NEXT: vle16.v v12, (a5)
-; CHECK-NEXT: vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT: vle32.v v12, (a5)
+; CHECK-NEXT: vsetvli zero, a6, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v10, v8, a1
-; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vle16.v v11, (a2)
-; CHECK-NEXT: vle16.v v9, (a4)
-; CHECK-NEXT: vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vle32.v v11, (a2)
+; CHECK-NEXT: vle32.v v9, (a4)
+; CHECK-NEXT: vsetvli zero, a6, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v9, v12, a1
-; CHECK-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
-; CHECK-NEXT: vle16.v v12, (a3)
-; CHECK-NEXT: vle16.v v8, (a0)
-; CHECK-NEXT: vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vle32.v v12, (a3)
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vsetvli zero, a6, e32, m1, ta, ma
; CHECK-NEXT: vslideup.vx v8, v12, a1
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 2
@@ -7352,7 +12920,7 @@ define <vscale x 14 x bfloat> @vector_interleave_nxv14bf16_nxv2bf16(<vscale x 2
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv14bf16_nxv2bf16:
+; ZVBB-LABEL: vector_interleave_nxv7f32_nxv1f32:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
@@ -7361,42 +12929,42 @@ define <vscale x 14 x bfloat> @vector_interleave_nxv14bf16_nxv2bf16(<vscale x 2
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: srli a2, a1, 1
-; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: srli a1, a1, 3
; ZVBB-NEXT: add a3, a0, a2
; ZVBB-NEXT: add a4, a3, a2
; ZVBB-NEXT: add a5, a4, a2
; ZVBB-NEXT: add a6, a5, a2
-; ZVBB-NEXT: vsetvli a7, zero, e16, mf2, ta, ma
-; ZVBB-NEXT: vsseg7e16.v v8, (a0)
+; ZVBB-NEXT: vsetvli a7, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vsseg7e32.v v8, (a0)
; ZVBB-NEXT: add a7, a6, a2
-; ZVBB-NEXT: vle16.v v8, (a7)
-; ZVBB-NEXT: vle16.v v10, (a6)
+; ZVBB-NEXT: vle32.v v8, (a7)
+; ZVBB-NEXT: vle32.v v10, (a6)
; ZVBB-NEXT: add a6, a1, a1
; ZVBB-NEXT: add a2, a7, a2
-; ZVBB-NEXT: vle16.v v12, (a5)
-; ZVBB-NEXT: vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT: vle32.v v12, (a5)
+; ZVBB-NEXT: vsetvli zero, a6, e32, m1, ta, ma
; ZVBB-NEXT: vslideup.vx v10, v8, a1
-; ZVBB-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
-; ZVBB-NEXT: vle16.v v11, (a2)
-; ZVBB-NEXT: vle16.v v9, (a4)
-; ZVBB-NEXT: vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vle32.v v11, (a2)
+; ZVBB-NEXT: vle32.v v9, (a4)
+; ZVBB-NEXT: vsetvli zero, a6, e32, m1, ta, ma
; ZVBB-NEXT: vslideup.vx v9, v12, a1
-; ZVBB-NEXT: vsetvli a2, zero, e16, mf2, ta, ma
-; ZVBB-NEXT: vle16.v v12, (a3)
-; ZVBB-NEXT: vle16.v v8, (a0)
-; ZVBB-NEXT: vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vle32.v v12, (a3)
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: vsetvli zero, a6, e32, m1, ta, ma
; ZVBB-NEXT: vslideup.vx v8, v12, a1
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 2
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 14 x bfloat> @llvm.vector.interleave7.nxv14bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5, <vscale x 2 x bfloat> %v6)
- ret <vscale x 14 x bfloat> %res
+ %res = call <vscale x 7 x float> @llvm.vector.interleave7.nxv7f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5, <vscale x 1 x float> %v6)
+ ret <vscale x 7 x float> %res
}
-define <vscale x 28 x bfloat> @vector_interleave_nxv28bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5, <vscale x 4 x bfloat> %v6) nounwind {
-; CHECK-LABEL: vector_interleave_nxv28bf16_nxv4bf16:
+define <vscale x 14 x float> @vector_interleave_nxv14f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5, <vscale x 2 x float> %v6) nounwind {
+; CHECK-LABEL: vector_interleave_nxv14f32_nxv2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
@@ -7407,19 +12975,19 @@ define <vscale x 28 x bfloat> @vector_interleave_nxv28bf16_nxv4bf16(<vscale x 4
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: add a2, a0, a1
; CHECK-NEXT: add a3, a2, a1
-; CHECK-NEXT: vsetvli a4, zero, e16, m1, ta, ma
-; CHECK-NEXT: vsseg7e16.v v8, (a0)
-; CHECK-NEXT: vl1re16.v v10, (a3)
+; CHECK-NEXT: vsetvli a4, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg7e32.v v8, (a0)
+; CHECK-NEXT: vl1re32.v v10, (a3)
; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: vl1re16.v v11, (a3)
+; CHECK-NEXT: vl1re32.v v11, (a3)
; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: vl1re16.v v8, (a0)
+; CHECK-NEXT: vl1re32.v v8, (a0)
; CHECK-NEXT: add a0, a3, a1
-; CHECK-NEXT: vl1re16.v v9, (a2)
-; CHECK-NEXT: vl1re16.v v12, (a3)
-; CHECK-NEXT: vl1re16.v v13, (a0)
+; CHECK-NEXT: vl1re32.v v9, (a2)
+; CHECK-NEXT: vl1re32.v v12, (a3)
+; CHECK-NEXT: vl1re32.v v13, (a0)
; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl1re16.v v14, (a0)
+; CHECK-NEXT: vl1re32.v v14, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a1, a0, 3
; CHECK-NEXT: sub a0, a1, a0
@@ -7427,7 +12995,7 @@ define <vscale x 28 x bfloat> @vector_interleave_nxv28bf16_nxv4bf16(<vscale x 4
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv28bf16_nxv4bf16:
+; ZVBB-LABEL: vector_interleave_nxv14f32_nxv2f32:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
@@ -7438,31 +13006,31 @@ define <vscale x 28 x bfloat> @vector_interleave_nxv28bf16_nxv4bf16(<vscale x 4
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: add a2, a0, a1
; ZVBB-NEXT: add a3, a2, a1
-; ZVBB-NEXT: vsetvli a4, zero, e16, m1, ta, ma
-; ZVBB-NEXT: vsseg7e16.v v8, (a0)
-; ZVBB-NEXT: vl1re16.v v10, (a3)
+; ZVBB-NEXT: vsetvli a4, zero, e32, m1, ta, ma
+; ZVBB-NEXT: vsseg7e32.v v8, (a0)
+; ZVBB-NEXT: vl1re32.v v10, (a3)
; ZVBB-NEXT: add a3, a3, a1
-; ZVBB-NEXT: vl1re16.v v11, (a3)
+; ZVBB-NEXT: vl1re32.v v11, (a3)
; ZVBB-NEXT: add a3, a3, a1
-; ZVBB-NEXT: vl1re16.v v8, (a0)
+; ZVBB-NEXT: vl1re32.v v8, (a0)
; ZVBB-NEXT: add a0, a3, a1
-; ZVBB-NEXT: vl1re16.v v9, (a2)
-; ZVBB-NEXT: vl1re16.v v12, (a3)
-; ZVBB-NEXT: vl1re16.v v13, (a0)
+; ZVBB-NEXT: vl1re32.v v9, (a2)
+; ZVBB-NEXT: vl1re32.v v12, (a3)
+; ZVBB-NEXT: vl1re32.v v13, (a0)
; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl1re16.v v14, (a0)
+; ZVBB-NEXT: vl1re32.v v14, (a0)
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a1, a0, 3
; ZVBB-NEXT: sub a0, a1, a0
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 28 x bfloat> @llvm.vector.interleave7.nxv28bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5, <vscale x 4 x bfloat> %v6)
- ret <vscale x 28 x bfloat> %res
+ %res = call <vscale x 14 x float> @llvm.vector.interleave7.nxv14f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5, <vscale x 2 x float> %v6)
+ ret <vscale x 14 x float> %res
}
-define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5, <vscale x 8 x bfloat> %v6) nounwind {
-; RV32-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
+define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5, <vscale x 4 x float> %v6) nounwind {
+; RV32-LABEL: vector_interleave_nxv28f32_nxv4f32:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -80
; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
@@ -7472,7 +13040,7 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: sub sp, sp, a0
; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; RV32-NEXT: vmv2r.v v26, v20
; RV32-NEXT: addi a0, sp, 64
; RV32-NEXT: vmv2r.v v24, v16
@@ -7500,51 +13068,51 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
; RV32-NEXT: vmv1r.v v22, v11
; RV32-NEXT: add a6, a7, a2
; RV32-NEXT: vmv1r.v v24, v15
-; RV32-NEXT: vsseg7e16.v v1, (a0)
+; RV32-NEXT: vsseg7e32.v v1, (a0)
; RV32-NEXT: vmv1r.v v26, v19
-; RV32-NEXT: vsseg7e16.v v21, (a1)
-; RV32-NEXT: vl1re16.v v18, (a6)
+; RV32-NEXT: vsseg7e32.v v21, (a1)
+; RV32-NEXT: vl1re32.v v18, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v19, (a6)
+; RV32-NEXT: vl1re32.v v19, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v20, (a6)
+; RV32-NEXT: vl1re32.v v20, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v21, (a6)
+; RV32-NEXT: vl1re32.v v21, (a6)
; RV32-NEXT: add a6, a3, a2
-; RV32-NEXT: vl1re16.v v10, (a6)
+; RV32-NEXT: vl1re32.v v10, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v11, (a6)
-; RV32-NEXT: vl1re16.v v8, (a0)
-; RV32-NEXT: vl1re16.v v16, (a4)
-; RV32-NEXT: vl1re16.v v9, (a3)
-; RV32-NEXT: vl1re16.v v17, (a7)
+; RV32-NEXT: vl1re32.v v11, (a6)
+; RV32-NEXT: vl1re32.v v8, (a0)
+; RV32-NEXT: vl1re32.v v16, (a4)
+; RV32-NEXT: vl1re32.v v9, (a3)
+; RV32-NEXT: vl1re32.v v17, (a7)
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a3, 14
; RV32-NEXT: mul a0, a0, a3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 64
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v12, (a6)
+; RV32-NEXT: vl1re32.v v12, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re16.v v13, (a6)
+; RV32-NEXT: vl1re32.v v13, (a6)
; RV32-NEXT: add a6, a6, a2
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, a0, a2
-; RV32-NEXT: vl1re16.v v14, (a6)
-; RV32-NEXT: vl1re16.v v15, (a1)
+; RV32-NEXT: vl1re32.v v14, (a6)
+; RV32-NEXT: vl1re32.v v15, (a1)
; RV32-NEXT: add a5, a0, a5
; RV32-NEXT: vs2r.v v20, (a5)
; RV32-NEXT: vs4r.v v16, (a2)
; RV32-NEXT: vs8r.v v8, (a0)
-; RV32-NEXT: vl8re16.v v16, (a2)
-; RV32-NEXT: vl8re16.v v8, (a0)
+; RV32-NEXT: vl8re32.v v16, (a2)
+; RV32-NEXT: vl8re32.v v8, (a0)
; RV32-NEXT: addi sp, s0, -80
; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 80
; RV32-NEXT: ret
;
-; RV64-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
+; RV64-LABEL: vector_interleave_nxv28f32_nxv4f32:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -80
; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -7554,7 +13122,7 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
; RV64-NEXT: slli a0, a0, 5
; RV64-NEXT: sub sp, sp, a0
; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; RV64-NEXT: vmv2r.v v26, v20
; RV64-NEXT: addi a0, sp, 64
; RV64-NEXT: vmv2r.v v24, v16
@@ -7582,51 +13150,51 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
; RV64-NEXT: vmv1r.v v22, v11
; RV64-NEXT: add a6, a7, a2
; RV64-NEXT: vmv1r.v v24, v15
-; RV64-NEXT: vsseg7e16.v v1, (a0)
+; RV64-NEXT: vsseg7e32.v v1, (a0)
; RV64-NEXT: vmv1r.v v26, v19
-; RV64-NEXT: vsseg7e16.v v21, (a1)
-; RV64-NEXT: vl1re16.v v18, (a6)
+; RV64-NEXT: vsseg7e32.v v21, (a1)
+; RV64-NEXT: vl1re32.v v18, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v19, (a6)
+; RV64-NEXT: vl1re32.v v19, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v20, (a6)
+; RV64-NEXT: vl1re32.v v20, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v21, (a6)
+; RV64-NEXT: vl1re32.v v21, (a6)
; RV64-NEXT: add a6, a3, a2
-; RV64-NEXT: vl1re16.v v10, (a6)
+; RV64-NEXT: vl1re32.v v10, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v11, (a6)
-; RV64-NEXT: vl1re16.v v8, (a0)
-; RV64-NEXT: vl1re16.v v16, (a4)
-; RV64-NEXT: vl1re16.v v9, (a3)
-; RV64-NEXT: vl1re16.v v17, (a7)
+; RV64-NEXT: vl1re32.v v11, (a6)
+; RV64-NEXT: vl1re32.v v8, (a0)
+; RV64-NEXT: vl1re32.v v16, (a4)
+; RV64-NEXT: vl1re32.v v9, (a3)
+; RV64-NEXT: vl1re32.v v17, (a7)
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: li a3, 14
; RV64-NEXT: mul a0, a0, a3
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 64
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v12, (a6)
+; RV64-NEXT: vl1re32.v v12, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re16.v v13, (a6)
+; RV64-NEXT: vl1re32.v v13, (a6)
; RV64-NEXT: add a6, a6, a2
; RV64-NEXT: slli a2, a2, 3
; RV64-NEXT: add a2, a0, a2
-; RV64-NEXT: vl1re16.v v14, (a6)
-; RV64-NEXT: vl1re16.v v15, (a1)
+; RV64-NEXT: vl1re32.v v14, (a6)
+; RV64-NEXT: vl1re32.v v15, (a1)
; RV64-NEXT: add a5, a0, a5
; RV64-NEXT: vs2r.v v20, (a5)
; RV64-NEXT: vs4r.v v16, (a2)
; RV64-NEXT: vs8r.v v8, (a0)
-; RV64-NEXT: vl8re16.v v16, (a2)
-; RV64-NEXT: vl8re16.v v8, (a0)
+; RV64-NEXT: vl8re32.v v16, (a2)
+; RV64-NEXT: vl8re32.v v8, (a0)
; RV64-NEXT: addi sp, s0, -80
; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: ret
;
-; ZVBB-RV32-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
+; ZVBB-RV32-LABEL: vector_interleave_nxv28f32_nxv4f32:
; ZVBB-RV32: # %bb.0:
; ZVBB-RV32-NEXT: addi sp, sp, -80
; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
@@ -7636,7 +13204,7 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
; ZVBB-RV32-NEXT: slli a0, a0, 5
; ZVBB-RV32-NEXT: sub sp, sp, a0
; ZVBB-RV32-NEXT: andi sp, sp, -64
-; ZVBB-RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; ZVBB-RV32-NEXT: vmv2r.v v26, v20
; ZVBB-RV32-NEXT: addi a0, sp, 64
; ZVBB-RV32-NEXT: vmv2r.v v24, v16
@@ -7664,51 +13232,51 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
; ZVBB-RV32-NEXT: vmv1r.v v22, v11
; ZVBB-RV32-NEXT: add a6, a7, a2
; ZVBB-RV32-NEXT: vmv1r.v v24, v15
-; ZVBB-RV32-NEXT: vsseg7e16.v v1, (a0)
+; ZVBB-RV32-NEXT: vsseg7e32.v v1, (a0)
; ZVBB-RV32-NEXT: vmv1r.v v26, v19
-; ZVBB-RV32-NEXT: vsseg7e16.v v21, (a1)
-; ZVBB-RV32-NEXT: vl1re16.v v18, (a6)
+; ZVBB-RV32-NEXT: vsseg7e32.v v21, (a1)
+; ZVBB-RV32-NEXT: vl1re32.v v18, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v19, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v19, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v20, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v20, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v21, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v21, (a6)
; ZVBB-RV32-NEXT: add a6, a3, a2
-; ZVBB-RV32-NEXT: vl1re16.v v10, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v10, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v11, (a6)
-; ZVBB-RV32-NEXT: vl1re16.v v8, (a0)
-; ZVBB-RV32-NEXT: vl1re16.v v16, (a4)
-; ZVBB-RV32-NEXT: vl1re16.v v9, (a3)
-; ZVBB-RV32-NEXT: vl1re16.v v17, (a7)
+; ZVBB-RV32-NEXT: vl1re32.v v11, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re32.v v16, (a4)
+; ZVBB-RV32-NEXT: vl1re32.v v9, (a3)
+; ZVBB-RV32-NEXT: vl1re32.v v17, (a7)
; ZVBB-RV32-NEXT: csrr a0, vlenb
; ZVBB-RV32-NEXT: li a3, 14
; ZVBB-RV32-NEXT: mul a0, a0, a3
; ZVBB-RV32-NEXT: add a0, sp, a0
; ZVBB-RV32-NEXT: addi a0, a0, 64
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v12, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v12, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re16.v v13, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v13, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
; ZVBB-RV32-NEXT: slli a2, a2, 3
; ZVBB-RV32-NEXT: add a2, a0, a2
-; ZVBB-RV32-NEXT: vl1re16.v v14, (a6)
-; ZVBB-RV32-NEXT: vl1re16.v v15, (a1)
+; ZVBB-RV32-NEXT: vl1re32.v v14, (a6)
+; ZVBB-RV32-NEXT: vl1re32.v v15, (a1)
; ZVBB-RV32-NEXT: add a5, a0, a5
; ZVBB-RV32-NEXT: vs2r.v v20, (a5)
; ZVBB-RV32-NEXT: vs4r.v v16, (a2)
; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT: vl8re16.v v16, (a2)
-; ZVBB-RV32-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8re32.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re32.v v8, (a0)
; ZVBB-RV32-NEXT: addi sp, s0, -80
; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: addi sp, sp, 80
; ZVBB-RV32-NEXT: ret
;
-; ZVBB-RV64-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
+; ZVBB-RV64-LABEL: vector_interleave_nxv28f32_nxv4f32:
; ZVBB-RV64: # %bb.0:
; ZVBB-RV64-NEXT: addi sp, sp, -80
; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -7718,7 +13286,7 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
; ZVBB-RV64-NEXT: slli a0, a0, 5
; ZVBB-RV64-NEXT: sub sp, sp, a0
; ZVBB-RV64-NEXT: andi sp, sp, -64
-; ZVBB-RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; ZVBB-RV64-NEXT: vmv2r.v v26, v20
; ZVBB-RV64-NEXT: addi a0, sp, 64
; ZVBB-RV64-NEXT: vmv2r.v v24, v16
@@ -7746,51 +13314,51 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
; ZVBB-RV64-NEXT: vmv1r.v v22, v11
; ZVBB-RV64-NEXT: add a6, a7, a2
; ZVBB-RV64-NEXT: vmv1r.v v24, v15
-; ZVBB-RV64-NEXT: vsseg7e16.v v1, (a0)
+; ZVBB-RV64-NEXT: vsseg7e32.v v1, (a0)
; ZVBB-RV64-NEXT: vmv1r.v v26, v19
-; ZVBB-RV64-NEXT: vsseg7e16.v v21, (a1)
-; ZVBB-RV64-NEXT: vl1re16.v v18, (a6)
+; ZVBB-RV64-NEXT: vsseg7e32.v v21, (a1)
+; ZVBB-RV64-NEXT: vl1re32.v v18, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v19, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v19, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v20, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v20, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v21, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v21, (a6)
; ZVBB-RV64-NEXT: add a6, a3, a2
-; ZVBB-RV64-NEXT: vl1re16.v v10, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v10, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v11, (a6)
-; ZVBB-RV64-NEXT: vl1re16.v v8, (a0)
-; ZVBB-RV64-NEXT: vl1re16.v v16, (a4)
-; ZVBB-RV64-NEXT: vl1re16.v v9, (a3)
-; ZVBB-RV64-NEXT: vl1re16.v v17, (a7)
+; ZVBB-RV64-NEXT: vl1re32.v v11, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re32.v v16, (a4)
+; ZVBB-RV64-NEXT: vl1re32.v v9, (a3)
+; ZVBB-RV64-NEXT: vl1re32.v v17, (a7)
; ZVBB-RV64-NEXT: csrr a0, vlenb
; ZVBB-RV64-NEXT: li a3, 14
; ZVBB-RV64-NEXT: mul a0, a0, a3
; ZVBB-RV64-NEXT: add a0, sp, a0
; ZVBB-RV64-NEXT: addi a0, a0, 64
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v12, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v12, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re16.v v13, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v13, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
; ZVBB-RV64-NEXT: slli a2, a2, 3
; ZVBB-RV64-NEXT: add a2, a0, a2
-; ZVBB-RV64-NEXT: vl1re16.v v14, (a6)
-; ZVBB-RV64-NEXT: vl1re16.v v15, (a1)
+; ZVBB-RV64-NEXT: vl1re32.v v14, (a6)
+; ZVBB-RV64-NEXT: vl1re32.v v15, (a1)
; ZVBB-RV64-NEXT: add a5, a0, a5
; ZVBB-RV64-NEXT: vs2r.v v20, (a5)
; ZVBB-RV64-NEXT: vs4r.v v16, (a2)
; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT: vl8re16.v v16, (a2)
-; ZVBB-RV64-NEXT: vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8re32.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re32.v v8, (a0)
; ZVBB-RV64-NEXT: addi sp, s0, -80
; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: addi sp, sp, 80
; ZVBB-RV64-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
+; ZIP-LABEL: vector_interleave_nxv28f32_nxv4f32:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -80
; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -7800,7 +13368,7 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
; ZIP-NEXT: slli a0, a0, 5
; ZIP-NEXT: sub sp, sp, a0
; ZIP-NEXT: andi sp, sp, -64
-; ZIP-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma
; ZIP-NEXT: vmv2r.v v26, v20
; ZIP-NEXT: addi a0, sp, 64
; ZIP-NEXT: vmv2r.v v24, v16
@@ -7828,139 +13396,55 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
; ZIP-NEXT: vmv1r.v v22, v11
; ZIP-NEXT: add a6, a7, a2
; ZIP-NEXT: vmv1r.v v24, v15
-; ZIP-NEXT: vsseg7e16.v v1, (a0)
+; ZIP-NEXT: vsseg7e32.v v1, (a0)
; ZIP-NEXT: vmv1r.v v26, v19
-; ZIP-NEXT: vsseg7e16.v v21, (a1)
-; ZIP-NEXT: vl1re16.v v18, (a6)
+; ZIP-NEXT: vsseg7e32.v v21, (a1)
+; ZIP-NEXT: vl1re32.v v18, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v19, (a6)
+; ZIP-NEXT: vl1re32.v v19, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v20, (a6)
+; ZIP-NEXT: vl1re32.v v20, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v21, (a6)
+; ZIP-NEXT: vl1re32.v v21, (a6)
; ZIP-NEXT: add a6, a3, a2
-; ZIP-NEXT: vl1re16.v v10, (a6)
+; ZIP-NEXT: vl1re32.v v10, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v11, (a6)
-; ZIP-NEXT: vl1re16.v v8, (a0)
-; ZIP-NEXT: vl1re16.v v16, (a4)
-; ZIP-NEXT: vl1re16.v v9, (a3)
-; ZIP-NEXT: vl1re16.v v17, (a7)
+; ZIP-NEXT: vl1re32.v v11, (a6)
+; ZIP-NEXT: vl1re32.v v8, (a0)
+; ZIP-NEXT: vl1re32.v v16, (a4)
+; ZIP-NEXT: vl1re32.v v9, (a3)
+; ZIP-NEXT: vl1re32.v v17, (a7)
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: li a3, 14
; ZIP-NEXT: mul a0, a0, a3
; ZIP-NEXT: add a0, sp, a0
; ZIP-NEXT: addi a0, a0, 64
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v12, (a6)
+; ZIP-NEXT: vl1re32.v v12, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re16.v v13, (a6)
+; ZIP-NEXT: vl1re32.v v13, (a6)
; ZIP-NEXT: add a6, a6, a2
; ZIP-NEXT: slli a2, a2, 3
; ZIP-NEXT: add a2, a0, a2
-; ZIP-NEXT: vl1re16.v v14, (a6)
-; ZIP-NEXT: vl1re16.v v15, (a1)
+; ZIP-NEXT: vl1re32.v v14, (a6)
+; ZIP-NEXT: vl1re32.v v15, (a1)
; ZIP-NEXT: add a5, a0, a5
; ZIP-NEXT: vs2r.v v20, (a5)
; ZIP-NEXT: vs4r.v v16, (a2)
; ZIP-NEXT: vs8r.v v8, (a0)
-; ZIP-NEXT: vl8re16.v v16, (a2)
-; ZIP-NEXT: vl8re16.v v8, (a0)
+; ZIP-NEXT: vl8re32.v v16, (a2)
+; ZIP-NEXT: vl8re32.v v8, (a0)
; ZIP-NEXT: addi sp, s0, -80
; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZIP-NEXT: addi sp, sp, 80
; ZIP-NEXT: ret
- %res = call <vscale x 56 x bfloat> @llvm.vector.interleave7.nxv56bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5, <vscale x 8 x bfloat> %v6)
- ret <vscale x 56 x bfloat> %res
-}
-
-define <vscale x 7 x float> @vector_interleave_nxv7f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5, <vscale x 1 x float> %v6) nounwind {
-; CHECK-LABEL: vector_interleave_nxv7f32_nxv1f32:
-; CHECK: # %bb.0:
-; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: addi a0, sp, 16
-; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: srli a2, a1, 1
-; CHECK-NEXT: srli a1, a1, 3
-; CHECK-NEXT: add a3, a0, a2
-; CHECK-NEXT: add a4, a3, a2
-; CHECK-NEXT: add a5, a4, a2
-; CHECK-NEXT: add a6, a5, a2
-; CHECK-NEXT: vsetvli a7, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vsseg7e32.v v8, (a0)
-; CHECK-NEXT: add a7, a6, a2
-; CHECK-NEXT: vle32.v v8, (a7)
-; CHECK-NEXT: vle32.v v10, (a6)
-; CHECK-NEXT: add a6, a1, a1
-; CHECK-NEXT: add a2, a7, a2
-; CHECK-NEXT: vle32.v v12, (a5)
-; CHECK-NEXT: vsetvli zero, a6, e32, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v10, v8, a1
-; CHECK-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v11, (a2)
-; CHECK-NEXT: vle32.v v9, (a4)
-; CHECK-NEXT: vsetvli zero, a6, e32, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v9, v12, a1
-; CHECK-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
-; CHECK-NEXT: vle32.v v12, (a3)
-; CHECK-NEXT: vle32.v v8, (a0)
-; CHECK-NEXT: vsetvli zero, a6, e32, m1, ta, ma
-; CHECK-NEXT: vslideup.vx v8, v12, a1
-; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a0, a0, 2
-; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: ret
-;
-; ZVBB-LABEL: vector_interleave_nxv7f32_nxv1f32:
-; ZVBB: # %bb.0:
-; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a0, a0, 2
-; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: addi a0, sp, 16
-; ZVBB-NEXT: csrr a1, vlenb
-; ZVBB-NEXT: srli a2, a1, 1
-; ZVBB-NEXT: srli a1, a1, 3
-; ZVBB-NEXT: add a3, a0, a2
-; ZVBB-NEXT: add a4, a3, a2
-; ZVBB-NEXT: add a5, a4, a2
-; ZVBB-NEXT: add a6, a5, a2
-; ZVBB-NEXT: vsetvli a7, zero, e32, mf2, ta, ma
-; ZVBB-NEXT: vsseg7e32.v v8, (a0)
-; ZVBB-NEXT: add a7, a6, a2
-; ZVBB-NEXT: vle32.v v8, (a7)
-; ZVBB-NEXT: vle32.v v10, (a6)
-; ZVBB-NEXT: add a6, a1, a1
-; ZVBB-NEXT: add a2, a7, a2
-; ZVBB-NEXT: vle32.v v12, (a5)
-; ZVBB-NEXT: vsetvli zero, a6, e32, m1, ta, ma
-; ZVBB-NEXT: vslideup.vx v10, v8, a1
-; ZVBB-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
-; ZVBB-NEXT: vle32.v v11, (a2)
-; ZVBB-NEXT: vle32.v v9, (a4)
-; ZVBB-NEXT: vsetvli zero, a6, e32, m1, ta, ma
-; ZVBB-NEXT: vslideup.vx v9, v12, a1
-; ZVBB-NEXT: vsetvli a2, zero, e32, mf2, ta, ma
-; ZVBB-NEXT: vle32.v v12, (a3)
-; ZVBB-NEXT: vle32.v v8, (a0)
-; ZVBB-NEXT: vsetvli zero, a6, e32, m1, ta, ma
-; ZVBB-NEXT: vslideup.vx v8, v12, a1
-; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a0, a0, 2
-; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: ret
- %res = call <vscale x 7 x float> @llvm.vector.interleave7.nxv7f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5, <vscale x 1 x float> %v6)
- ret <vscale x 7 x float> %res
+ %res = call <vscale x 28 x float> @llvm.vector.interleave7.nxv28f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5, <vscale x 4 x float> %v6)
+ ret <vscale x 28 x float> %res
}
-define <vscale x 14 x float> @vector_interleave_nxv14f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5, <vscale x 2 x float> %v6) nounwind {
-; CHECK-LABEL: vector_interleave_nxv14f32_nxv2f32:
+define <vscale x 7 x double> @vector_interleave_nxv7f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5, <vscale x 1 x double> %v6) nounwind {
+; CHECK-LABEL: vector_interleave_nxv7f64_nxv1f64:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
@@ -7971,19 +13455,19 @@ define <vscale x 14 x float> @vector_interleave_nxv14f32_nxv2f32(<vscale x 2 x f
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: add a2, a0, a1
; CHECK-NEXT: add a3, a2, a1
-; CHECK-NEXT: vsetvli a4, zero, e32, m1, ta, ma
-; CHECK-NEXT: vsseg7e32.v v8, (a0)
-; CHECK-NEXT: vl1re32.v v10, (a3)
+; CHECK-NEXT: vsetvli a4, zero, e64, m1, ta, ma
+; CHECK-NEXT: vsseg7e64.v v8, (a0)
+; CHECK-NEXT: vl1re64.v v10, (a3)
; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: vl1re32.v v11, (a3)
+; CHECK-NEXT: vl1re64.v v11, (a3)
; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: vl1re32.v v8, (a0)
+; CHECK-NEXT: vl1re64.v v8, (a0)
; CHECK-NEXT: add a0, a3, a1
-; CHECK-NEXT: vl1re32.v v9, (a2)
-; CHECK-NEXT: vl1re32.v v12, (a3)
-; CHECK-NEXT: vl1re32.v v13, (a0)
+; CHECK-NEXT: vl1re64.v v9, (a2)
+; CHECK-NEXT: vl1re64.v v12, (a3)
+; CHECK-NEXT: vl1re64.v v13, (a0)
; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl1re32.v v14, (a0)
+; CHECK-NEXT: vl1re64.v v14, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a1, a0, 3
; CHECK-NEXT: sub a0, a1, a0
@@ -7991,7 +13475,7 @@ define <vscale x 14 x float> @vector_interleave_nxv14f32_nxv2f32(<vscale x 2 x f
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv14f32_nxv2f32:
+; ZVBB-LABEL: vector_interleave_nxv7f64_nxv1f64:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
@@ -8002,31 +13486,31 @@ define <vscale x 14 x float> @vector_interleave_nxv14f32_nxv2f32(<vscale x 2 x f
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: add a2, a0, a1
; ZVBB-NEXT: add a3, a2, a1
-; ZVBB-NEXT: vsetvli a4, zero, e32, m1, ta, ma
-; ZVBB-NEXT: vsseg7e32.v v8, (a0)
-; ZVBB-NEXT: vl1re32.v v10, (a3)
+; ZVBB-NEXT: vsetvli a4, zero, e64, m1, ta, ma
+; ZVBB-NEXT: vsseg7e64.v v8, (a0)
+; ZVBB-NEXT: vl1re64.v v10, (a3)
; ZVBB-NEXT: add a3, a3, a1
-; ZVBB-NEXT: vl1re32.v v11, (a3)
+; ZVBB-NEXT: vl1re64.v v11, (a3)
; ZVBB-NEXT: add a3, a3, a1
-; ZVBB-NEXT: vl1re32.v v8, (a0)
+; ZVBB-NEXT: vl1re64.v v8, (a0)
; ZVBB-NEXT: add a0, a3, a1
-; ZVBB-NEXT: vl1re32.v v9, (a2)
-; ZVBB-NEXT: vl1re32.v v12, (a3)
-; ZVBB-NEXT: vl1re32.v v13, (a0)
+; ZVBB-NEXT: vl1re64.v v9, (a2)
+; ZVBB-NEXT: vl1re64.v v12, (a3)
+; ZVBB-NEXT: vl1re64.v v13, (a0)
; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl1re32.v v14, (a0)
+; ZVBB-NEXT: vl1re64.v v14, (a0)
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a1, a0, 3
; ZVBB-NEXT: sub a0, a1, a0
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 14 x float> @llvm.vector.interleave7.nxv14f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5, <vscale x 2 x float> %v6)
- ret <vscale x 14 x float> %res
+ %res = call <vscale x 7 x double> @llvm.vector.interleave7.nxv7f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5, <vscale x 1 x double> %v6)
+ ret <vscale x 7 x double> %res
}
-define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5, <vscale x 4 x float> %v6) nounwind {
-; RV32-LABEL: vector_interleave_nxv28f32_nxv4f32:
+define <vscale x 14 x double> @vector_interleave_nxv14f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5, <vscale x 2 x double> %v6) nounwind {
+; RV32-LABEL: vector_interleave_nxv14f64_nxv2f64:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -80
; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
@@ -8036,7 +13520,7 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: sub sp, sp, a0
; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; RV32-NEXT: vmv2r.v v26, v20
; RV32-NEXT: addi a0, sp, 64
; RV32-NEXT: vmv2r.v v24, v16
@@ -8064,51 +13548,51 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
; RV32-NEXT: vmv1r.v v22, v11
; RV32-NEXT: add a6, a7, a2
; RV32-NEXT: vmv1r.v v24, v15
-; RV32-NEXT: vsseg7e32.v v1, (a0)
+; RV32-NEXT: vsseg7e64.v v1, (a0)
; RV32-NEXT: vmv1r.v v26, v19
-; RV32-NEXT: vsseg7e32.v v21, (a1)
-; RV32-NEXT: vl1re32.v v18, (a6)
+; RV32-NEXT: vsseg7e64.v v21, (a1)
+; RV32-NEXT: vl1re64.v v18, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re32.v v19, (a6)
+; RV32-NEXT: vl1re64.v v19, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re32.v v20, (a6)
+; RV32-NEXT: vl1re64.v v20, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re32.v v21, (a6)
+; RV32-NEXT: vl1re64.v v21, (a6)
; RV32-NEXT: add a6, a3, a2
-; RV32-NEXT: vl1re32.v v10, (a6)
+; RV32-NEXT: vl1re64.v v10, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re32.v v11, (a6)
-; RV32-NEXT: vl1re32.v v8, (a0)
-; RV32-NEXT: vl1re32.v v16, (a4)
-; RV32-NEXT: vl1re32.v v9, (a3)
-; RV32-NEXT: vl1re32.v v17, (a7)
+; RV32-NEXT: vl1re64.v v11, (a6)
+; RV32-NEXT: vl1re64.v v8, (a0)
+; RV32-NEXT: vl1re64.v v16, (a4)
+; RV32-NEXT: vl1re64.v v9, (a3)
+; RV32-NEXT: vl1re64.v v17, (a7)
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: li a3, 14
; RV32-NEXT: mul a0, a0, a3
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 64
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re32.v v12, (a6)
+; RV32-NEXT: vl1re64.v v12, (a6)
; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re32.v v13, (a6)
+; RV32-NEXT: vl1re64.v v13, (a6)
; RV32-NEXT: add a6, a6, a2
; RV32-NEXT: slli a2, a2, 3
; RV32-NEXT: add a2, a0, a2
-; RV32-NEXT: vl1re32.v v14, (a6)
-; RV32-NEXT: vl1re32.v v15, (a1)
+; RV32-NEXT: vl1re64.v v14, (a6)
+; RV32-NEXT: vl1re64.v v15, (a1)
; RV32-NEXT: add a5, a0, a5
; RV32-NEXT: vs2r.v v20, (a5)
; RV32-NEXT: vs4r.v v16, (a2)
; RV32-NEXT: vs8r.v v8, (a0)
-; RV32-NEXT: vl8re32.v v16, (a2)
-; RV32-NEXT: vl8re32.v v8, (a0)
+; RV32-NEXT: vl8re64.v v16, (a2)
+; RV32-NEXT: vl8re64.v v8, (a0)
; RV32-NEXT: addi sp, s0, -80
; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; RV32-NEXT: addi sp, sp, 80
; RV32-NEXT: ret
;
-; RV64-LABEL: vector_interleave_nxv28f32_nxv4f32:
+; RV64-LABEL: vector_interleave_nxv14f64_nxv2f64:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -80
; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -8118,7 +13602,7 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
; RV64-NEXT: slli a0, a0, 5
; RV64-NEXT: sub sp, sp, a0
; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; RV64-NEXT: vmv2r.v v26, v20
; RV64-NEXT: addi a0, sp, 64
; RV64-NEXT: vmv2r.v v24, v16
@@ -8146,51 +13630,51 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
; RV64-NEXT: vmv1r.v v22, v11
; RV64-NEXT: add a6, a7, a2
; RV64-NEXT: vmv1r.v v24, v15
-; RV64-NEXT: vsseg7e32.v v1, (a0)
+; RV64-NEXT: vsseg7e64.v v1, (a0)
; RV64-NEXT: vmv1r.v v26, v19
-; RV64-NEXT: vsseg7e32.v v21, (a1)
-; RV64-NEXT: vl1re32.v v18, (a6)
+; RV64-NEXT: vsseg7e64.v v21, (a1)
+; RV64-NEXT: vl1re64.v v18, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re32.v v19, (a6)
+; RV64-NEXT: vl1re64.v v19, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re32.v v20, (a6)
+; RV64-NEXT: vl1re64.v v20, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re32.v v21, (a6)
+; RV64-NEXT: vl1re64.v v21, (a6)
; RV64-NEXT: add a6, a3, a2
-; RV64-NEXT: vl1re32.v v10, (a6)
+; RV64-NEXT: vl1re64.v v10, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re32.v v11, (a6)
-; RV64-NEXT: vl1re32.v v8, (a0)
-; RV64-NEXT: vl1re32.v v16, (a4)
-; RV64-NEXT: vl1re32.v v9, (a3)
-; RV64-NEXT: vl1re32.v v17, (a7)
+; RV64-NEXT: vl1re64.v v11, (a6)
+; RV64-NEXT: vl1re64.v v8, (a0)
+; RV64-NEXT: vl1re64.v v16, (a4)
+; RV64-NEXT: vl1re64.v v9, (a3)
+; RV64-NEXT: vl1re64.v v17, (a7)
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: li a3, 14
; RV64-NEXT: mul a0, a0, a3
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 64
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re32.v v12, (a6)
+; RV64-NEXT: vl1re64.v v12, (a6)
; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re32.v v13, (a6)
+; RV64-NEXT: vl1re64.v v13, (a6)
; RV64-NEXT: add a6, a6, a2
; RV64-NEXT: slli a2, a2, 3
; RV64-NEXT: add a2, a0, a2
-; RV64-NEXT: vl1re32.v v14, (a6)
-; RV64-NEXT: vl1re32.v v15, (a1)
+; RV64-NEXT: vl1re64.v v14, (a6)
+; RV64-NEXT: vl1re64.v v15, (a1)
; RV64-NEXT: add a5, a0, a5
; RV64-NEXT: vs2r.v v20, (a5)
; RV64-NEXT: vs4r.v v16, (a2)
; RV64-NEXT: vs8r.v v8, (a0)
-; RV64-NEXT: vl8re32.v v16, (a2)
-; RV64-NEXT: vl8re32.v v8, (a0)
+; RV64-NEXT: vl8re64.v v16, (a2)
+; RV64-NEXT: vl8re64.v v8, (a0)
; RV64-NEXT: addi sp, s0, -80
; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; RV64-NEXT: addi sp, sp, 80
; RV64-NEXT: ret
;
-; ZVBB-RV32-LABEL: vector_interleave_nxv28f32_nxv4f32:
+; ZVBB-RV32-LABEL: vector_interleave_nxv14f64_nxv2f64:
; ZVBB-RV32: # %bb.0:
; ZVBB-RV32-NEXT: addi sp, sp, -80
; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
@@ -8200,7 +13684,7 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
; ZVBB-RV32-NEXT: slli a0, a0, 5
; ZVBB-RV32-NEXT: sub sp, sp, a0
; ZVBB-RV32-NEXT: andi sp, sp, -64
-; ZVBB-RV32-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; ZVBB-RV32-NEXT: vmv2r.v v26, v20
; ZVBB-RV32-NEXT: addi a0, sp, 64
; ZVBB-RV32-NEXT: vmv2r.v v24, v16
@@ -8228,51 +13712,51 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
; ZVBB-RV32-NEXT: vmv1r.v v22, v11
; ZVBB-RV32-NEXT: add a6, a7, a2
; ZVBB-RV32-NEXT: vmv1r.v v24, v15
-; ZVBB-RV32-NEXT: vsseg7e32.v v1, (a0)
+; ZVBB-RV32-NEXT: vsseg7e64.v v1, (a0)
; ZVBB-RV32-NEXT: vmv1r.v v26, v19
-; ZVBB-RV32-NEXT: vsseg7e32.v v21, (a1)
-; ZVBB-RV32-NEXT: vl1re32.v v18, (a6)
+; ZVBB-RV32-NEXT: vsseg7e64.v v21, (a1)
+; ZVBB-RV32-NEXT: vl1re64.v v18, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re32.v v19, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v19, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re32.v v20, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v20, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re32.v v21, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v21, (a6)
; ZVBB-RV32-NEXT: add a6, a3, a2
-; ZVBB-RV32-NEXT: vl1re32.v v10, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v10, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re32.v v11, (a6)
-; ZVBB-RV32-NEXT: vl1re32.v v8, (a0)
-; ZVBB-RV32-NEXT: vl1re32.v v16, (a4)
-; ZVBB-RV32-NEXT: vl1re32.v v9, (a3)
-; ZVBB-RV32-NEXT: vl1re32.v v17, (a7)
+; ZVBB-RV32-NEXT: vl1re64.v v11, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v8, (a0)
+; ZVBB-RV32-NEXT: vl1re64.v v16, (a4)
+; ZVBB-RV32-NEXT: vl1re64.v v9, (a3)
+; ZVBB-RV32-NEXT: vl1re64.v v17, (a7)
; ZVBB-RV32-NEXT: csrr a0, vlenb
; ZVBB-RV32-NEXT: li a3, 14
; ZVBB-RV32-NEXT: mul a0, a0, a3
; ZVBB-RV32-NEXT: add a0, sp, a0
; ZVBB-RV32-NEXT: addi a0, a0, 64
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re32.v v12, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v12, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re32.v v13, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v13, (a6)
; ZVBB-RV32-NEXT: add a6, a6, a2
; ZVBB-RV32-NEXT: slli a2, a2, 3
; ZVBB-RV32-NEXT: add a2, a0, a2
-; ZVBB-RV32-NEXT: vl1re32.v v14, (a6)
-; ZVBB-RV32-NEXT: vl1re32.v v15, (a1)
+; ZVBB-RV32-NEXT: vl1re64.v v14, (a6)
+; ZVBB-RV32-NEXT: vl1re64.v v15, (a1)
; ZVBB-RV32-NEXT: add a5, a0, a5
; ZVBB-RV32-NEXT: vs2r.v v20, (a5)
; ZVBB-RV32-NEXT: vs4r.v v16, (a2)
; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT: vl8re32.v v16, (a2)
-; ZVBB-RV32-NEXT: vl8re32.v v8, (a0)
+; ZVBB-RV32-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV32-NEXT: vl8re64.v v8, (a0)
; ZVBB-RV32-NEXT: addi sp, s0, -80
; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; ZVBB-RV32-NEXT: addi sp, sp, 80
; ZVBB-RV32-NEXT: ret
;
-; ZVBB-RV64-LABEL: vector_interleave_nxv28f32_nxv4f32:
+; ZVBB-RV64-LABEL: vector_interleave_nxv14f64_nxv2f64:
; ZVBB-RV64: # %bb.0:
; ZVBB-RV64-NEXT: addi sp, sp, -80
; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -8282,7 +13766,7 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
; ZVBB-RV64-NEXT: slli a0, a0, 5
; ZVBB-RV64-NEXT: sub sp, sp, a0
; ZVBB-RV64-NEXT: andi sp, sp, -64
-; ZVBB-RV64-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; ZVBB-RV64-NEXT: vmv2r.v v26, v20
; ZVBB-RV64-NEXT: addi a0, sp, 64
; ZVBB-RV64-NEXT: vmv2r.v v24, v16
@@ -8310,51 +13794,51 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
; ZVBB-RV64-NEXT: vmv1r.v v22, v11
; ZVBB-RV64-NEXT: add a6, a7, a2
; ZVBB-RV64-NEXT: vmv1r.v v24, v15
-; ZVBB-RV64-NEXT: vsseg7e32.v v1, (a0)
+; ZVBB-RV64-NEXT: vsseg7e64.v v1, (a0)
; ZVBB-RV64-NEXT: vmv1r.v v26, v19
-; ZVBB-RV64-NEXT: vsseg7e32.v v21, (a1)
-; ZVBB-RV64-NEXT: vl1re32.v v18, (a6)
+; ZVBB-RV64-NEXT: vsseg7e64.v v21, (a1)
+; ZVBB-RV64-NEXT: vl1re64.v v18, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re32.v v19, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v19, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re32.v v20, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v20, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re32.v v21, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v21, (a6)
; ZVBB-RV64-NEXT: add a6, a3, a2
-; ZVBB-RV64-NEXT: vl1re32.v v10, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v10, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re32.v v11, (a6)
-; ZVBB-RV64-NEXT: vl1re32.v v8, (a0)
-; ZVBB-RV64-NEXT: vl1re32.v v16, (a4)
-; ZVBB-RV64-NEXT: vl1re32.v v9, (a3)
-; ZVBB-RV64-NEXT: vl1re32.v v17, (a7)
+; ZVBB-RV64-NEXT: vl1re64.v v11, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v8, (a0)
+; ZVBB-RV64-NEXT: vl1re64.v v16, (a4)
+; ZVBB-RV64-NEXT: vl1re64.v v9, (a3)
+; ZVBB-RV64-NEXT: vl1re64.v v17, (a7)
; ZVBB-RV64-NEXT: csrr a0, vlenb
; ZVBB-RV64-NEXT: li a3, 14
; ZVBB-RV64-NEXT: mul a0, a0, a3
; ZVBB-RV64-NEXT: add a0, sp, a0
; ZVBB-RV64-NEXT: addi a0, a0, 64
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re32.v v12, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v12, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re32.v v13, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v13, (a6)
; ZVBB-RV64-NEXT: add a6, a6, a2
; ZVBB-RV64-NEXT: slli a2, a2, 3
; ZVBB-RV64-NEXT: add a2, a0, a2
-; ZVBB-RV64-NEXT: vl1re32.v v14, (a6)
-; ZVBB-RV64-NEXT: vl1re32.v v15, (a1)
+; ZVBB-RV64-NEXT: vl1re64.v v14, (a6)
+; ZVBB-RV64-NEXT: vl1re64.v v15, (a1)
; ZVBB-RV64-NEXT: add a5, a0, a5
; ZVBB-RV64-NEXT: vs2r.v v20, (a5)
; ZVBB-RV64-NEXT: vs4r.v v16, (a2)
; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT: vl8re32.v v16, (a2)
-; ZVBB-RV64-NEXT: vl8re32.v v8, (a0)
+; ZVBB-RV64-NEXT: vl8re64.v v16, (a2)
+; ZVBB-RV64-NEXT: vl8re64.v v8, (a0)
; ZVBB-RV64-NEXT: addi sp, s0, -80
; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZVBB-RV64-NEXT: addi sp, sp, 80
; ZVBB-RV64-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv28f32_nxv4f32:
+; ZIP-LABEL: vector_interleave_nxv14f64_nxv2f64:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -80
; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
@@ -8364,7 +13848,7 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
; ZIP-NEXT: slli a0, a0, 5
; ZIP-NEXT: sub sp, sp, a0
; ZIP-NEXT: andi sp, sp, -64
-; ZIP-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZIP-NEXT: vsetvli a0, zero, e64, m1, ta, ma
; ZIP-NEXT: vmv2r.v v26, v20
; ZIP-NEXT: addi a0, sp, 64
; ZIP-NEXT: vmv2r.v v24, v16
@@ -8392,529 +13876,1147 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
; ZIP-NEXT: vmv1r.v v22, v11
; ZIP-NEXT: add a6, a7, a2
; ZIP-NEXT: vmv1r.v v24, v15
-; ZIP-NEXT: vsseg7e32.v v1, (a0)
+; ZIP-NEXT: vsseg7e64.v v1, (a0)
; ZIP-NEXT: vmv1r.v v26, v19
-; ZIP-NEXT: vsseg7e32.v v21, (a1)
-; ZIP-NEXT: vl1re32.v v18, (a6)
+; ZIP-NEXT: vsseg7e64.v v21, (a1)
+; ZIP-NEXT: vl1re64.v v18, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re32.v v19, (a6)
+; ZIP-NEXT: vl1re64.v v19, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re32.v v20, (a6)
+; ZIP-NEXT: vl1re64.v v20, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re32.v v21, (a6)
+; ZIP-NEXT: vl1re64.v v21, (a6)
; ZIP-NEXT: add a6, a3, a2
-; ZIP-NEXT: vl1re32.v v10, (a6)
+; ZIP-NEXT: vl1re64.v v10, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re32.v v11, (a6)
-; ZIP-NEXT: vl1re32.v v8, (a0)
-; ZIP-NEXT: vl1re32.v v16, (a4)
-; ZIP-NEXT: vl1re32.v v9, (a3)
-; ZIP-NEXT: vl1re32.v v17, (a7)
+; ZIP-NEXT: vl1re64.v v11, (a6)
+; ZIP-NEXT: vl1re64.v v8, (a0)
+; ZIP-NEXT: vl1re64.v v16, (a4)
+; ZIP-NEXT: vl1re64.v v9, (a3)
+; ZIP-NEXT: vl1re64.v v17, (a7)
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: li a3, 14
; ZIP-NEXT: mul a0, a0, a3
; ZIP-NEXT: add a0, sp, a0
; ZIP-NEXT: addi a0, a0, 64
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re32.v v12, (a6)
+; ZIP-NEXT: vl1re64.v v12, (a6)
; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re32.v v13, (a6)
+; ZIP-NEXT: vl1re64.v v13, (a6)
; ZIP-NEXT: add a6, a6, a2
; ZIP-NEXT: slli a2, a2, 3
; ZIP-NEXT: add a2, a0, a2
-; ZIP-NEXT: vl1re32.v v14, (a6)
-; ZIP-NEXT: vl1re32.v v15, (a1)
+; ZIP-NEXT: vl1re64.v v14, (a6)
+; ZIP-NEXT: vl1re64.v v15, (a1)
; ZIP-NEXT: add a5, a0, a5
; ZIP-NEXT: vs2r.v v20, (a5)
; ZIP-NEXT: vs4r.v v16, (a2)
; ZIP-NEXT: vs8r.v v8, (a0)
-; ZIP-NEXT: vl8re32.v v16, (a2)
-; ZIP-NEXT: vl8re32.v v8, (a0)
+; ZIP-NEXT: vl8re64.v v16, (a2)
+; ZIP-NEXT: vl8re64.v v8, (a0)
; ZIP-NEXT: addi sp, s0, -80
; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
; ZIP-NEXT: addi sp, sp, 80
; ZIP-NEXT: ret
- %res = call <vscale x 28 x float> @llvm.vector.interleave7.nxv28f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5, <vscale x 4 x float> %v6)
- ret <vscale x 28 x float> %res
+ %res = call <vscale x 14 x double> @llvm.vector.interleave7.nxv14f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5, <vscale x 2 x double> %v6)
+ ret <vscale x 14 x double> %res
+}
+
+define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5, <vscale x 2 x half> %v6, <vscale x 2 x half> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv16f16_nxv2f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 1
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: add a5, a4, a2
+; CHECK-NEXT: add a6, a5, a2
+; CHECK-NEXT: add a7, a6, a2
+; CHECK-NEXT: vsetvli t0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsseg8e16.v v8, (a0)
+; CHECK-NEXT: add t0, a7, a2
+; CHECK-NEXT: add a2, t0, a2
+; CHECK-NEXT: vle16.v v11, (t0)
+; CHECK-NEXT: vle16.v v8, (a2)
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: vle16.v v9, (a7)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v11, v8, a1
+; CHECK-NEXT: vsetvli a7, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v10, (a6)
+; CHECK-NEXT: vle16.v v8, (a5)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v10, v9, a1
+; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v9, (a4)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a1
+; CHECK-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v12, (a3)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v12, a1
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16f16_nxv2f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a2, a1, 1
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: add a4, a3, a2
+; ZVBB-NEXT: add a5, a4, a2
+; ZVBB-NEXT: add a6, a5, a2
+; ZVBB-NEXT: add a7, a6, a2
+; ZVBB-NEXT: vsetvli t0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vsseg8e16.v v8, (a0)
+; ZVBB-NEXT: add t0, a7, a2
+; ZVBB-NEXT: add a2, t0, a2
+; ZVBB-NEXT: vle16.v v11, (t0)
+; ZVBB-NEXT: vle16.v v8, (a2)
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a2, a1, a1
+; ZVBB-NEXT: vle16.v v9, (a7)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v11, v8, a1
+; ZVBB-NEXT: vsetvli a7, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v10, (a6)
+; ZVBB-NEXT: vle16.v v8, (a5)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v10, v9, a1
+; ZVBB-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v9, (a4)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v9, v8, a1
+; ZVBB-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v12, (a3)
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v12, a1
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 16 x half> @llvm.vector.interleave8.nxv16f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5, <vscale x 2 x half> %v6, <vscale x 2 x half> %v7)
+ ret <vscale x 16 x half> %res
+}
+
+define <vscale x 32 x half> @vector_interleave_nxv32f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5, <vscale x 4 x half> %v6, <vscale x 4 x half> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv32f16_nxv4f16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: add a4, a3, a1
+; CHECK-NEXT: add a5, a4, a1
+; CHECK-NEXT: add a6, a5, a1
+; CHECK-NEXT: add a7, a6, a1
+; CHECK-NEXT: vsetvli t0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsseg8e16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v14, (a7)
+; CHECK-NEXT: add a1, a7, a1
+; CHECK-NEXT: vl1re16.v v15, (a1)
+; CHECK-NEXT: vl1re16.v v12, (a5)
+; CHECK-NEXT: vl1re16.v v13, (a6)
+; CHECK-NEXT: vl1re16.v v10, (a3)
+; CHECK-NEXT: vl1re16.v v11, (a4)
+; CHECK-NEXT: vl1re16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v9, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv32f16_nxv4f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: add a4, a3, a1
+; ZVBB-NEXT: add a5, a4, a1
+; ZVBB-NEXT: add a6, a5, a1
+; ZVBB-NEXT: add a7, a6, a1
+; ZVBB-NEXT: vsetvli t0, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vsseg8e16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v14, (a7)
+; ZVBB-NEXT: add a1, a7, a1
+; ZVBB-NEXT: vl1re16.v v15, (a1)
+; ZVBB-NEXT: vl1re16.v v12, (a5)
+; ZVBB-NEXT: vl1re16.v v13, (a6)
+; ZVBB-NEXT: vl1re16.v v10, (a3)
+; ZVBB-NEXT: vl1re16.v v11, (a4)
+; ZVBB-NEXT: vl1re16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v9, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 32 x half> @llvm.vector.interleave8.nxv32f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5, <vscale x 4 x half> %v6, <vscale x 4 x half> %v7)
+ ret <vscale x 32 x half> %res
}
-define <vscale x 7 x double> @vector_interleave_nxv7f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5, <vscale x 1 x double> %v6) nounwind {
-; CHECK-LABEL: vector_interleave_nxv7f64_nxv1f64:
+define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5, <vscale x 8 x half> %v6, <vscale x 8 x half> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv64f16_nxv8f16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 3
-; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v22
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv2r.v v26, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: add a2, a0, a3
+; CHECK-NEXT: add a4, a1, a3
+; CHECK-NEXT: add a5, a2, a3
+; CHECK-NEXT: vmv1r.v v1, v8
+; CHECK-NEXT: vmv2r.v v24, v14
+; CHECK-NEXT: add a6, a4, a3
+; CHECK-NEXT: vmv2r.v v22, v10
+; CHECK-NEXT: vmv1r.v v2, v22
+; CHECK-NEXT: add a7, a5, a3
+; CHECK-NEXT: vmv1r.v v3, v12
+; CHECK-NEXT: add t0, a6, a3
+; CHECK-NEXT: vmv1r.v v4, v24
+; CHECK-NEXT: add t1, a7, a3
+; CHECK-NEXT: vmv1r.v v5, v16
+; CHECK-NEXT: add t2, t0, a3
+; CHECK-NEXT: vmv1r.v v6, v26
+; CHECK-NEXT: add t3, t1, a3
+; CHECK-NEXT: vmv1r.v v7, v20
+; CHECK-NEXT: add t4, t2, a3
+; CHECK-NEXT: vmv1r.v v8, v28
+; CHECK-NEXT: vmv1r.v v22, v9
+; CHECK-NEXT: add t5, t3, a3
+; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: add t6, t4, a3
+; CHECK-NEXT: vmv1r.v v26, v17
+; CHECK-NEXT: vsseg8e16.v v1, (a0)
+; CHECK-NEXT: vmv1r.v v28, v21
+; CHECK-NEXT: vsseg8e16.v v22, (a1)
+; CHECK-NEXT: vl1re16.v v14, (t5)
+; CHECK-NEXT: add t5, t5, a3
+; CHECK-NEXT: add a3, t6, a3
+; CHECK-NEXT: vl1re16.v v22, (t6)
+; CHECK-NEXT: vl1re16.v v15, (t5)
+; CHECK-NEXT: vl1re16.v v23, (a3)
+; CHECK-NEXT: vl1re16.v v12, (t1)
+; CHECK-NEXT: vl1re16.v v20, (t2)
+; CHECK-NEXT: vl1re16.v v13, (t3)
+; CHECK-NEXT: vl1re16.v v21, (t4)
+; CHECK-NEXT: vl1re16.v v10, (a5)
+; CHECK-NEXT: vl1re16.v v18, (a6)
+; CHECK-NEXT: vl1re16.v v11, (a7)
+; CHECK-NEXT: vl1re16.v v19, (t0)
+; CHECK-NEXT: vl1re16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v16, (a1)
+; CHECK-NEXT: vl1re16.v v9, (a2)
+; CHECK-NEXT: vl1re16.v v17, (a4)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64f16_nxv8f16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vmv2r.v v28, v22
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: vmv2r.v v26, v18
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 3
+; ZVBB-NEXT: add a1, sp, a1
+; ZVBB-NEXT: addi a1, a1, 16
+; ZVBB-NEXT: csrr a3, vlenb
+; ZVBB-NEXT: add a2, a0, a3
+; ZVBB-NEXT: add a4, a1, a3
+; ZVBB-NEXT: add a5, a2, a3
+; ZVBB-NEXT: vmv1r.v v1, v8
+; ZVBB-NEXT: vmv2r.v v24, v14
+; ZVBB-NEXT: add a6, a4, a3
+; ZVBB-NEXT: vmv2r.v v22, v10
+; ZVBB-NEXT: vmv1r.v v2, v22
+; ZVBB-NEXT: add a7, a5, a3
+; ZVBB-NEXT: vmv1r.v v3, v12
+; ZVBB-NEXT: add t0, a6, a3
+; ZVBB-NEXT: vmv1r.v v4, v24
+; ZVBB-NEXT: add t1, a7, a3
+; ZVBB-NEXT: vmv1r.v v5, v16
+; ZVBB-NEXT: add t2, t0, a3
+; ZVBB-NEXT: vmv1r.v v6, v26
+; ZVBB-NEXT: add t3, t1, a3
+; ZVBB-NEXT: vmv1r.v v7, v20
+; ZVBB-NEXT: add t4, t2, a3
+; ZVBB-NEXT: vmv1r.v v8, v28
+; ZVBB-NEXT: vmv1r.v v22, v9
+; ZVBB-NEXT: add t5, t3, a3
+; ZVBB-NEXT: vmv1r.v v24, v13
+; ZVBB-NEXT: add t6, t4, a3
+; ZVBB-NEXT: vmv1r.v v26, v17
+; ZVBB-NEXT: vsseg8e16.v v1, (a0)
+; ZVBB-NEXT: vmv1r.v v28, v21
+; ZVBB-NEXT: vsseg8e16.v v22, (a1)
+; ZVBB-NEXT: vl1re16.v v14, (t5)
+; ZVBB-NEXT: add t5, t5, a3
+; ZVBB-NEXT: add a3, t6, a3
+; ZVBB-NEXT: vl1re16.v v22, (t6)
+; ZVBB-NEXT: vl1re16.v v15, (t5)
+; ZVBB-NEXT: vl1re16.v v23, (a3)
+; ZVBB-NEXT: vl1re16.v v12, (t1)
+; ZVBB-NEXT: vl1re16.v v20, (t2)
+; ZVBB-NEXT: vl1re16.v v13, (t3)
+; ZVBB-NEXT: vl1re16.v v21, (t4)
+; ZVBB-NEXT: vl1re16.v v10, (a5)
+; ZVBB-NEXT: vl1re16.v v18, (a6)
+; ZVBB-NEXT: vl1re16.v v11, (a7)
+; ZVBB-NEXT: vl1re16.v v19, (t0)
+; ZVBB-NEXT: vl1re16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v16, (a1)
+; ZVBB-NEXT: vl1re16.v v9, (a2)
+; ZVBB-NEXT: vl1re16.v v17, (a4)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 64 x half> @llvm.vector.interleave8.nxv64f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5, <vscale x 8 x half> %v6, <vscale x 8 x half> %v7)
+ ret <vscale x 64 x half> %res
+}
+
+define <vscale x 16 x bfloat> @vector_interleave_nxv16bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5, <vscale x 2 x bfloat> %v6, <vscale x 2 x bfloat> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv16bf16_nxv2bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 1
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: add a5, a4, a2
+; CHECK-NEXT: add a6, a5, a2
+; CHECK-NEXT: add a7, a6, a2
+; CHECK-NEXT: vsetvli t0, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vsseg8e16.v v8, (a0)
+; CHECK-NEXT: add t0, a7, a2
+; CHECK-NEXT: add a2, t0, a2
+; CHECK-NEXT: vle16.v v11, (t0)
+; CHECK-NEXT: vle16.v v8, (a2)
+; CHECK-NEXT: srli a1, a1, 2
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: vle16.v v9, (a7)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v11, v8, a1
+; CHECK-NEXT: vsetvli a7, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v10, (a6)
+; CHECK-NEXT: vle16.v v8, (a5)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v10, v9, a1
+; CHECK-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v9, (a4)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a1
+; CHECK-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT: vle16.v v12, (a3)
+; CHECK-NEXT: vle16.v v8, (a0)
+; CHECK-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v12, a1
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16bf16_nxv2bf16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a2, a1, 1
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: add a4, a3, a2
+; ZVBB-NEXT: add a5, a4, a2
+; ZVBB-NEXT: add a6, a5, a2
+; ZVBB-NEXT: add a7, a6, a2
+; ZVBB-NEXT: vsetvli t0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vsseg8e16.v v8, (a0)
+; ZVBB-NEXT: add t0, a7, a2
+; ZVBB-NEXT: add a2, t0, a2
+; ZVBB-NEXT: vle16.v v11, (t0)
+; ZVBB-NEXT: vle16.v v8, (a2)
+; ZVBB-NEXT: srli a1, a1, 2
+; ZVBB-NEXT: add a2, a1, a1
+; ZVBB-NEXT: vle16.v v9, (a7)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v11, v8, a1
+; ZVBB-NEXT: vsetvli a7, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v10, (a6)
+; ZVBB-NEXT: vle16.v v8, (a5)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v10, v9, a1
+; ZVBB-NEXT: vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v9, (a4)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v9, v8, a1
+; ZVBB-NEXT: vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT: vle16.v v12, (a3)
+; ZVBB-NEXT: vle16.v v8, (a0)
+; ZVBB-NEXT: vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v12, a1
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 16 x bfloat> @llvm.vector.interleave8.nxv16bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5, <vscale x 2 x bfloat> %v6, <vscale x 2 x bfloat> %v7)
+ ret <vscale x 16 x bfloat> %res
+}
+
+define <vscale x 32 x bfloat> @vector_interleave_nxv32bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5, <vscale x 4 x bfloat> %v6, <vscale x 4 x bfloat> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv32bf16_nxv4bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: sub sp, sp, a0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: add a2, a0, a1
; CHECK-NEXT: add a3, a2, a1
-; CHECK-NEXT: vsetvli a4, zero, e64, m1, ta, ma
-; CHECK-NEXT: vsseg7e64.v v8, (a0)
-; CHECK-NEXT: vl1re64.v v10, (a3)
-; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: vl1re64.v v11, (a3)
-; CHECK-NEXT: add a3, a3, a1
-; CHECK-NEXT: vl1re64.v v8, (a0)
-; CHECK-NEXT: add a0, a3, a1
-; CHECK-NEXT: vl1re64.v v9, (a2)
-; CHECK-NEXT: vl1re64.v v12, (a3)
-; CHECK-NEXT: vl1re64.v v13, (a0)
-; CHECK-NEXT: add a0, a0, a1
-; CHECK-NEXT: vl1re64.v v14, (a0)
+; CHECK-NEXT: add a4, a3, a1
+; CHECK-NEXT: add a5, a4, a1
+; CHECK-NEXT: add a6, a5, a1
+; CHECK-NEXT: add a7, a6, a1
+; CHECK-NEXT: vsetvli t0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vsseg8e16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v14, (a7)
+; CHECK-NEXT: add a1, a7, a1
+; CHECK-NEXT: vl1re16.v v15, (a1)
+; CHECK-NEXT: vl1re16.v v12, (a5)
+; CHECK-NEXT: vl1re16.v v13, (a6)
+; CHECK-NEXT: vl1re16.v v10, (a3)
+; CHECK-NEXT: vl1re16.v v11, (a4)
+; CHECK-NEXT: vl1re16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v9, (a2)
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: slli a1, a0, 3
-; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: ret
;
-; ZVBB-LABEL: vector_interleave_nxv7f64_nxv1f64:
+; ZVBB-LABEL: vector_interleave_nxv32bf16_nxv4bf16:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 3
-; ZVBB-NEXT: sub a0, a1, a0
+; ZVBB-NEXT: slli a0, a0, 3
; ZVBB-NEXT: sub sp, sp, a0
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: add a2, a0, a1
; ZVBB-NEXT: add a3, a2, a1
-; ZVBB-NEXT: vsetvli a4, zero, e64, m1, ta, ma
-; ZVBB-NEXT: vsseg7e64.v v8, (a0)
-; ZVBB-NEXT: vl1re64.v v10, (a3)
-; ZVBB-NEXT: add a3, a3, a1
-; ZVBB-NEXT: vl1re64.v v11, (a3)
-; ZVBB-NEXT: add a3, a3, a1
-; ZVBB-NEXT: vl1re64.v v8, (a0)
-; ZVBB-NEXT: add a0, a3, a1
-; ZVBB-NEXT: vl1re64.v v9, (a2)
-; ZVBB-NEXT: vl1re64.v v12, (a3)
-; ZVBB-NEXT: vl1re64.v v13, (a0)
-; ZVBB-NEXT: add a0, a0, a1
-; ZVBB-NEXT: vl1re64.v v14, (a0)
+; ZVBB-NEXT: add a4, a3, a1
+; ZVBB-NEXT: add a5, a4, a1
+; ZVBB-NEXT: add a6, a5, a1
+; ZVBB-NEXT: add a7, a6, a1
+; ZVBB-NEXT: vsetvli t0, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vsseg8e16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v14, (a7)
+; ZVBB-NEXT: add a1, a7, a1
+; ZVBB-NEXT: vl1re16.v v15, (a1)
+; ZVBB-NEXT: vl1re16.v v12, (a5)
+; ZVBB-NEXT: vl1re16.v v13, (a6)
+; ZVBB-NEXT: vl1re16.v v10, (a3)
+; ZVBB-NEXT: vl1re16.v v11, (a4)
+; ZVBB-NEXT: vl1re16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v9, (a2)
; ZVBB-NEXT: csrr a0, vlenb
-; ZVBB-NEXT: slli a1, a0, 3
-; ZVBB-NEXT: sub a0, a1, a0
+; ZVBB-NEXT: slli a0, a0, 3
; ZVBB-NEXT: add sp, sp, a0
; ZVBB-NEXT: addi sp, sp, 16
; ZVBB-NEXT: ret
- %res = call <vscale x 7 x double> @llvm.vector.interleave7.nxv7f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5, <vscale x 1 x double> %v6)
- ret <vscale x 7 x double> %res
+ %res = call <vscale x 32 x bfloat> @llvm.vector.interleave8.nxv32bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5, <vscale x 4 x bfloat> %v6, <vscale x 4 x bfloat> %v7)
+ ret <vscale x 32 x bfloat> %res
}
-define <vscale x 14 x double> @vector_interleave_nxv14f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5, <vscale x 2 x double> %v6) nounwind {
-; RV32-LABEL: vector_interleave_nxv14f64_nxv2f64:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -80
-; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
-; RV32-NEXT: addi s0, sp, 80
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 5
-; RV32-NEXT: sub sp, sp, a0
-; RV32-NEXT: andi sp, sp, -64
-; RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT: vmv2r.v v26, v20
-; RV32-NEXT: addi a0, sp, 64
-; RV32-NEXT: vmv2r.v v24, v16
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a2, a1, 3
-; RV32-NEXT: sub a1, a2, a1
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 64
-; RV32-NEXT: vmv2r.v v22, v12
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: vmv2r.v v20, v8
-; RV32-NEXT: vmv1r.v v1, v20
-; RV32-NEXT: vmv1r.v v3, v22
-; RV32-NEXT: vmv1r.v v5, v24
-; RV32-NEXT: vmv1r.v v7, v26
-; RV32-NEXT: add a3, a0, a2
-; RV32-NEXT: vmv1r.v v2, v10
-; RV32-NEXT: add a4, a1, a2
-; RV32-NEXT: slli a5, a2, 2
-; RV32-NEXT: vmv1r.v v4, v14
-; RV32-NEXT: slli a6, a2, 4
-; RV32-NEXT: add a7, a4, a2
-; RV32-NEXT: vmv1r.v v6, v18
-; RV32-NEXT: sub a5, a6, a5
-; RV32-NEXT: vmv1r.v v22, v11
-; RV32-NEXT: add a6, a7, a2
-; RV32-NEXT: vmv1r.v v24, v15
-; RV32-NEXT: vsseg7e64.v v1, (a0)
-; RV32-NEXT: vmv1r.v v26, v19
-; RV32-NEXT: vsseg7e64.v v21, (a1)
-; RV32-NEXT: vl1re64.v v18, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re64.v v19, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re64.v v20, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re64.v v21, (a6)
-; RV32-NEXT: add a6, a3, a2
-; RV32-NEXT: vl1re64.v v10, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re64.v v11, (a6)
-; RV32-NEXT: vl1re64.v v8, (a0)
-; RV32-NEXT: vl1re64.v v16, (a4)
-; RV32-NEXT: vl1re64.v v9, (a3)
-; RV32-NEXT: vl1re64.v v17, (a7)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: li a3, 14
-; RV32-NEXT: mul a0, a0, a3
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 64
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re64.v v12, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: vl1re64.v v13, (a6)
-; RV32-NEXT: add a6, a6, a2
-; RV32-NEXT: slli a2, a2, 3
-; RV32-NEXT: add a2, a0, a2
-; RV32-NEXT: vl1re64.v v14, (a6)
-; RV32-NEXT: vl1re64.v v15, (a1)
-; RV32-NEXT: add a5, a0, a5
-; RV32-NEXT: vs2r.v v20, (a5)
-; RV32-NEXT: vs4r.v v16, (a2)
-; RV32-NEXT: vs8r.v v8, (a0)
-; RV32-NEXT: vl8re64.v v16, (a2)
-; RV32-NEXT: vl8re64.v v8, (a0)
-; RV32-NEXT: addi sp, s0, -80
-; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 80
-; RV32-NEXT: ret
+define <vscale x 64 x bfloat> @vector_interleave_nxv64bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5, <vscale x 8 x bfloat> %v6, <vscale x 8 x bfloat> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv64bf16_nxv8bf16:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v22
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv2r.v v26, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: add a2, a0, a3
+; CHECK-NEXT: add a4, a1, a3
+; CHECK-NEXT: add a5, a2, a3
+; CHECK-NEXT: vmv1r.v v1, v8
+; CHECK-NEXT: vmv2r.v v24, v14
+; CHECK-NEXT: add a6, a4, a3
+; CHECK-NEXT: vmv2r.v v22, v10
+; CHECK-NEXT: vmv1r.v v2, v22
+; CHECK-NEXT: add a7, a5, a3
+; CHECK-NEXT: vmv1r.v v3, v12
+; CHECK-NEXT: add t0, a6, a3
+; CHECK-NEXT: vmv1r.v v4, v24
+; CHECK-NEXT: add t1, a7, a3
+; CHECK-NEXT: vmv1r.v v5, v16
+; CHECK-NEXT: add t2, t0, a3
+; CHECK-NEXT: vmv1r.v v6, v26
+; CHECK-NEXT: add t3, t1, a3
+; CHECK-NEXT: vmv1r.v v7, v20
+; CHECK-NEXT: add t4, t2, a3
+; CHECK-NEXT: vmv1r.v v8, v28
+; CHECK-NEXT: vmv1r.v v22, v9
+; CHECK-NEXT: add t5, t3, a3
+; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: add t6, t4, a3
+; CHECK-NEXT: vmv1r.v v26, v17
+; CHECK-NEXT: vsseg8e16.v v1, (a0)
+; CHECK-NEXT: vmv1r.v v28, v21
+; CHECK-NEXT: vsseg8e16.v v22, (a1)
+; CHECK-NEXT: vl1re16.v v14, (t5)
+; CHECK-NEXT: add t5, t5, a3
+; CHECK-NEXT: add a3, t6, a3
+; CHECK-NEXT: vl1re16.v v22, (t6)
+; CHECK-NEXT: vl1re16.v v15, (t5)
+; CHECK-NEXT: vl1re16.v v23, (a3)
+; CHECK-NEXT: vl1re16.v v12, (t1)
+; CHECK-NEXT: vl1re16.v v20, (t2)
+; CHECK-NEXT: vl1re16.v v13, (t3)
+; CHECK-NEXT: vl1re16.v v21, (t4)
+; CHECK-NEXT: vl1re16.v v10, (a5)
+; CHECK-NEXT: vl1re16.v v18, (a6)
+; CHECK-NEXT: vl1re16.v v11, (a7)
+; CHECK-NEXT: vl1re16.v v19, (t0)
+; CHECK-NEXT: vl1re16.v v8, (a0)
+; CHECK-NEXT: vl1re16.v v16, (a1)
+; CHECK-NEXT: vl1re16.v v9, (a2)
+; CHECK-NEXT: vl1re16.v v17, (a4)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64bf16_nxv8bf16:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-NEXT: vmv2r.v v28, v22
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: vmv2r.v v26, v18
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 3
+; ZVBB-NEXT: add a1, sp, a1
+; ZVBB-NEXT: addi a1, a1, 16
+; ZVBB-NEXT: csrr a3, vlenb
+; ZVBB-NEXT: add a2, a0, a3
+; ZVBB-NEXT: add a4, a1, a3
+; ZVBB-NEXT: add a5, a2, a3
+; ZVBB-NEXT: vmv1r.v v1, v8
+; ZVBB-NEXT: vmv2r.v v24, v14
+; ZVBB-NEXT: add a6, a4, a3
+; ZVBB-NEXT: vmv2r.v v22, v10
+; ZVBB-NEXT: vmv1r.v v2, v22
+; ZVBB-NEXT: add a7, a5, a3
+; ZVBB-NEXT: vmv1r.v v3, v12
+; ZVBB-NEXT: add t0, a6, a3
+; ZVBB-NEXT: vmv1r.v v4, v24
+; ZVBB-NEXT: add t1, a7, a3
+; ZVBB-NEXT: vmv1r.v v5, v16
+; ZVBB-NEXT: add t2, t0, a3
+; ZVBB-NEXT: vmv1r.v v6, v26
+; ZVBB-NEXT: add t3, t1, a3
+; ZVBB-NEXT: vmv1r.v v7, v20
+; ZVBB-NEXT: add t4, t2, a3
+; ZVBB-NEXT: vmv1r.v v8, v28
+; ZVBB-NEXT: vmv1r.v v22, v9
+; ZVBB-NEXT: add t5, t3, a3
+; ZVBB-NEXT: vmv1r.v v24, v13
+; ZVBB-NEXT: add t6, t4, a3
+; ZVBB-NEXT: vmv1r.v v26, v17
+; ZVBB-NEXT: vsseg8e16.v v1, (a0)
+; ZVBB-NEXT: vmv1r.v v28, v21
+; ZVBB-NEXT: vsseg8e16.v v22, (a1)
+; ZVBB-NEXT: vl1re16.v v14, (t5)
+; ZVBB-NEXT: add t5, t5, a3
+; ZVBB-NEXT: add a3, t6, a3
+; ZVBB-NEXT: vl1re16.v v22, (t6)
+; ZVBB-NEXT: vl1re16.v v15, (t5)
+; ZVBB-NEXT: vl1re16.v v23, (a3)
+; ZVBB-NEXT: vl1re16.v v12, (t1)
+; ZVBB-NEXT: vl1re16.v v20, (t2)
+; ZVBB-NEXT: vl1re16.v v13, (t3)
+; ZVBB-NEXT: vl1re16.v v21, (t4)
+; ZVBB-NEXT: vl1re16.v v10, (a5)
+; ZVBB-NEXT: vl1re16.v v18, (a6)
+; ZVBB-NEXT: vl1re16.v v11, (a7)
+; ZVBB-NEXT: vl1re16.v v19, (t0)
+; ZVBB-NEXT: vl1re16.v v8, (a0)
+; ZVBB-NEXT: vl1re16.v v16, (a1)
+; ZVBB-NEXT: vl1re16.v v9, (a2)
+; ZVBB-NEXT: vl1re16.v v17, (a4)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 64 x bfloat> @llvm.vector.interleave8.nxv64bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5, <vscale x 8 x bfloat> %v6, <vscale x 8 x bfloat> %v7)
+ ret <vscale x 64 x bfloat> %res
+}
+
+define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5, <vscale x 1 x float> %v6, <vscale x 1 x float> %v8) nounwind {
+; CHECK-LABEL: vector_interleave_nxv8f32_nxv1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: srli a2, a1, 1
+; CHECK-NEXT: add a3, a0, a2
+; CHECK-NEXT: add a4, a3, a2
+; CHECK-NEXT: add a5, a4, a2
+; CHECK-NEXT: add a6, a5, a2
+; CHECK-NEXT: add a7, a6, a2
+; CHECK-NEXT: vsetvli t0, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vsseg8e32.v v8, (a0)
+; CHECK-NEXT: add t0, a7, a2
+; CHECK-NEXT: add a2, t0, a2
+; CHECK-NEXT: vle32.v v11, (t0)
+; CHECK-NEXT: vle32.v v8, (a2)
+; CHECK-NEXT: srli a1, a1, 3
+; CHECK-NEXT: add a2, a1, a1
+; CHECK-NEXT: vle32.v v9, (a7)
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v11, v8, a1
+; CHECK-NEXT: vsetvli a7, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vle32.v v10, (a6)
+; CHECK-NEXT: vle32.v v8, (a5)
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v10, v9, a1
+; CHECK-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vle32.v v9, (a4)
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a1
+; CHECK-NEXT: vsetvli a4, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vle32.v v12, (a3)
+; CHECK-NEXT: vle32.v v8, (a0)
+; CHECK-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v12, a1
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 2
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; RV64-LABEL: vector_interleave_nxv14f64_nxv2f64:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -80
-; RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT: addi s0, sp, 80
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 5
-; RV64-NEXT: sub sp, sp, a0
-; RV64-NEXT: andi sp, sp, -64
-; RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT: vmv2r.v v26, v20
-; RV64-NEXT: addi a0, sp, 64
-; RV64-NEXT: vmv2r.v v24, v16
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a2, a1, 3
-; RV64-NEXT: sub a1, a2, a1
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 64
-; RV64-NEXT: vmv2r.v v22, v12
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: vmv2r.v v20, v8
-; RV64-NEXT: vmv1r.v v1, v20
-; RV64-NEXT: vmv1r.v v3, v22
-; RV64-NEXT: vmv1r.v v5, v24
-; RV64-NEXT: vmv1r.v v7, v26
-; RV64-NEXT: add a3, a0, a2
-; RV64-NEXT: vmv1r.v v2, v10
-; RV64-NEXT: add a4, a1, a2
-; RV64-NEXT: slli a5, a2, 2
-; RV64-NEXT: vmv1r.v v4, v14
-; RV64-NEXT: slli a6, a2, 4
-; RV64-NEXT: add a7, a4, a2
-; RV64-NEXT: vmv1r.v v6, v18
-; RV64-NEXT: sub a5, a6, a5
-; RV64-NEXT: vmv1r.v v22, v11
-; RV64-NEXT: add a6, a7, a2
-; RV64-NEXT: vmv1r.v v24, v15
-; RV64-NEXT: vsseg7e64.v v1, (a0)
-; RV64-NEXT: vmv1r.v v26, v19
-; RV64-NEXT: vsseg7e64.v v21, (a1)
-; RV64-NEXT: vl1re64.v v18, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re64.v v19, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re64.v v20, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re64.v v21, (a6)
-; RV64-NEXT: add a6, a3, a2
-; RV64-NEXT: vl1re64.v v10, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re64.v v11, (a6)
-; RV64-NEXT: vl1re64.v v8, (a0)
-; RV64-NEXT: vl1re64.v v16, (a4)
-; RV64-NEXT: vl1re64.v v9, (a3)
-; RV64-NEXT: vl1re64.v v17, (a7)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: li a3, 14
-; RV64-NEXT: mul a0, a0, a3
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 64
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re64.v v12, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: vl1re64.v v13, (a6)
-; RV64-NEXT: add a6, a6, a2
-; RV64-NEXT: slli a2, a2, 3
-; RV64-NEXT: add a2, a0, a2
-; RV64-NEXT: vl1re64.v v14, (a6)
-; RV64-NEXT: vl1re64.v v15, (a1)
-; RV64-NEXT: add a5, a0, a5
-; RV64-NEXT: vs2r.v v20, (a5)
-; RV64-NEXT: vs4r.v v16, (a2)
-; RV64-NEXT: vs8r.v v8, (a0)
-; RV64-NEXT: vl8re64.v v16, (a2)
-; RV64-NEXT: vl8re64.v v8, (a0)
-; RV64-NEXT: addi sp, s0, -80
-; RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 80
-; RV64-NEXT: ret
+; ZVBB-LABEL: vector_interleave_nxv8f32_nxv1f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: srli a2, a1, 1
+; ZVBB-NEXT: add a3, a0, a2
+; ZVBB-NEXT: add a4, a3, a2
+; ZVBB-NEXT: add a5, a4, a2
+; ZVBB-NEXT: add a6, a5, a2
+; ZVBB-NEXT: add a7, a6, a2
+; ZVBB-NEXT: vsetvli t0, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vsseg8e32.v v8, (a0)
+; ZVBB-NEXT: add t0, a7, a2
+; ZVBB-NEXT: add a2, t0, a2
+; ZVBB-NEXT: vle32.v v11, (t0)
+; ZVBB-NEXT: vle32.v v8, (a2)
+; ZVBB-NEXT: srli a1, a1, 3
+; ZVBB-NEXT: add a2, a1, a1
+; ZVBB-NEXT: vle32.v v9, (a7)
+; ZVBB-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v11, v8, a1
+; ZVBB-NEXT: vsetvli a7, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vle32.v v10, (a6)
+; ZVBB-NEXT: vle32.v v8, (a5)
+; ZVBB-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v10, v9, a1
+; ZVBB-NEXT: vsetvli a5, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vle32.v v9, (a4)
+; ZVBB-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v9, v8, a1
+; ZVBB-NEXT: vsetvli a4, zero, e32, mf2, ta, ma
+; ZVBB-NEXT: vle32.v v12, (a3)
+; ZVBB-NEXT: vle32.v v8, (a0)
+; ZVBB-NEXT: vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT: vslideup.vx v8, v12, a1
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 2
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 8 x float> @llvm.vector.interleave8.nxv8f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5, <vscale x 1 x float> %v6, <vscale x 1 x float> %v8)
+ ret <vscale x 8 x float> %res
+}
+
+define <vscale x 16 x float> @vector_interleave_nxv16f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5, <vscale x 2 x float> %v6, <vscale x 2 x float> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv16f32_nxv2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: add a4, a3, a1
+; CHECK-NEXT: add a5, a4, a1
+; CHECK-NEXT: add a6, a5, a1
+; CHECK-NEXT: add a7, a6, a1
+; CHECK-NEXT: vsetvli t0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vsseg8e32.v v8, (a0)
+; CHECK-NEXT: vl1re32.v v14, (a7)
+; CHECK-NEXT: add a1, a7, a1
+; CHECK-NEXT: vl1re32.v v15, (a1)
+; CHECK-NEXT: vl1re32.v v12, (a5)
+; CHECK-NEXT: vl1re32.v v13, (a6)
+; CHECK-NEXT: vl1re32.v v10, (a3)
+; CHECK-NEXT: vl1re32.v v11, (a4)
+; CHECK-NEXT: vl1re32.v v8, (a0)
+; CHECK-NEXT: vl1re32.v v9, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; ZVBB-RV32-LABEL: vector_interleave_nxv14f64_nxv2f64:
-; ZVBB-RV32: # %bb.0:
-; ZVBB-RV32-NEXT: addi sp, sp, -80
-; ZVBB-RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
-; ZVBB-RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
-; ZVBB-RV32-NEXT: addi s0, sp, 80
-; ZVBB-RV32-NEXT: csrr a0, vlenb
-; ZVBB-RV32-NEXT: slli a0, a0, 5
-; ZVBB-RV32-NEXT: sub sp, sp, a0
-; ZVBB-RV32-NEXT: andi sp, sp, -64
-; ZVBB-RV32-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; ZVBB-RV32-NEXT: vmv2r.v v26, v20
-; ZVBB-RV32-NEXT: addi a0, sp, 64
-; ZVBB-RV32-NEXT: vmv2r.v v24, v16
-; ZVBB-RV32-NEXT: csrr a1, vlenb
-; ZVBB-RV32-NEXT: slli a2, a1, 3
-; ZVBB-RV32-NEXT: sub a1, a2, a1
-; ZVBB-RV32-NEXT: add a1, sp, a1
-; ZVBB-RV32-NEXT: addi a1, a1, 64
-; ZVBB-RV32-NEXT: vmv2r.v v22, v12
-; ZVBB-RV32-NEXT: csrr a2, vlenb
-; ZVBB-RV32-NEXT: vmv2r.v v20, v8
-; ZVBB-RV32-NEXT: vmv1r.v v1, v20
-; ZVBB-RV32-NEXT: vmv1r.v v3, v22
-; ZVBB-RV32-NEXT: vmv1r.v v5, v24
-; ZVBB-RV32-NEXT: vmv1r.v v7, v26
-; ZVBB-RV32-NEXT: add a3, a0, a2
-; ZVBB-RV32-NEXT: vmv1r.v v2, v10
-; ZVBB-RV32-NEXT: add a4, a1, a2
-; ZVBB-RV32-NEXT: slli a5, a2, 2
-; ZVBB-RV32-NEXT: vmv1r.v v4, v14
-; ZVBB-RV32-NEXT: slli a6, a2, 4
-; ZVBB-RV32-NEXT: add a7, a4, a2
-; ZVBB-RV32-NEXT: vmv1r.v v6, v18
-; ZVBB-RV32-NEXT: sub a5, a6, a5
-; ZVBB-RV32-NEXT: vmv1r.v v22, v11
-; ZVBB-RV32-NEXT: add a6, a7, a2
-; ZVBB-RV32-NEXT: vmv1r.v v24, v15
-; ZVBB-RV32-NEXT: vsseg7e64.v v1, (a0)
-; ZVBB-RV32-NEXT: vmv1r.v v26, v19
-; ZVBB-RV32-NEXT: vsseg7e64.v v21, (a1)
-; ZVBB-RV32-NEXT: vl1re64.v v18, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re64.v v19, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re64.v v20, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re64.v v21, (a6)
-; ZVBB-RV32-NEXT: add a6, a3, a2
-; ZVBB-RV32-NEXT: vl1re64.v v10, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re64.v v11, (a6)
-; ZVBB-RV32-NEXT: vl1re64.v v8, (a0)
-; ZVBB-RV32-NEXT: vl1re64.v v16, (a4)
-; ZVBB-RV32-NEXT: vl1re64.v v9, (a3)
-; ZVBB-RV32-NEXT: vl1re64.v v17, (a7)
-; ZVBB-RV32-NEXT: csrr a0, vlenb
-; ZVBB-RV32-NEXT: li a3, 14
-; ZVBB-RV32-NEXT: mul a0, a0, a3
-; ZVBB-RV32-NEXT: add a0, sp, a0
-; ZVBB-RV32-NEXT: addi a0, a0, 64
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re64.v v12, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: vl1re64.v v13, (a6)
-; ZVBB-RV32-NEXT: add a6, a6, a2
-; ZVBB-RV32-NEXT: slli a2, a2, 3
-; ZVBB-RV32-NEXT: add a2, a0, a2
-; ZVBB-RV32-NEXT: vl1re64.v v14, (a6)
-; ZVBB-RV32-NEXT: vl1re64.v v15, (a1)
-; ZVBB-RV32-NEXT: add a5, a0, a5
-; ZVBB-RV32-NEXT: vs2r.v v20, (a5)
-; ZVBB-RV32-NEXT: vs4r.v v16, (a2)
-; ZVBB-RV32-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT: vl8re64.v v16, (a2)
-; ZVBB-RV32-NEXT: vl8re64.v v8, (a0)
-; ZVBB-RV32-NEXT: addi sp, s0, -80
-; ZVBB-RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
-; ZVBB-RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
-; ZVBB-RV32-NEXT: addi sp, sp, 80
-; ZVBB-RV32-NEXT: ret
+; ZVBB-LABEL: vector_interleave_nxv16f32_nxv2f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: add a4, a3, a1
+; ZVBB-NEXT: add a5, a4, a1
+; ZVBB-NEXT: add a6, a5, a1
+; ZVBB-NEXT: add a7, a6, a1
+; ZVBB-NEXT: vsetvli t0, zero, e32, m1, ta, ma
+; ZVBB-NEXT: vsseg8e32.v v8, (a0)
+; ZVBB-NEXT: vl1re32.v v14, (a7)
+; ZVBB-NEXT: add a1, a7, a1
+; ZVBB-NEXT: vl1re32.v v15, (a1)
+; ZVBB-NEXT: vl1re32.v v12, (a5)
+; ZVBB-NEXT: vl1re32.v v13, (a6)
+; ZVBB-NEXT: vl1re32.v v10, (a3)
+; ZVBB-NEXT: vl1re32.v v11, (a4)
+; ZVBB-NEXT: vl1re32.v v8, (a0)
+; ZVBB-NEXT: vl1re32.v v9, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 16 x float> @llvm.vector.interleave8.nxv16f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5, <vscale x 2 x float> %v6, <vscale x 2 x float> %v7)
+ ret <vscale x 16 x float> %res
+}
+
+define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5, <vscale x 4 x float> %v6, <vscale x 4 x float> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv32f32_nxv4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v22
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv2r.v v26, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: add a2, a0, a3
+; CHECK-NEXT: add a4, a1, a3
+; CHECK-NEXT: add a5, a2, a3
+; CHECK-NEXT: vmv1r.v v1, v8
+; CHECK-NEXT: vmv2r.v v24, v14
+; CHECK-NEXT: add a6, a4, a3
+; CHECK-NEXT: vmv2r.v v22, v10
+; CHECK-NEXT: vmv1r.v v2, v22
+; CHECK-NEXT: add a7, a5, a3
+; CHECK-NEXT: vmv1r.v v3, v12
+; CHECK-NEXT: add t0, a6, a3
+; CHECK-NEXT: vmv1r.v v4, v24
+; CHECK-NEXT: add t1, a7, a3
+; CHECK-NEXT: vmv1r.v v5, v16
+; CHECK-NEXT: add t2, t0, a3
+; CHECK-NEXT: vmv1r.v v6, v26
+; CHECK-NEXT: add t3, t1, a3
+; CHECK-NEXT: vmv1r.v v7, v20
+; CHECK-NEXT: add t4, t2, a3
+; CHECK-NEXT: vmv1r.v v8, v28
+; CHECK-NEXT: vmv1r.v v22, v9
+; CHECK-NEXT: add t5, t3, a3
+; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: add t6, t4, a3
+; CHECK-NEXT: vmv1r.v v26, v17
+; CHECK-NEXT: vsseg8e32.v v1, (a0)
+; CHECK-NEXT: vmv1r.v v28, v21
+; CHECK-NEXT: vsseg8e32.v v22, (a1)
+; CHECK-NEXT: vl1re32.v v14, (t5)
+; CHECK-NEXT: add t5, t5, a3
+; CHECK-NEXT: add a3, t6, a3
+; CHECK-NEXT: vl1re32.v v22, (t6)
+; CHECK-NEXT: vl1re32.v v15, (t5)
+; CHECK-NEXT: vl1re32.v v23, (a3)
+; CHECK-NEXT: vl1re32.v v12, (t1)
+; CHECK-NEXT: vl1re32.v v20, (t2)
+; CHECK-NEXT: vl1re32.v v13, (t3)
+; CHECK-NEXT: vl1re32.v v21, (t4)
+; CHECK-NEXT: vl1re32.v v10, (a5)
+; CHECK-NEXT: vl1re32.v v18, (a6)
+; CHECK-NEXT: vl1re32.v v11, (a7)
+; CHECK-NEXT: vl1re32.v v19, (t0)
+; CHECK-NEXT: vl1re32.v v8, (a0)
+; CHECK-NEXT: vl1re32.v v16, (a1)
+; CHECK-NEXT: vl1re32.v v9, (a2)
+; CHECK-NEXT: vl1re32.v v17, (a4)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; ZVBB-RV64-LABEL: vector_interleave_nxv14f64_nxv2f64:
-; ZVBB-RV64: # %bb.0:
-; ZVBB-RV64-NEXT: addi sp, sp, -80
-; ZVBB-RV64-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
-; ZVBB-RV64-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
-; ZVBB-RV64-NEXT: addi s0, sp, 80
-; ZVBB-RV64-NEXT: csrr a0, vlenb
-; ZVBB-RV64-NEXT: slli a0, a0, 5
-; ZVBB-RV64-NEXT: sub sp, sp, a0
-; ZVBB-RV64-NEXT: andi sp, sp, -64
-; ZVBB-RV64-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; ZVBB-RV64-NEXT: vmv2r.v v26, v20
-; ZVBB-RV64-NEXT: addi a0, sp, 64
-; ZVBB-RV64-NEXT: vmv2r.v v24, v16
-; ZVBB-RV64-NEXT: csrr a1, vlenb
-; ZVBB-RV64-NEXT: slli a2, a1, 3
-; ZVBB-RV64-NEXT: sub a1, a2, a1
-; ZVBB-RV64-NEXT: add a1, sp, a1
-; ZVBB-RV64-NEXT: addi a1, a1, 64
-; ZVBB-RV64-NEXT: vmv2r.v v22, v12
-; ZVBB-RV64-NEXT: csrr a2, vlenb
-; ZVBB-RV64-NEXT: vmv2r.v v20, v8
-; ZVBB-RV64-NEXT: vmv1r.v v1, v20
-; ZVBB-RV64-NEXT: vmv1r.v v3, v22
-; ZVBB-RV64-NEXT: vmv1r.v v5, v24
-; ZVBB-RV64-NEXT: vmv1r.v v7, v26
-; ZVBB-RV64-NEXT: add a3, a0, a2
-; ZVBB-RV64-NEXT: vmv1r.v v2, v10
-; ZVBB-RV64-NEXT: add a4, a1, a2
-; ZVBB-RV64-NEXT: slli a5, a2, 2
-; ZVBB-RV64-NEXT: vmv1r.v v4, v14
-; ZVBB-RV64-NEXT: slli a6, a2, 4
-; ZVBB-RV64-NEXT: add a7, a4, a2
-; ZVBB-RV64-NEXT: vmv1r.v v6, v18
-; ZVBB-RV64-NEXT: sub a5, a6, a5
-; ZVBB-RV64-NEXT: vmv1r.v v22, v11
-; ZVBB-RV64-NEXT: add a6, a7, a2
-; ZVBB-RV64-NEXT: vmv1r.v v24, v15
-; ZVBB-RV64-NEXT: vsseg7e64.v v1, (a0)
-; ZVBB-RV64-NEXT: vmv1r.v v26, v19
-; ZVBB-RV64-NEXT: vsseg7e64.v v21, (a1)
-; ZVBB-RV64-NEXT: vl1re64.v v18, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re64.v v19, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re64.v v20, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re64.v v21, (a6)
-; ZVBB-RV64-NEXT: add a6, a3, a2
-; ZVBB-RV64-NEXT: vl1re64.v v10, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re64.v v11, (a6)
-; ZVBB-RV64-NEXT: vl1re64.v v8, (a0)
-; ZVBB-RV64-NEXT: vl1re64.v v16, (a4)
-; ZVBB-RV64-NEXT: vl1re64.v v9, (a3)
-; ZVBB-RV64-NEXT: vl1re64.v v17, (a7)
-; ZVBB-RV64-NEXT: csrr a0, vlenb
-; ZVBB-RV64-NEXT: li a3, 14
-; ZVBB-RV64-NEXT: mul a0, a0, a3
-; ZVBB-RV64-NEXT: add a0, sp, a0
-; ZVBB-RV64-NEXT: addi a0, a0, 64
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re64.v v12, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: vl1re64.v v13, (a6)
-; ZVBB-RV64-NEXT: add a6, a6, a2
-; ZVBB-RV64-NEXT: slli a2, a2, 3
-; ZVBB-RV64-NEXT: add a2, a0, a2
-; ZVBB-RV64-NEXT: vl1re64.v v14, (a6)
-; ZVBB-RV64-NEXT: vl1re64.v v15, (a1)
-; ZVBB-RV64-NEXT: add a5, a0, a5
-; ZVBB-RV64-NEXT: vs2r.v v20, (a5)
-; ZVBB-RV64-NEXT: vs4r.v v16, (a2)
-; ZVBB-RV64-NEXT: vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT: vl8re64.v v16, (a2)
-; ZVBB-RV64-NEXT: vl8re64.v v8, (a0)
-; ZVBB-RV64-NEXT: addi sp, s0, -80
-; ZVBB-RV64-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
-; ZVBB-RV64-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
-; ZVBB-RV64-NEXT: addi sp, sp, 80
-; ZVBB-RV64-NEXT: ret
+; ZVBB-LABEL: vector_interleave_nxv32f32_nxv4f32:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-NEXT: vmv2r.v v28, v22
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: vmv2r.v v26, v18
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 3
+; ZVBB-NEXT: add a1, sp, a1
+; ZVBB-NEXT: addi a1, a1, 16
+; ZVBB-NEXT: csrr a3, vlenb
+; ZVBB-NEXT: add a2, a0, a3
+; ZVBB-NEXT: add a4, a1, a3
+; ZVBB-NEXT: add a5, a2, a3
+; ZVBB-NEXT: vmv1r.v v1, v8
+; ZVBB-NEXT: vmv2r.v v24, v14
+; ZVBB-NEXT: add a6, a4, a3
+; ZVBB-NEXT: vmv2r.v v22, v10
+; ZVBB-NEXT: vmv1r.v v2, v22
+; ZVBB-NEXT: add a7, a5, a3
+; ZVBB-NEXT: vmv1r.v v3, v12
+; ZVBB-NEXT: add t0, a6, a3
+; ZVBB-NEXT: vmv1r.v v4, v24
+; ZVBB-NEXT: add t1, a7, a3
+; ZVBB-NEXT: vmv1r.v v5, v16
+; ZVBB-NEXT: add t2, t0, a3
+; ZVBB-NEXT: vmv1r.v v6, v26
+; ZVBB-NEXT: add t3, t1, a3
+; ZVBB-NEXT: vmv1r.v v7, v20
+; ZVBB-NEXT: add t4, t2, a3
+; ZVBB-NEXT: vmv1r.v v8, v28
+; ZVBB-NEXT: vmv1r.v v22, v9
+; ZVBB-NEXT: add t5, t3, a3
+; ZVBB-NEXT: vmv1r.v v24, v13
+; ZVBB-NEXT: add t6, t4, a3
+; ZVBB-NEXT: vmv1r.v v26, v17
+; ZVBB-NEXT: vsseg8e32.v v1, (a0)
+; ZVBB-NEXT: vmv1r.v v28, v21
+; ZVBB-NEXT: vsseg8e32.v v22, (a1)
+; ZVBB-NEXT: vl1re32.v v14, (t5)
+; ZVBB-NEXT: add t5, t5, a3
+; ZVBB-NEXT: add a3, t6, a3
+; ZVBB-NEXT: vl1re32.v v22, (t6)
+; ZVBB-NEXT: vl1re32.v v15, (t5)
+; ZVBB-NEXT: vl1re32.v v23, (a3)
+; ZVBB-NEXT: vl1re32.v v12, (t1)
+; ZVBB-NEXT: vl1re32.v v20, (t2)
+; ZVBB-NEXT: vl1re32.v v13, (t3)
+; ZVBB-NEXT: vl1re32.v v21, (t4)
+; ZVBB-NEXT: vl1re32.v v10, (a5)
+; ZVBB-NEXT: vl1re32.v v18, (a6)
+; ZVBB-NEXT: vl1re32.v v11, (a7)
+; ZVBB-NEXT: vl1re32.v v19, (t0)
+; ZVBB-NEXT: vl1re32.v v8, (a0)
+; ZVBB-NEXT: vl1re32.v v16, (a1)
+; ZVBB-NEXT: vl1re32.v v9, (a2)
+; ZVBB-NEXT: vl1re32.v v17, (a4)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 32 x float> @llvm.vector.interleave8.nxv32f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5, <vscale x 4 x float> %v6, <vscale x 4 x float> %v7)
+ ret <vscale x 32 x float> %res
+}
+
+define <vscale x 8 x double> @vector_interleave_nxv8f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5, <vscale x 1 x double> %v6, <vscale x 1 x double> %v8) nounwind {
+; CHECK-LABEL: vector_interleave_nxv8f64_nxv1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: add a2, a0, a1
+; CHECK-NEXT: add a3, a2, a1
+; CHECK-NEXT: add a4, a3, a1
+; CHECK-NEXT: add a5, a4, a1
+; CHECK-NEXT: add a6, a5, a1
+; CHECK-NEXT: add a7, a6, a1
+; CHECK-NEXT: vsetvli t0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vsseg8e64.v v8, (a0)
+; CHECK-NEXT: vl1re64.v v14, (a7)
+; CHECK-NEXT: add a1, a7, a1
+; CHECK-NEXT: vl1re64.v v15, (a1)
+; CHECK-NEXT: vl1re64.v v12, (a5)
+; CHECK-NEXT: vl1re64.v v13, (a6)
+; CHECK-NEXT: vl1re64.v v10, (a3)
+; CHECK-NEXT: vl1re64.v v11, (a4)
+; CHECK-NEXT: vl1re64.v v8, (a0)
+; CHECK-NEXT: vl1re64.v v9, (a2)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 3
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
;
-; ZIP-LABEL: vector_interleave_nxv14f64_nxv2f64:
-; ZIP: # %bb.0:
-; ZIP-NEXT: addi sp, sp, -80
-; ZIP-NEXT: sd ra, 72(sp) # 8-byte Folded Spill
-; ZIP-NEXT: sd s0, 64(sp) # 8-byte Folded Spill
-; ZIP-NEXT: addi s0, sp, 80
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: slli a0, a0, 5
-; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: andi sp, sp, -64
-; ZIP-NEXT: vsetvli a0, zero, e64, m1, ta, ma
-; ZIP-NEXT: vmv2r.v v26, v20
-; ZIP-NEXT: addi a0, sp, 64
-; ZIP-NEXT: vmv2r.v v24, v16
-; ZIP-NEXT: csrr a1, vlenb
-; ZIP-NEXT: slli a2, a1, 3
-; ZIP-NEXT: sub a1, a2, a1
-; ZIP-NEXT: add a1, sp, a1
-; ZIP-NEXT: addi a1, a1, 64
-; ZIP-NEXT: vmv2r.v v22, v12
-; ZIP-NEXT: csrr a2, vlenb
-; ZIP-NEXT: vmv2r.v v20, v8
-; ZIP-NEXT: vmv1r.v v1, v20
-; ZIP-NEXT: vmv1r.v v3, v22
-; ZIP-NEXT: vmv1r.v v5, v24
-; ZIP-NEXT: vmv1r.v v7, v26
-; ZIP-NEXT: add a3, a0, a2
-; ZIP-NEXT: vmv1r.v v2, v10
-; ZIP-NEXT: add a4, a1, a2
-; ZIP-NEXT: slli a5, a2, 2
-; ZIP-NEXT: vmv1r.v v4, v14
-; ZIP-NEXT: slli a6, a2, 4
-; ZIP-NEXT: add a7, a4, a2
-; ZIP-NEXT: vmv1r.v v6, v18
-; ZIP-NEXT: sub a5, a6, a5
-; ZIP-NEXT: vmv1r.v v22, v11
-; ZIP-NEXT: add a6, a7, a2
-; ZIP-NEXT: vmv1r.v v24, v15
-; ZIP-NEXT: vsseg7e64.v v1, (a0)
-; ZIP-NEXT: vmv1r.v v26, v19
-; ZIP-NEXT: vsseg7e64.v v21, (a1)
-; ZIP-NEXT: vl1re64.v v18, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re64.v v19, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re64.v v20, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re64.v v21, (a6)
-; ZIP-NEXT: add a6, a3, a2
-; ZIP-NEXT: vl1re64.v v10, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re64.v v11, (a6)
-; ZIP-NEXT: vl1re64.v v8, (a0)
-; ZIP-NEXT: vl1re64.v v16, (a4)
-; ZIP-NEXT: vl1re64.v v9, (a3)
-; ZIP-NEXT: vl1re64.v v17, (a7)
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: li a3, 14
-; ZIP-NEXT: mul a0, a0, a3
-; ZIP-NEXT: add a0, sp, a0
-; ZIP-NEXT: addi a0, a0, 64
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re64.v v12, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: vl1re64.v v13, (a6)
-; ZIP-NEXT: add a6, a6, a2
-; ZIP-NEXT: slli a2, a2, 3
-; ZIP-NEXT: add a2, a0, a2
-; ZIP-NEXT: vl1re64.v v14, (a6)
-; ZIP-NEXT: vl1re64.v v15, (a1)
-; ZIP-NEXT: add a5, a0, a5
-; ZIP-NEXT: vs2r.v v20, (a5)
-; ZIP-NEXT: vs4r.v v16, (a2)
-; ZIP-NEXT: vs8r.v v8, (a0)
-; ZIP-NEXT: vl8re64.v v16, (a2)
-; ZIP-NEXT: vl8re64.v v8, (a0)
-; ZIP-NEXT: addi sp, s0, -80
-; ZIP-NEXT: ld ra, 72(sp) # 8-byte Folded Reload
-; ZIP-NEXT: ld s0, 64(sp) # 8-byte Folded Reload
-; ZIP-NEXT: addi sp, sp, 80
-; ZIP-NEXT: ret
- %res = call <vscale x 14 x double> @llvm.vector.interleave7.nxv14f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5, <vscale x 2 x double> %v6)
- ret <vscale x 14 x double> %res
+; ZVBB-LABEL: vector_interleave_nxv8f64_nxv1f64:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: add a2, a0, a1
+; ZVBB-NEXT: add a3, a2, a1
+; ZVBB-NEXT: add a4, a3, a1
+; ZVBB-NEXT: add a5, a4, a1
+; ZVBB-NEXT: add a6, a5, a1
+; ZVBB-NEXT: add a7, a6, a1
+; ZVBB-NEXT: vsetvli t0, zero, e64, m1, ta, ma
+; ZVBB-NEXT: vsseg8e64.v v8, (a0)
+; ZVBB-NEXT: vl1re64.v v14, (a7)
+; ZVBB-NEXT: add a1, a7, a1
+; ZVBB-NEXT: vl1re64.v v15, (a1)
+; ZVBB-NEXT: vl1re64.v v12, (a5)
+; ZVBB-NEXT: vl1re64.v v13, (a6)
+; ZVBB-NEXT: vl1re64.v v10, (a3)
+; ZVBB-NEXT: vl1re64.v v11, (a4)
+; ZVBB-NEXT: vl1re64.v v8, (a0)
+; ZVBB-NEXT: vl1re64.v v9, (a2)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 3
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 8 x double> @llvm.vector.interleave8.nxv8f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5, <vscale x 1 x double> %v6, <vscale x 1 x double> %v8)
+ ret <vscale x 8 x double> %res
+}
+
+define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5, <vscale x 2 x double> %v6, <vscale x 2 x double> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv16f64_nxv2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT: vmv2r.v v28, v22
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vmv2r.v v26, v18
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: csrr a3, vlenb
+; CHECK-NEXT: add a2, a0, a3
+; CHECK-NEXT: add a4, a1, a3
+; CHECK-NEXT: add a5, a2, a3
+; CHECK-NEXT: vmv1r.v v1, v8
+; CHECK-NEXT: vmv2r.v v24, v14
+; CHECK-NEXT: add a6, a4, a3
+; CHECK-NEXT: vmv2r.v v22, v10
+; CHECK-NEXT: vmv1r.v v2, v22
+; CHECK-NEXT: add a7, a5, a3
+; CHECK-NEXT: vmv1r.v v3, v12
+; CHECK-NEXT: add t0, a6, a3
+; CHECK-NEXT: vmv1r.v v4, v24
+; CHECK-NEXT: add t1, a7, a3
+; CHECK-NEXT: vmv1r.v v5, v16
+; CHECK-NEXT: add t2, t0, a3
+; CHECK-NEXT: vmv1r.v v6, v26
+; CHECK-NEXT: add t3, t1, a3
+; CHECK-NEXT: vmv1r.v v7, v20
+; CHECK-NEXT: add t4, t2, a3
+; CHECK-NEXT: vmv1r.v v8, v28
+; CHECK-NEXT: vmv1r.v v22, v9
+; CHECK-NEXT: add t5, t3, a3
+; CHECK-NEXT: vmv1r.v v24, v13
+; CHECK-NEXT: add t6, t4, a3
+; CHECK-NEXT: vmv1r.v v26, v17
+; CHECK-NEXT: vsseg8e64.v v1, (a0)
+; CHECK-NEXT: vmv1r.v v28, v21
+; CHECK-NEXT: vsseg8e64.v v22, (a1)
+; CHECK-NEXT: vl1re64.v v14, (t5)
+; CHECK-NEXT: add t5, t5, a3
+; CHECK-NEXT: add a3, t6, a3
+; CHECK-NEXT: vl1re64.v v22, (t6)
+; CHECK-NEXT: vl1re64.v v15, (t5)
+; CHECK-NEXT: vl1re64.v v23, (a3)
+; CHECK-NEXT: vl1re64.v v12, (t1)
+; CHECK-NEXT: vl1re64.v v20, (t2)
+; CHECK-NEXT: vl1re64.v v13, (t3)
+; CHECK-NEXT: vl1re64.v v21, (t4)
+; CHECK-NEXT: vl1re64.v v10, (a5)
+; CHECK-NEXT: vl1re64.v v18, (a6)
+; CHECK-NEXT: vl1re64.v v11, (a7)
+; CHECK-NEXT: vl1re64.v v19, (t0)
+; CHECK-NEXT: vl1re64.v v8, (a0)
+; CHECK-NEXT: vl1re64.v v16, (a1)
+; CHECK-NEXT: vl1re64.v v9, (a2)
+; CHECK-NEXT: vl1re64.v v17, (a4)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: slli a0, a0, 4
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16f64_nxv2f64:
+; ZVBB: # %bb.0:
+; ZVBB-NEXT: addi sp, sp, -16
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: sub sp, sp, a0
+; ZVBB-NEXT: vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-NEXT: vmv2r.v v28, v22
+; ZVBB-NEXT: addi a0, sp, 16
+; ZVBB-NEXT: vmv2r.v v26, v18
+; ZVBB-NEXT: csrr a1, vlenb
+; ZVBB-NEXT: slli a1, a1, 3
+; ZVBB-NEXT: add a1, sp, a1
+; ZVBB-NEXT: addi a1, a1, 16
+; ZVBB-NEXT: csrr a3, vlenb
+; ZVBB-NEXT: add a2, a0, a3
+; ZVBB-NEXT: add a4, a1, a3
+; ZVBB-NEXT: add a5, a2, a3
+; ZVBB-NEXT: vmv1r.v v1, v8
+; ZVBB-NEXT: vmv2r.v v24, v14
+; ZVBB-NEXT: add a6, a4, a3
+; ZVBB-NEXT: vmv2r.v v22, v10
+; ZVBB-NEXT: vmv1r.v v2, v22
+; ZVBB-NEXT: add a7, a5, a3
+; ZVBB-NEXT: vmv1r.v v3, v12
+; ZVBB-NEXT: add t0, a6, a3
+; ZVBB-NEXT: vmv1r.v v4, v24
+; ZVBB-NEXT: add t1, a7, a3
+; ZVBB-NEXT: vmv1r.v v5, v16
+; ZVBB-NEXT: add t2, t0, a3
+; ZVBB-NEXT: vmv1r.v v6, v26
+; ZVBB-NEXT: add t3, t1, a3
+; ZVBB-NEXT: vmv1r.v v7, v20
+; ZVBB-NEXT: add t4, t2, a3
+; ZVBB-NEXT: vmv1r.v v8, v28
+; ZVBB-NEXT: vmv1r.v v22, v9
+; ZVBB-NEXT: add t5, t3, a3
+; ZVBB-NEXT: vmv1r.v v24, v13
+; ZVBB-NEXT: add t6, t4, a3
+; ZVBB-NEXT: vmv1r.v v26, v17
+; ZVBB-NEXT: vsseg8e64.v v1, (a0)
+; ZVBB-NEXT: vmv1r.v v28, v21
+; ZVBB-NEXT: vsseg8e64.v v22, (a1)
+; ZVBB-NEXT: vl1re64.v v14, (t5)
+; ZVBB-NEXT: add t5, t5, a3
+; ZVBB-NEXT: add a3, t6, a3
+; ZVBB-NEXT: vl1re64.v v22, (t6)
+; ZVBB-NEXT: vl1re64.v v15, (t5)
+; ZVBB-NEXT: vl1re64.v v23, (a3)
+; ZVBB-NEXT: vl1re64.v v12, (t1)
+; ZVBB-NEXT: vl1re64.v v20, (t2)
+; ZVBB-NEXT: vl1re64.v v13, (t3)
+; ZVBB-NEXT: vl1re64.v v21, (t4)
+; ZVBB-NEXT: vl1re64.v v10, (a5)
+; ZVBB-NEXT: vl1re64.v v18, (a6)
+; ZVBB-NEXT: vl1re64.v v11, (a7)
+; ZVBB-NEXT: vl1re64.v v19, (t0)
+; ZVBB-NEXT: vl1re64.v v8, (a0)
+; ZVBB-NEXT: vl1re64.v v16, (a1)
+; ZVBB-NEXT: vl1re64.v v9, (a2)
+; ZVBB-NEXT: vl1re64.v v17, (a4)
+; ZVBB-NEXT: csrr a0, vlenb
+; ZVBB-NEXT: slli a0, a0, 4
+; ZVBB-NEXT: add sp, sp, a0
+; ZVBB-NEXT: addi sp, sp, 16
+; ZVBB-NEXT: ret
+ %res = call <vscale x 16 x double> @llvm.vector.interleave8.nxv16f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5, <vscale x 2 x double> %v6, <vscale x 2 x double> %v7)
+ ret <vscale x 16 x double> %res
}
>From 777ccf859ec87e5d2f5d3aad36cbcf920ea1b648 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 21 May 2025 12:38:13 +0100
Subject: [PATCH 2/5] Add nounwind to avoid cfi directives
---
.../RISCV/rvv/vector-deinterleave-fixed.ll | 106 +----------
.../RISCV/rvv/vector-interleave-fixed.ll | 168 ++----------------
2 files changed, 18 insertions(+), 256 deletions(-)
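For readers skimming this second patch: llc only emits `.cfi_def_cfa_offset` / `.cfi_escape` / `.cfi_def_cfa` bookkeeping for functions that may need unwind info, and these tests adjust sp by a multiple of vlenb, so every autogenerated CHECK block was picking up several CFI lines. Marking the test functions `nounwind` drops that noise, which is all this patch does. A minimal sketch of the pattern follows; the function name @deinterleave3_sketch and the exact llc invocation are illustrative assumptions, not taken from the patch.

; Hypothetical reduced test mirroring the pattern changed below; the RUN line
; is an assumption (something like: llc -mtriple=riscv64 -mattr=+v < %s).
; Without `nounwind`, the checked output for this function also carries
; .cfi_def_cfa_offset / .cfi_escape / .cfi_def_cfa lines around the
; vlenb-sized stack adjustment; with `nounwind` those directives are omitted,
; keeping the autogenerated CHECK lines shorter.
declare {<2 x i32>, <2 x i32>, <2 x i32>} @llvm.vector.deinterleave3.v6i32(<6 x i32>)

define {<2 x i32>, <2 x i32>, <2 x i32>} @deinterleave3_sketch(<6 x i32> %v) nounwind {
  %res = call {<2 x i32>, <2 x i32>, <2 x i32>} @llvm.vector.deinterleave3.v6i32(<6 x i32> %v)
  ret {<2 x i32>, <2 x i32>, <2 x i32>} %res
}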
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index a3ad0b26efd4d..b2bc01e3c5174 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -189,15 +189,13 @@ define {<8 x i64>, <8 x i64>} @vector_deinterleave_v8i64_v16i64(<16 x i64> %vec)
ret {<8 x i64>, <8 x i64>} %retval
}
-define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x i32> %v) {
+define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x i32> %v) nounwind {
; CHECK-LABEL: vector_deinterleave3_v2i32_v6i32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v12, v8, 2
@@ -215,23 +213,19 @@ define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%res = call {<2 x i32>, <2 x i32>, <2 x i32>} @llvm.vector.deinterleave3.v6i32(<6 x i32> %v)
ret {<2 x i32>, <2 x i32>, <2 x i32>} %res
}
-define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v8i32(<8 x i32> %v) {
+define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v8i32(<8 x i32> %v) nounwind {
; CHECK-LABEL: vector_deinterleave3_v2i32_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: vsetivli zero, 2, e32, m2, ta, ma
; CHECK-NEXT: vslidedown.vi v10, v8, 6
@@ -251,23 +245,19 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%res = call {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @llvm.vector.deinterleave4.v8i32(<8 x i32> %v)
ret {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} %res
}
-define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave5_v2i16_v10i16(<10 x i16> %v) {
+define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave5_v2i16_v10i16(<10 x i16> %v) nounwind {
; CHECK-LABEL: vector_deinterleave5_v2i16_v10i16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v12, v8, 6
@@ -292,23 +282,19 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterle
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%res = call {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @llvm.vector.deinterleave5.v10i16(<10 x i16> %v)
ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res
}
-define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave6_v2i16_v12i16(<12 x i16> %v) {
+define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave6_v2i16_v12i16(<12 x i16> %v) nounwind {
; CHECK-LABEL: vector_deinterleave6_v2i16_v12i16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: vsetivli zero, 2, e16, m1, ta, ma
; CHECK-NEXT: vslidedown.vi v14, v8, 6
@@ -335,29 +321,22 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vecto
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
%res = call {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @llvm.vector.deinterleave6.v12i16(<12 x i16> %v)
ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res
}
-define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave7_v14i8_v2i8(<14 x i8> %v) {
+define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave7_v14i8_v2i8(<14 x i8> %v) nounwind {
; RV32-LABEL: vector_deinterleave7_v14i8_v2i8:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: .cfi_offset s1, -12
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 2
; RV32-NEXT: sub sp, sp, a0
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
; RV32-NEXT: addi a0, sp, 32
; RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
; RV32-NEXT: csrr s1, vlenb
@@ -424,31 +403,21 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @v
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 2
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: .cfi_restore s1
; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vector_deinterleave7_v14i8_v2i8:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -64
-; RV64-NEXT: .cfi_def_cfa_offset 64
; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: .cfi_offset s1, -24
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 2
; RV64-NEXT: sub sp, sp, a0
-; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 4 * vlenb
; RV64-NEXT: addi a0, sp, 32
; RV64-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
; RV64-NEXT: csrr s1, vlenb
@@ -515,31 +484,21 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @v
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 2
; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 64
; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore ra
-; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: .cfi_restore s1
; RV64-NEXT: addi sp, sp, 64
-; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
;
; ZIP-LABEL: vector_deinterleave7_v14i8_v2i8:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -64
-; ZIP-NEXT: .cfi_def_cfa_offset 64
; ZIP-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
; ZIP-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
; ZIP-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
-; ZIP-NEXT: .cfi_offset ra, -8
-; ZIP-NEXT: .cfi_offset s0, -16
-; ZIP-NEXT: .cfi_offset s1, -24
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 2
; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 4 * vlenb
; ZIP-NEXT: addi a0, sp, 32
; ZIP-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
; ZIP-NEXT: csrr s1, vlenb
@@ -606,42 +565,29 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @v
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 2
; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: .cfi_def_cfa sp, 64
; ZIP-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
-; ZIP-NEXT: .cfi_restore ra
-; ZIP-NEXT: .cfi_restore s0
-; ZIP-NEXT: .cfi_restore s1
; ZIP-NEXT: addi sp, sp, 64
-; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
%res = call {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @llvm.vector.deinterleave7.v14i8(<14 x i8> %v)
ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res
}
-define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave8_v16i8_v2i8(<16 x i8> %v) {
+define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave8_v16i8_v2i8(<16 x i8> %v) nounwind {
; RV32-LABEL: vector_deinterleave8_v16i8_v2i8:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: .cfi_def_cfa_offset 48
; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s2, 32(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s3, 28(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s4, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: .cfi_offset ra, -4
-; RV32-NEXT: .cfi_offset s0, -8
-; RV32-NEXT: .cfi_offset s1, -12
-; RV32-NEXT: .cfi_offset s2, -16
-; RV32-NEXT: .cfi_offset s3, -20
-; RV32-NEXT: .cfi_offset s4, -24
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a1, a0, 1
; RV32-NEXT: add a0, a1, a0
; RV32-NEXT: sub sp, sp, a0
-; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 3 * vlenb
; RV32-NEXT: addi a0, sp, 16
; RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
; RV32-NEXT: csrr s1, vlenb
@@ -701,44 +647,28 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2
; RV32-NEXT: slli a1, a0, 1
; RV32-NEXT: add a0, a1, a0
; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: .cfi_def_cfa sp, 48
; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s2, 32(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s3, 28(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s4, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: .cfi_restore ra
-; RV32-NEXT: .cfi_restore s0
-; RV32-NEXT: .cfi_restore s1
-; RV32-NEXT: .cfi_restore s2
-; RV32-NEXT: .cfi_restore s3
-; RV32-NEXT: .cfi_restore s4
; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: vector_deinterleave8_v16i8_v2i8:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -64
-; RV64-NEXT: .cfi_def_cfa_offset 64
; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s2, 32(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s3, 24(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s4, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: .cfi_offset ra, -8
-; RV64-NEXT: .cfi_offset s0, -16
-; RV64-NEXT: .cfi_offset s1, -24
-; RV64-NEXT: .cfi_offset s2, -32
-; RV64-NEXT: .cfi_offset s3, -40
-; RV64-NEXT: .cfi_offset s4, -48
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a1, a0, 1
; RV64-NEXT: add a0, a1, a0
; RV64-NEXT: sub sp, sp, a0
-; RV64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 3 * vlenb
; RV64-NEXT: addi a0, sp, 16
; RV64-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
; RV64-NEXT: csrr s1, vlenb
@@ -798,44 +728,28 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2
; RV64-NEXT: slli a1, a0, 1
; RV64-NEXT: add a0, a1, a0
; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: .cfi_def_cfa sp, 64
; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s2, 32(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s3, 24(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s4, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: .cfi_restore ra
-; RV64-NEXT: .cfi_restore s0
-; RV64-NEXT: .cfi_restore s1
-; RV64-NEXT: .cfi_restore s2
-; RV64-NEXT: .cfi_restore s3
-; RV64-NEXT: .cfi_restore s4
; RV64-NEXT: addi sp, sp, 64
-; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
;
; ZIP-LABEL: vector_deinterleave8_v16i8_v2i8:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -64
-; ZIP-NEXT: .cfi_def_cfa_offset 64
; ZIP-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
; ZIP-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
; ZIP-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
; ZIP-NEXT: sd s2, 32(sp) # 8-byte Folded Spill
; ZIP-NEXT: sd s3, 24(sp) # 8-byte Folded Spill
; ZIP-NEXT: sd s4, 16(sp) # 8-byte Folded Spill
-; ZIP-NEXT: .cfi_offset ra, -8
-; ZIP-NEXT: .cfi_offset s0, -16
-; ZIP-NEXT: .cfi_offset s1, -24
-; ZIP-NEXT: .cfi_offset s2, -32
-; ZIP-NEXT: .cfi_offset s3, -40
-; ZIP-NEXT: .cfi_offset s4, -48
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a1, a0, 1
; ZIP-NEXT: add a0, a1, a0
; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 3 * vlenb
; ZIP-NEXT: addi a0, sp, 16
; ZIP-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
; ZIP-NEXT: csrr s1, vlenb
@@ -895,21 +809,13 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2
; ZIP-NEXT: slli a1, a0, 1
; ZIP-NEXT: add a0, a1, a0
; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: .cfi_def_cfa sp, 64
; ZIP-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s2, 32(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s3, 24(sp) # 8-byte Folded Reload
; ZIP-NEXT: ld s4, 16(sp) # 8-byte Folded Reload
-; ZIP-NEXT: .cfi_restore ra
-; ZIP-NEXT: .cfi_restore s0
-; ZIP-NEXT: .cfi_restore s1
-; ZIP-NEXT: .cfi_restore s2
-; ZIP-NEXT: .cfi_restore s3
-; ZIP-NEXT: .cfi_restore s4
; ZIP-NEXT: addi sp, sp, 64
-; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
%res = call {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @llvm.vector.deinterleave8.v16i8(<16 x i8> %v)
ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
index faf7903c21614..3dc83d50ee3f3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
@@ -167,15 +167,13 @@ define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) {
ret <4 x i64> %res
}
-define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) {
+define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind {
; CHECK-LABEL: vector_interleave3_v6i32_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 1
@@ -193,19 +191,15 @@ define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
;
; ZVBB-LABEL: vector_interleave3_v6i32_v2i32:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: .cfi_def_cfa_offset 16
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: srli a1, a1, 1
@@ -223,19 +217,15 @@ define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: .cfi_def_cfa sp, 16
; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: .cfi_def_cfa_offset 0
; ZVBB-NEXT: ret
;
; ZIP-LABEL: vector_interleave3_v6i32_v2i32:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -16
-; ZIP-NEXT: .cfi_def_cfa_offset 16
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZIP-NEXT: addi a0, sp, 16
; ZIP-NEXT: csrr a1, vlenb
; ZIP-NEXT: srli a1, a1, 1
@@ -253,23 +243,19 @@ define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: .cfi_def_cfa sp, 16
; ZIP-NEXT: addi sp, sp, 16
-; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
%res = call <6 x i32> @llvm.vector.interleave3.v6i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c)
ret <6 x i32> %res
}
-define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
+define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) nounwind {
; CHECK-LABEL: vector_interleave4_v8i32_v2i32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 1
@@ -290,19 +276,15 @@ define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
;
; ZVBB-LABEL: vector_interleave4_v8i32_v2i32:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: .cfi_def_cfa_offset 16
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: srli a1, a1, 1
@@ -323,19 +305,15 @@ define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: .cfi_def_cfa sp, 16
; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: .cfi_def_cfa_offset 0
; ZVBB-NEXT: ret
;
; ZIP-LABEL: vector_interleave4_v8i32_v2i32:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -16
-; ZIP-NEXT: .cfi_def_cfa_offset 16
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZIP-NEXT: addi a0, sp, 16
; ZIP-NEXT: csrr a1, vlenb
; ZIP-NEXT: srli a1, a1, 1
@@ -356,23 +334,19 @@ define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: .cfi_def_cfa sp, 16
; ZIP-NEXT: addi sp, sp, 16
-; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
%res = call <8 x i32> @llvm.vector.interleave4.v8i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d)
ret <8 x i32> %res
}
-define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e) {
+define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e) nounwind {
; CHECK-LABEL: vector_interleave5_v10i16_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 2
@@ -397,19 +371,15 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
;
; ZVBB-LABEL: vector_interleave5_v10i16_v2i16:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: .cfi_def_cfa_offset 16
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: srli a1, a1, 2
@@ -434,19 +404,15 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: .cfi_def_cfa sp, 16
; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: .cfi_def_cfa_offset 0
; ZVBB-NEXT: ret
;
; ZIP-LABEL: vector_interleave5_v10i16_v2i16:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -16
-; ZIP-NEXT: .cfi_def_cfa_offset 16
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZIP-NEXT: addi a0, sp, 16
; ZIP-NEXT: csrr a1, vlenb
; ZIP-NEXT: srli a1, a1, 2
@@ -471,23 +437,19 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: .cfi_def_cfa sp, 16
; ZIP-NEXT: addi sp, sp, 16
-; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
%res = call <10 x i16> @llvm.vector.interleave5.v10i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e)
ret <10 x i16> %res
}
-define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e, <2 x i16> %f) {
+define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e, <2 x i16> %f) nounwind {
; CHECK-LABEL: vector_interleave6_v12i16_v2i16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 2
@@ -515,19 +477,15 @@ define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
;
; ZVBB-LABEL: vector_interleave6_v12i16_v2i16:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: .cfi_def_cfa_offset 16
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: srli a1, a1, 2
@@ -555,19 +513,15 @@ define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: .cfi_def_cfa sp, 16
; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: .cfi_def_cfa_offset 0
; ZVBB-NEXT: ret
;
; ZIP-LABEL: vector_interleave6_v12i16_v2i16:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -16
-; ZIP-NEXT: .cfi_def_cfa_offset 16
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZIP-NEXT: addi a0, sp, 16
; ZIP-NEXT: csrr a1, vlenb
; ZIP-NEXT: srli a1, a1, 2
@@ -595,22 +549,18 @@ define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: .cfi_def_cfa sp, 16
; ZIP-NEXT: addi sp, sp, 16
-; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
%res = call <12 x i16> @llvm.vector.interleave6.v12i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e, <2 x i16> %f)
ret <12 x i16> %res
}
-define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g) {
+define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g) nounwind {
; CHECK-LABEL: vector_interleave7_v14i8_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 3
@@ -641,18 +591,14 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
; CHECK-NEXT: vslideup.vi v8, v12, 8
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
;
; ZVBB-LABEL: vector_interleave7_v14i8_v2i8:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: .cfi_def_cfa_offset 16
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: srli a1, a1, 3
@@ -683,18 +629,14 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
; ZVBB-NEXT: vslideup.vi v8, v12, 8
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: .cfi_def_cfa sp, 16
; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: .cfi_def_cfa_offset 0
; ZVBB-NEXT: ret
;
; ZIP-LABEL: vector_interleave7_v14i8_v2i8:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -16
-; ZIP-NEXT: .cfi_def_cfa_offset 16
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
; ZIP-NEXT: addi a0, sp, 16
; ZIP-NEXT: csrr a1, vlenb
; ZIP-NEXT: srli a1, a1, 3
@@ -725,22 +667,18 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
; ZIP-NEXT: vslideup.vi v8, v12, 8
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: .cfi_def_cfa sp, 16
; ZIP-NEXT: addi sp, sp, 16
-; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
%res = call <14 x i8> @llvm.vector.interleave7.v14i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g)
ret <14 x i8> %res
}
-define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g, <2 x i8> %h) {
+define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g, <2 x i8> %h) nounwind {
; CHECK-LABEL: vector_interleave8_v16i8_v2i8:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 3
@@ -774,18 +712,14 @@ define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
; CHECK-NEXT: vslideup.vi v8, v10, 8
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
;
; ZVBB-LABEL: vector_interleave8_v16i8_v2i8:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: .cfi_def_cfa_offset 16
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: srli a1, a1, 3
@@ -819,18 +753,14 @@ define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
; ZVBB-NEXT: vslideup.vi v8, v10, 8
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: .cfi_def_cfa sp, 16
; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: .cfi_def_cfa_offset 0
; ZVBB-NEXT: ret
;
; ZIP-LABEL: vector_interleave8_v16i8_v2i8:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -16
-; ZIP-NEXT: .cfi_def_cfa_offset 16
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
; ZIP-NEXT: addi a0, sp, 16
; ZIP-NEXT: csrr a1, vlenb
; ZIP-NEXT: srli a1, a1, 3
@@ -864,9 +794,7 @@ define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
; ZIP-NEXT: vslideup.vi v8, v10, 8
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: .cfi_def_cfa sp, 16
; ZIP-NEXT: addi sp, sp, 16
-; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
%res = call <16 x i8> @llvm.vector.interleave8.v16i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g, <2 x i8> %h)
ret <16 x i8> %res
@@ -1064,15 +992,13 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double>
ret <4 x double> %res
}
-define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind {
; CHECK-LABEL: vector_interleave3_v6f32_v2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 1
@@ -1090,19 +1016,15 @@ define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> %
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
;
; ZVBB-LABEL: vector_interleave3_v6f32_v2f32:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: .cfi_def_cfa_offset 16
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: srli a1, a1, 1
@@ -1120,19 +1042,15 @@ define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> %
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: .cfi_def_cfa sp, 16
; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: .cfi_def_cfa_offset 0
; ZVBB-NEXT: ret
;
; ZIP-LABEL: vector_interleave3_v6f32_v2f32:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -16
-; ZIP-NEXT: .cfi_def_cfa_offset 16
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZIP-NEXT: addi a0, sp, 16
; ZIP-NEXT: csrr a1, vlenb
; ZIP-NEXT: srli a1, a1, 1
@@ -1150,23 +1068,19 @@ define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> %
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: .cfi_def_cfa sp, 16
; ZIP-NEXT: addi sp, sp, 16
-; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
%res = call <6 x float> @llvm.vector.interleave3.v6f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
ret <6 x float> %res
}
-define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) {
+define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) nounwind {
; CHECK-LABEL: vector_interleave4_v8f32_v2f32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 1
@@ -1187,19 +1101,15 @@ define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> %
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
;
; ZVBB-LABEL: vector_interleave4_v8f32_v2f32:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: .cfi_def_cfa_offset 16
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: srli a1, a1, 1
@@ -1220,19 +1130,15 @@ define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> %
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: .cfi_def_cfa sp, 16
; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: .cfi_def_cfa_offset 0
; ZVBB-NEXT: ret
;
; ZIP-LABEL: vector_interleave4_v8f32_v2f32:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -16
-; ZIP-NEXT: .cfi_def_cfa_offset 16
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZIP-NEXT: addi a0, sp, 16
; ZIP-NEXT: csrr a1, vlenb
; ZIP-NEXT: srli a1, a1, 1
@@ -1253,23 +1159,19 @@ define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> %
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: .cfi_def_cfa sp, 16
; ZIP-NEXT: addi sp, sp, 16
-; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
%res = call <8 x float> @llvm.vector.interleave4.v8f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d)
ret <8 x float> %res
}
-define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e) {
+define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e) nounwind {
; CHECK-LABEL: vector_interleave5_v10f16_v2f16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 2
@@ -1294,19 +1196,15 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
;
; ZVBB-LABEL: vector_interleave5_v10f16_v2f16:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: .cfi_def_cfa_offset 16
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: srli a1, a1, 2
@@ -1331,19 +1229,15 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: .cfi_def_cfa sp, 16
; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: .cfi_def_cfa_offset 0
; ZVBB-NEXT: ret
;
; ZIP-LABEL: vector_interleave5_v10f16_v2f16:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -16
-; ZIP-NEXT: .cfi_def_cfa_offset 16
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZIP-NEXT: addi a0, sp, 16
; ZIP-NEXT: csrr a1, vlenb
; ZIP-NEXT: srli a1, a1, 2
@@ -1368,23 +1262,19 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: .cfi_def_cfa sp, 16
; ZIP-NEXT: addi sp, sp, 16
-; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
%res = call <10 x half> @llvm.vector.interleave5.v10f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e)
ret <10 x half> %res
}
-define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e, <2 x half> %f) {
+define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e, <2 x half> %f) nounwind {
; CHECK-LABEL: vector_interleave6_v12f16_v2f16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 2
@@ -1412,19 +1302,15 @@ define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
;
; ZVBB-LABEL: vector_interleave6_v12f16_v2f16:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: .cfi_def_cfa_offset 16
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: srli a1, a1, 2
@@ -1452,19 +1338,15 @@ define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: .cfi_def_cfa sp, 16
; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: .cfi_def_cfa_offset 0
; ZVBB-NEXT: ret
;
; ZIP-LABEL: vector_interleave6_v12f16_v2f16:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -16
-; ZIP-NEXT: .cfi_def_cfa_offset 16
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZIP-NEXT: addi a0, sp, 16
; ZIP-NEXT: csrr a1, vlenb
; ZIP-NEXT: srli a1, a1, 2
@@ -1492,23 +1374,19 @@ define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: .cfi_def_cfa sp, 16
; ZIP-NEXT: addi sp, sp, 16
-; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
%res = call <12 x half> @llvm.vector.interleave6.v12f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e, <2 x half> %f)
ret <12 x half> %res
}
-define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g) {
+define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g) nounwind {
; CHECK-LABEL: vector_interleave7_v7f16_v1f16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 2
@@ -1540,19 +1418,15 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b,
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
;
; ZVBB-LABEL: vector_interleave7_v7f16_v1f16:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: .cfi_def_cfa_offset 16
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: srli a1, a1, 2
@@ -1584,19 +1458,15 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b,
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: .cfi_def_cfa sp, 16
; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: .cfi_def_cfa_offset 0
; ZVBB-NEXT: ret
;
; ZIP-LABEL: vector_interleave7_v7f16_v1f16:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -16
-; ZIP-NEXT: .cfi_def_cfa_offset 16
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZIP-NEXT: addi a0, sp, 16
; ZIP-NEXT: csrr a1, vlenb
; ZIP-NEXT: srli a1, a1, 2
@@ -1628,23 +1498,19 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b,
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: .cfi_def_cfa sp, 16
; ZIP-NEXT: addi sp, sp, 16
-; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
%res = call <7 x half> @llvm.vector.interleave7.v7f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g)
ret <7 x half> %res
}
-define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g, <1 x half> %h) {
+define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g, <1 x half> %h) nounwind {
; CHECK-LABEL: vector_interleave8_v8f16_v1f16:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: sub sp, sp, a0
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: srli a1, a1, 2
@@ -1679,19 +1545,15 @@ define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b,
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 1
; CHECK-NEXT: add sp, sp, a0
-; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
-; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
;
; ZVBB-LABEL: vector_interleave8_v8f16_v1f16:
; ZVBB: # %bb.0:
; ZVBB-NEXT: addi sp, sp, -16
-; ZVBB-NEXT: .cfi_def_cfa_offset 16
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: sub sp, sp, a0
-; ZVBB-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZVBB-NEXT: addi a0, sp, 16
; ZVBB-NEXT: csrr a1, vlenb
; ZVBB-NEXT: srli a1, a1, 2
@@ -1726,19 +1588,15 @@ define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b,
; ZVBB-NEXT: csrr a0, vlenb
; ZVBB-NEXT: slli a0, a0, 1
; ZVBB-NEXT: add sp, sp, a0
-; ZVBB-NEXT: .cfi_def_cfa sp, 16
; ZVBB-NEXT: addi sp, sp, 16
-; ZVBB-NEXT: .cfi_def_cfa_offset 0
; ZVBB-NEXT: ret
;
; ZIP-LABEL: vector_interleave8_v8f16_v1f16:
; ZIP: # %bb.0:
; ZIP-NEXT: addi sp, sp, -16
-; ZIP-NEXT: .cfi_def_cfa_offset 16
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; ZIP-NEXT: addi a0, sp, 16
; ZIP-NEXT: csrr a1, vlenb
; ZIP-NEXT: srli a1, a1, 2
@@ -1773,9 +1631,7 @@ define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b,
; ZIP-NEXT: csrr a0, vlenb
; ZIP-NEXT: slli a0, a0, 1
; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: .cfi_def_cfa sp, 16
; ZIP-NEXT: addi sp, sp, 16
-; ZIP-NEXT: .cfi_def_cfa_offset 0
; ZIP-NEXT: ret
%res = call <8 x half> @llvm.vector.interleave8.v8f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g, <1 x half> %h)
ret <8 x half> %res
From d0166d963c4f9b5361f6b5c8b714116ba0d80bbf Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 21 May 2025 12:39:10 +0100
Subject: [PATCH 3/5] Fix test name
---
llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index b2bc01e3c5174..2bf96b4fdf39b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -219,8 +219,8 @@ define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x
ret {<2 x i32>, <2 x i32>, <2 x i32>} %res
}
-define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v8i32(<8 x i32> %v) nounwind {
-; CHECK-LABEL: vector_deinterleave3_v2i32_v8i32:
+define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave4_v2i32_v8i32(<8 x i32> %v) nounwind {
+; CHECK-LABEL: vector_deinterleave4_v2i32_v8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: csrr a0, vlenb
From fdbcca443c3c0732b5453c3ebc24a91e06afdb70 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 21 May 2025 12:43:20 +0100
Subject: [PATCH 4/5] Use +m
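Enabling the M extension in the RUN lines means the scalar multiply used to scale the vlenb-based slide offsets no longer has to be expanded to a __mulsi3/__muldi3 libcall. With the libcall (and its saved registers) gone, the RV32, RV64 and XRivosVizip outputs for the deinterleave7/deinterleave8 tests become identical and merge into a single CHECK block. One of the updated RUN lines, for reference:

  ; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+m,+zvfh | FileCheck %s --check-prefixes=CHECK,V,RV64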
---
.../RISCV/rvv/vector-deinterleave-fixed.ll | 575 +++---------------
1 file changed, 88 insertions(+), 487 deletions(-)
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index 2bf96b4fdf39b..aab2f08277831 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh | FileCheck %s --check-prefixes=CHECK,V,RV32
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh | FileCheck %s --check-prefixes=CHECK,V,RV64
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh,+experimental-xrivosvizip | FileCheck %s --check-prefixes=CHECK,ZIP
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+m,+zvfh | FileCheck %s --check-prefixes=CHECK,V,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+m,+zvfh | FileCheck %s --check-prefixes=CHECK,V,RV64
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+m,+zvfh,+experimental-xrivosvizip | FileCheck %s --check-prefixes=CHECK,ZIP
; Integers
@@ -328,495 +328,93 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vecto
}
define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave7_v14i8_v2i8(<14 x i8> %v) nounwind {
-; RV32-LABEL: vector_deinterleave7_v14i8_v2i8:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: sub sp, sp, a0
-; RV32-NEXT: addi a0, sp, 32
-; RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr s1, vlenb
-; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v11, v8, 10
-; RV32-NEXT: vslidedown.vi v10, v8, 8
-; RV32-NEXT: vslidedown.vi v9, v8, 2
-; RV32-NEXT: srli s0, s1, 3
-; RV32-NEXT: add a0, s0, s0
-; RV32-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
-; RV32-NEXT: vslideup.vx v10, v11, s0
-; RV32-NEXT: vmv1r.v v11, v8
-; RV32-NEXT: vslideup.vx v11, v9, s0
-; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 12
-; RV32-NEXT: srli a0, s1, 2
-; RV32-NEXT: add a1, a0, s0
-; RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
-; RV32-NEXT: vslideup.vx v10, v9, a0
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: slli a2, a2, 1
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 32
-; RV32-NEXT: vs1r.v v10, (a2) # vscale x 8-byte Folded Spill
-; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v9, v8, 4
-; RV32-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
-; RV32-NEXT: vslideup.vx v11, v9, a0
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 32
-; RV32-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: li a1, 3
-; RV32-NEXT: mv a0, s0
-; RV32-NEXT: call __mulsi3
-; RV32-NEXT: add s0, a0, s0
-; RV32-NEXT: addi a1, sp, 32
-; RV32-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v8, 6
-; RV32-NEXT: srli s1, s1, 1
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 32
-; RV32-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetvli zero, s0, e8, mf2, ta, ma
-; RV32-NEXT: vslideup.vx v9, v8, a0
-; RV32-NEXT: add a0, s1, s1
-; RV32-NEXT: csrr a1, vlenb
-; RV32-NEXT: slli a1, a1, 1
-; RV32-NEXT: add a1, sp, a1
-; RV32-NEXT: addi a1, a1, 32
-; RV32-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; RV32-NEXT: vslideup.vx v9, v8, s1
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a1, a0, 1
-; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 32
-; RV32-NEXT: vs1r.v v9, (a0)
-; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; RV32-NEXT: vlseg7e8.v v8, (a0)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 2
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vector_deinterleave7_v14i8_v2i8:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -64
-; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: sub sp, sp, a0
-; RV64-NEXT: addi a0, sp, 32
-; RV64-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr s1, vlenb
-; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v11, v8, 10
-; RV64-NEXT: vslidedown.vi v10, v8, 8
-; RV64-NEXT: vslidedown.vi v9, v8, 2
-; RV64-NEXT: srli s0, s1, 3
-; RV64-NEXT: add a0, s0, s0
-; RV64-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
-; RV64-NEXT: vslideup.vx v10, v11, s0
-; RV64-NEXT: vmv1r.v v11, v8
-; RV64-NEXT: vslideup.vx v11, v9, s0
-; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 12
-; RV64-NEXT: srli a0, s1, 2
-; RV64-NEXT: add a1, a0, s0
-; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
-; RV64-NEXT: vslideup.vx v10, v9, a0
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: slli a2, a2, 1
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 32
-; RV64-NEXT: vs1r.v v10, (a2) # vscale x 8-byte Folded Spill
-; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v9, v8, 4
-; RV64-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
-; RV64-NEXT: vslideup.vx v11, v9, a0
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 32
-; RV64-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: li a1, 3
-; RV64-NEXT: mv a0, s0
-; RV64-NEXT: call __muldi3
-; RV64-NEXT: add s0, a0, s0
-; RV64-NEXT: addi a1, sp, 32
-; RV64-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v8, 6
-; RV64-NEXT: srli s1, s1, 1
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 32
-; RV64-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetvli zero, s0, e8, mf2, ta, ma
-; RV64-NEXT: vslideup.vx v9, v8, a0
-; RV64-NEXT: add a0, s1, s1
-; RV64-NEXT: csrr a1, vlenb
-; RV64-NEXT: slli a1, a1, 1
-; RV64-NEXT: add a1, sp, a1
-; RV64-NEXT: addi a1, a1, 32
-; RV64-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; RV64-NEXT: vslideup.vx v9, v8, s1
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a1, a0, 1
-; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 32
-; RV64-NEXT: vs1r.v v9, (a0)
-; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; RV64-NEXT: vlseg7e8.v v8, (a0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 2
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 64
-; RV64-NEXT: ret
-;
-; ZIP-LABEL: vector_deinterleave7_v14i8_v2i8:
-; ZIP: # %bb.0:
-; ZIP-NEXT: addi sp, sp, -64
-; ZIP-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
-; ZIP-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
-; ZIP-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: slli a0, a0, 2
-; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: addi a0, sp, 32
-; ZIP-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; ZIP-NEXT: csrr s1, vlenb
-; ZIP-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; ZIP-NEXT: vslidedown.vi v11, v8, 10
-; ZIP-NEXT: vslidedown.vi v10, v8, 8
-; ZIP-NEXT: vslidedown.vi v9, v8, 2
-; ZIP-NEXT: srli s0, s1, 3
-; ZIP-NEXT: add a0, s0, s0
-; ZIP-NEXT: vsetvli zero, a0, e8, mf2, tu, ma
-; ZIP-NEXT: vslideup.vx v10, v11, s0
-; ZIP-NEXT: vmv1r.v v11, v8
-; ZIP-NEXT: vslideup.vx v11, v9, s0
-; ZIP-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; ZIP-NEXT: vslidedown.vi v9, v8, 12
-; ZIP-NEXT: srli a0, s1, 2
-; ZIP-NEXT: add a1, a0, s0
-; ZIP-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
-; ZIP-NEXT: vslideup.vx v10, v9, a0
-; ZIP-NEXT: csrr a2, vlenb
-; ZIP-NEXT: slli a2, a2, 1
-; ZIP-NEXT: add a2, sp, a2
-; ZIP-NEXT: addi a2, a2, 32
-; ZIP-NEXT: vs1r.v v10, (a2) # vscale x 8-byte Folded Spill
-; ZIP-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; ZIP-NEXT: vslidedown.vi v9, v8, 4
-; ZIP-NEXT: vsetvli zero, a1, e8, mf2, tu, ma
-; ZIP-NEXT: vslideup.vx v11, v9, a0
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: add a0, sp, a0
-; ZIP-NEXT: addi a0, a0, 32
-; ZIP-NEXT: vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; ZIP-NEXT: li a1, 3
-; ZIP-NEXT: mv a0, s0
-; ZIP-NEXT: call __muldi3
-; ZIP-NEXT: add s0, a0, s0
-; ZIP-NEXT: addi a1, sp, 32
-; ZIP-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload
-; ZIP-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; ZIP-NEXT: vslidedown.vi v8, v8, 6
-; ZIP-NEXT: srli s1, s1, 1
-; ZIP-NEXT: csrr a1, vlenb
-; ZIP-NEXT: add a1, sp, a1
-; ZIP-NEXT: addi a1, a1, 32
-; ZIP-NEXT: vl1r.v v9, (a1) # vscale x 8-byte Folded Reload
-; ZIP-NEXT: vsetvli zero, s0, e8, mf2, ta, ma
-; ZIP-NEXT: vslideup.vx v9, v8, a0
-; ZIP-NEXT: add a0, s1, s1
-; ZIP-NEXT: csrr a1, vlenb
-; ZIP-NEXT: slli a1, a1, 1
-; ZIP-NEXT: add a1, sp, a1
-; ZIP-NEXT: addi a1, a1, 32
-; ZIP-NEXT: vl1r.v v8, (a1) # vscale x 8-byte Folded Reload
-; ZIP-NEXT: vsetvli zero, a0, e8, m1, ta, ma
-; ZIP-NEXT: vslideup.vx v9, v8, s1
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: slli a1, a0, 1
-; ZIP-NEXT: add a0, a1, a0
-; ZIP-NEXT: add a0, sp, a0
-; ZIP-NEXT: addi a0, a0, 32
-; ZIP-NEXT: vs1r.v v9, (a0)
-; ZIP-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; ZIP-NEXT: vlseg7e8.v v8, (a0)
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: slli a0, a0, 2
-; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
-; ZIP-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
-; ZIP-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
-; ZIP-NEXT: addi sp, sp, 64
-; ZIP-NEXT: ret
+; CHECK-LABEL: vector_deinterleave7_v14i8_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v8, 10
+; CHECK-NEXT: vslidedown.vi v10, v8, 8
+; CHECK-NEXT: vslidedown.vi v11, v8, 12
+; CHECK-NEXT: vslidedown.vi v12, v8, 2
+; CHECK-NEXT: vslidedown.vi v13, v8, 4
+; CHECK-NEXT: vslidedown.vi v14, v8, 6
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: srli a2, a0, 2
+; CHECK-NEXT: srli a0, a0, 1
+; CHECK-NEXT: add a3, a1, a1
+; CHECK-NEXT: add a4, a2, a1
+; CHECK-NEXT: vsetvli zero, a3, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vx v10, v9, a1
+; CHECK-NEXT: vslideup.vx v8, v12, a1
+; CHECK-NEXT: slli a3, a1, 1
+; CHECK-NEXT: vsetvli zero, a4, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vx v10, v11, a2
+; CHECK-NEXT: vslideup.vx v8, v13, a2
+; CHECK-NEXT: add a2, a0, a0
+; CHECK-NEXT: add a3, a3, a1
+; CHECK-NEXT: add a1, a3, a1
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v14, a3
+; CHECK-NEXT: vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs1r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vlseg7e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
%res = call {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @llvm.vector.deinterleave7.v14i8(<14 x i8> %v)
ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res
}
define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave8_v16i8_v2i8(<16 x i8> %v) nounwind {
-; RV32-LABEL: vector_deinterleave8_v16i8_v2i8:
-; RV32: # %bb.0:
-; RV32-NEXT: addi sp, sp, -48
-; RV32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s0, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s1, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s2, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s3, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT: sw s4, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a1, a0, 1
-; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: sub sp, sp, a0
-; RV32-NEXT: addi a0, sp, 16
-; RV32-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: csrr s1, vlenb
-; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 10
-; RV32-NEXT: vslidedown.vi v9, v8, 8
-; RV32-NEXT: srli s0, s1, 3
-; RV32-NEXT: srli s2, s1, 2
-; RV32-NEXT: add s3, s0, s0
-; RV32-NEXT: add s4, s2, s0
-; RV32-NEXT: vsetvli zero, s3, e8, mf2, tu, ma
-; RV32-NEXT: vslideup.vx v9, v10, s0
-; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v10, v8, 12
-; RV32-NEXT: vsetvli zero, s4, e8, mf2, tu, ma
-; RV32-NEXT: vslideup.vx v9, v10, s2
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT: li a1, 3
-; RV32-NEXT: mv a0, s0
-; RV32-NEXT: call __mulsi3
-; RV32-NEXT: add a1, a0, s0
-; RV32-NEXT: addi a2, sp, 16
-; RV32-NEXT: vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; RV32-NEXT: vslidedown.vi v8, v12, 14
-; RV32-NEXT: vslidedown.vi v9, v12, 2
-; RV32-NEXT: vmv1r.v v10, v12
-; RV32-NEXT: vslidedown.vi v11, v12, 4
-; RV32-NEXT: vslidedown.vi v12, v12, 6
-; RV32-NEXT: srli s1, s1, 1
-; RV32-NEXT: csrr a2, vlenb
-; RV32-NEXT: add a2, sp, a2
-; RV32-NEXT: addi a2, a2, 16
-; RV32-NEXT: vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; RV32-NEXT: vslideup.vx v13, v8, a0
-; RV32-NEXT: vsetvli zero, s3, e8, mf2, tu, ma
-; RV32-NEXT: vslideup.vx v10, v9, s0
-; RV32-NEXT: add a2, s1, s1
-; RV32-NEXT: vsetvli zero, s4, e8, mf2, tu, ma
-; RV32-NEXT: vslideup.vx v10, v11, s2
-; RV32-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; RV32-NEXT: vslideup.vx v10, v12, a0
-; RV32-NEXT: vsetvli zero, a2, e8, m1, ta, ma
-; RV32-NEXT: vslideup.vx v10, v13, s1
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a0, a0, 1
-; RV32-NEXT: add a0, sp, a0
-; RV32-NEXT: addi a0, a0, 16
-; RV32-NEXT: vs1r.v v10, (a0)
-; RV32-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; RV32-NEXT: vlseg8e8.v v8, (a0)
-; RV32-NEXT: csrr a0, vlenb
-; RV32-NEXT: slli a1, a0, 1
-; RV32-NEXT: add a0, a1, a0
-; RV32-NEXT: add sp, sp, a0
-; RV32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s0, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s1, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s2, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s3, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT: lw s4, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT: addi sp, sp, 48
-; RV32-NEXT: ret
-;
-; RV64-LABEL: vector_deinterleave8_v16i8_v2i8:
-; RV64: # %bb.0:
-; RV64-NEXT: addi sp, sp, -64
-; RV64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s2, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s3, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT: sd s4, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a1, a0, 1
-; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: sub sp, sp, a0
-; RV64-NEXT: addi a0, sp, 16
-; RV64-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: csrr s1, vlenb
-; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 10
-; RV64-NEXT: vslidedown.vi v9, v8, 8
-; RV64-NEXT: srli s0, s1, 3
-; RV64-NEXT: srli s2, s1, 2
-; RV64-NEXT: add s3, s0, s0
-; RV64-NEXT: add s4, s2, s0
-; RV64-NEXT: vsetvli zero, s3, e8, mf2, tu, ma
-; RV64-NEXT: vslideup.vx v9, v10, s0
-; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v10, v8, 12
-; RV64-NEXT: vsetvli zero, s4, e8, mf2, tu, ma
-; RV64-NEXT: vslideup.vx v9, v10, s2
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT: li a1, 3
-; RV64-NEXT: mv a0, s0
-; RV64-NEXT: call __muldi3
-; RV64-NEXT: add a1, a0, s0
-; RV64-NEXT: addi a2, sp, 16
-; RV64-NEXT: vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; RV64-NEXT: vslidedown.vi v8, v12, 14
-; RV64-NEXT: vslidedown.vi v9, v12, 2
-; RV64-NEXT: vmv1r.v v10, v12
-; RV64-NEXT: vslidedown.vi v11, v12, 4
-; RV64-NEXT: vslidedown.vi v12, v12, 6
-; RV64-NEXT: srli s1, s1, 1
-; RV64-NEXT: csrr a2, vlenb
-; RV64-NEXT: add a2, sp, a2
-; RV64-NEXT: addi a2, a2, 16
-; RV64-NEXT: vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; RV64-NEXT: vslideup.vx v13, v8, a0
-; RV64-NEXT: vsetvli zero, s3, e8, mf2, tu, ma
-; RV64-NEXT: vslideup.vx v10, v9, s0
-; RV64-NEXT: add a2, s1, s1
-; RV64-NEXT: vsetvli zero, s4, e8, mf2, tu, ma
-; RV64-NEXT: vslideup.vx v10, v11, s2
-; RV64-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; RV64-NEXT: vslideup.vx v10, v12, a0
-; RV64-NEXT: vsetvli zero, a2, e8, m1, ta, ma
-; RV64-NEXT: vslideup.vx v10, v13, s1
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a0, a0, 1
-; RV64-NEXT: add a0, sp, a0
-; RV64-NEXT: addi a0, a0, 16
-; RV64-NEXT: vs1r.v v10, (a0)
-; RV64-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; RV64-NEXT: vlseg8e8.v v8, (a0)
-; RV64-NEXT: csrr a0, vlenb
-; RV64-NEXT: slli a1, a0, 1
-; RV64-NEXT: add a0, a1, a0
-; RV64-NEXT: add sp, sp, a0
-; RV64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s2, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s3, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT: ld s4, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT: addi sp, sp, 64
-; RV64-NEXT: ret
-;
-; ZIP-LABEL: vector_deinterleave8_v16i8_v2i8:
-; ZIP: # %bb.0:
-; ZIP-NEXT: addi sp, sp, -64
-; ZIP-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
-; ZIP-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
-; ZIP-NEXT: sd s1, 40(sp) # 8-byte Folded Spill
-; ZIP-NEXT: sd s2, 32(sp) # 8-byte Folded Spill
-; ZIP-NEXT: sd s3, 24(sp) # 8-byte Folded Spill
-; ZIP-NEXT: sd s4, 16(sp) # 8-byte Folded Spill
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: slli a1, a0, 1
-; ZIP-NEXT: add a0, a1, a0
-; ZIP-NEXT: sub sp, sp, a0
-; ZIP-NEXT: addi a0, sp, 16
-; ZIP-NEXT: vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; ZIP-NEXT: csrr s1, vlenb
-; ZIP-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; ZIP-NEXT: vslidedown.vi v10, v8, 10
-; ZIP-NEXT: vslidedown.vi v9, v8, 8
-; ZIP-NEXT: srli s0, s1, 3
-; ZIP-NEXT: srli s2, s1, 2
-; ZIP-NEXT: add s3, s0, s0
-; ZIP-NEXT: add s4, s2, s0
-; ZIP-NEXT: vsetvli zero, s3, e8, mf2, tu, ma
-; ZIP-NEXT: vslideup.vx v9, v10, s0
-; ZIP-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; ZIP-NEXT: vslidedown.vi v10, v8, 12
-; ZIP-NEXT: vsetvli zero, s4, e8, mf2, tu, ma
-; ZIP-NEXT: vslideup.vx v9, v10, s2
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: add a0, sp, a0
-; ZIP-NEXT: addi a0, a0, 16
-; ZIP-NEXT: vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; ZIP-NEXT: li a1, 3
-; ZIP-NEXT: mv a0, s0
-; ZIP-NEXT: call __muldi3
-; ZIP-NEXT: add a1, a0, s0
-; ZIP-NEXT: addi a2, sp, 16
-; ZIP-NEXT: vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; ZIP-NEXT: vsetivli zero, 2, e8, m1, ta, ma
-; ZIP-NEXT: vslidedown.vi v8, v12, 14
-; ZIP-NEXT: vslidedown.vi v9, v12, 2
-; ZIP-NEXT: vmv1r.v v10, v12
-; ZIP-NEXT: vslidedown.vi v11, v12, 4
-; ZIP-NEXT: vslidedown.vi v12, v12, 6
-; ZIP-NEXT: srli s1, s1, 1
-; ZIP-NEXT: csrr a2, vlenb
-; ZIP-NEXT: add a2, sp, a2
-; ZIP-NEXT: addi a2, a2, 16
-; ZIP-NEXT: vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
-; ZIP-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; ZIP-NEXT: vslideup.vx v13, v8, a0
-; ZIP-NEXT: vsetvli zero, s3, e8, mf2, tu, ma
-; ZIP-NEXT: vslideup.vx v10, v9, s0
-; ZIP-NEXT: add a2, s1, s1
-; ZIP-NEXT: vsetvli zero, s4, e8, mf2, tu, ma
-; ZIP-NEXT: vslideup.vx v10, v11, s2
-; ZIP-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
-; ZIP-NEXT: vslideup.vx v10, v12, a0
-; ZIP-NEXT: vsetvli zero, a2, e8, m1, ta, ma
-; ZIP-NEXT: vslideup.vx v10, v13, s1
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: slli a0, a0, 1
-; ZIP-NEXT: add a0, sp, a0
-; ZIP-NEXT: addi a0, a0, 16
-; ZIP-NEXT: vs1r.v v10, (a0)
-; ZIP-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
-; ZIP-NEXT: vlseg8e8.v v8, (a0)
-; ZIP-NEXT: csrr a0, vlenb
-; ZIP-NEXT: slli a1, a0, 1
-; ZIP-NEXT: add a0, a1, a0
-; ZIP-NEXT: add sp, sp, a0
-; ZIP-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
-; ZIP-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
-; ZIP-NEXT: ld s1, 40(sp) # 8-byte Folded Reload
-; ZIP-NEXT: ld s2, 32(sp) # 8-byte Folded Reload
-; ZIP-NEXT: ld s3, 24(sp) # 8-byte Folded Reload
-; ZIP-NEXT: ld s4, 16(sp) # 8-byte Folded Reload
-; ZIP-NEXT: addi sp, sp, 64
-; ZIP-NEXT: ret
+; CHECK-LABEL: vector_deinterleave8_v16i8_v2i8:
+; CHECK: # %bb.0:
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: sub sp, sp, a0
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: vsetivli zero, 2, e8, m1, ta, ma
+; CHECK-NEXT: vslidedown.vi v9, v8, 10
+; CHECK-NEXT: vslidedown.vi v10, v8, 8
+; CHECK-NEXT: vslidedown.vi v11, v8, 12
+; CHECK-NEXT: vslidedown.vi v12, v8, 14
+; CHECK-NEXT: vslidedown.vi v13, v8, 2
+; CHECK-NEXT: vslidedown.vi v14, v8, 4
+; CHECK-NEXT: vslidedown.vi v15, v8, 6
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: srli a2, a0, 2
+; CHECK-NEXT: srli a0, a0, 1
+; CHECK-NEXT: add a3, a1, a1
+; CHECK-NEXT: add a4, a2, a1
+; CHECK-NEXT: slli a5, a1, 1
+; CHECK-NEXT: add a6, a0, a0
+; CHECK-NEXT: vsetvli zero, a3, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vx v10, v9, a1
+; CHECK-NEXT: add a5, a5, a1
+; CHECK-NEXT: vslideup.vx v8, v13, a1
+; CHECK-NEXT: vsetvli zero, a4, e8, mf2, tu, ma
+; CHECK-NEXT: vslideup.vx v10, v11, a2
+; CHECK-NEXT: add a1, a5, a1
+; CHECK-NEXT: vslideup.vx v8, v14, a2
+; CHECK-NEXT: vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT: vslideup.vx v10, v12, a5
+; CHECK-NEXT: vslideup.vx v8, v15, a5
+; CHECK-NEXT: vsetvli zero, a6, e8, m1, ta, ma
+; CHECK-NEXT: vslideup.vx v8, v10, a0
+; CHECK-NEXT: addi a0, sp, 16
+; CHECK-NEXT: vs1r.v v8, (a0)
+; CHECK-NEXT: vsetvli a1, zero, e8, mf8, ta, ma
+; CHECK-NEXT: vlseg8e8.v v8, (a0)
+; CHECK-NEXT: csrr a0, vlenb
+; CHECK-NEXT: add sp, sp, a0
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: ret
%res = call {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @llvm.vector.deinterleave8.v16i8(<16 x i8> %v)
ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res
}
@@ -1221,3 +819,6 @@ define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>,
%res = call {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @llvm.vector.deinterleave8.v8f16(<8 x half> %v)
ret {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} %res
}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}
>From 7d93db61d15fbac338c4c691811e7bc5960bcd6a Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 21 May 2025 12:54:53 +0100
Subject: [PATCH 5/5] Use >=/<= and update comment
---
llvm/include/llvm/IR/Intrinsics.h | 10 +++-------
1 file changed, 3 insertions(+), 7 deletions(-)
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index b64784909fc25..5a810784a54bf 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -169,7 +169,7 @@ namespace Intrinsic {
AArch64Svcount,
} Kind;
- // These three have to be contiguous.
+ // These six have to be contiguous.
static_assert(OneFourthVecArgument == OneThirdVecArgument + 1 &&
OneFifthVecArgument == OneFourthVecArgument + 1 &&
OneSixthVecArgument == OneFifthVecArgument + 1 &&
@@ -194,9 +194,7 @@ namespace Intrinsic {
unsigned getArgumentNumber() const {
assert(Kind == Argument || Kind == ExtendArgument ||
Kind == TruncArgument || Kind == HalfVecArgument ||
- Kind == OneThirdVecArgument || Kind == OneFourthVecArgument ||
- Kind == OneFifthVecArgument || Kind == OneSixthVecArgument ||
- Kind == OneSeventhVecArgument || Kind == OneEighthVecArgument ||
+ (Kind >= OneThirdVecArgument && Kind <= OneEighthVecArgument) ||
Kind == SameVecWidthArgument || Kind == VecElementArgument ||
Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
Kind == VecOfBitcastsToInt);
@@ -205,9 +203,7 @@ namespace Intrinsic {
ArgKind getArgumentKind() const {
assert(Kind == Argument || Kind == ExtendArgument ||
Kind == TruncArgument || Kind == HalfVecArgument ||
- Kind == OneThirdVecArgument || Kind == OneFourthVecArgument ||
- Kind == OneFifthVecArgument || Kind == OneSixthVecArgument ||
- Kind == OneSeventhVecArgument || Kind == OneEighthVecArgument ||
+ (Kind >= OneThirdVecArgument && Kind <= OneEighthVecArgument) ||
Kind == SameVecWidthArgument || Kind == VecElementArgument ||
Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
Kind == VecOfBitcastsToInt);
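
(For illustration only: below is a minimal standalone C++ sketch of the pattern this last patch applies, using hypothetical enumerator names rather than the real ArgKind enumerators in Intrinsics.h. The point is that once a static_assert pins the fractional-vector kinds to consecutive values, the long chain of equality tests in the asserts can be collapsed into one range comparison without changing behaviour, and any future insertion that breaks the contiguity fails at compile time instead of silently mis-classifying a kind.)

    // Standalone sketch (not LLVM code) of the contiguous-enum range-check
    // pattern from [PATCH 5/5]. Enumerator names are hypothetical.
    #include <cassert>

    enum ArgKind {
      HalfVec,
      OneThirdVec,
      OneFourthVec,
      OneFifthVec,
      OneSixthVec,
      OneSeventhVec,
      OneEighthVec,
      SameVecWidth,
    };

    // Pin down the contiguity the range check below relies on; inserting a
    // new enumerator in the middle breaks the build here rather than making
    // the check accept or reject the wrong kinds.
    static_assert(OneFourthVec == OneThirdVec + 1 &&
                      OneFifthVec == OneFourthVec + 1 &&
                      OneSixthVec == OneFifthVec + 1 &&
                      OneSeventhVec == OneSixthVec + 1 &&
                      OneEighthVec == OneSeventhVec + 1,
                  "fractional-vector kinds must stay contiguous");

    // Equivalent to Kind == OneThirdVec || Kind == OneFourthVec || ... ||
    // Kind == OneEighthVec, but shorter and harder to get out of sync when
    // more kinds (e.g. for new interleave factors) are added at the ends.
    bool isFractionalVecKind(ArgKind Kind) {
      return Kind >= OneThirdVec && Kind <= OneEighthVec;
    }

    int main() {
      assert(isFractionalVecKind(OneSixthVec));
      assert(!isFractionalVecKind(SameVecWidth));
      return 0;
    }

The trade-off is that the shortened check is only correct while the enumerators stay contiguous, which is exactly what the accompanying static_assert guards.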