[llvm] [IR] Consolidate OneNthElements IIT descriptors. NFCI (PR #141492)

Luke Lau via llvm-commits llvm-commits at lists.llvm.org
Mon May 26 06:40:25 PDT 2025


https://github.com/lukel97 created https://github.com/llvm/llvm-project/pull/141492

This replaces LLVMHalfElementsVectorType and LLVMOne{Third,Fourth,Fifth,Sixth,Seventh,Eighth}ElementsVectorType with a single parameterized IIT descriptor.

In the type signature, the descriptor is encoded as a reference to the vector type to match followed by the divisor N, and inside IITDescriptor the two values are packed into two 16-bit parts, similarly to LLVMVectorOfAnyPointersToElt.
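
To illustrate the shape of the consolidated descriptor, here is a rough TableGen sketch. The names (IIT_ONE_NTH_ELTS_VEC_ARG, LLVMOneNthElementsVectorType), the opcode number and the emitter wiring are assumptions for the sketch, not the patch's actual code:

// Sketch only: one IIT code covers every 1/N case.
def IIT_ONE_NTH_ELTS_VEC_ARG : IIT_Base<70>; // placeholder opcode

// Replaces the Half/Third/.../Eighth classes: both the matched argument
// index and the divisor N are template parameters.
class LLVMOneNthElementsVectorType<int idx, int n>
  : LLVMMatchType<idx, IIT_ONE_NTH_ELTS_VEC_ARG> {
  // The signature-building code (not shown here) would emit Divisor
  // immediately after the reference to argument <idx>.
  int Divisor = n;
}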

This also allows us to use a TableGen foreach to declare the [de]interleave intrinsics.
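
A rough sketch of what that foreach could look like, again using the assumed LLVMOneNthElementsVectorType class from the sketch above (the landed code may differ):

foreach factor = [2, 3, 4, 5, 6, 7, 8] in {
  // llvm.vector.interleaveN: N narrow vectors in, one wide vector out.
  def int_vector_interleave#factor
      : DefaultAttrsIntrinsic<
            [llvm_anyvector_ty],
            !listsplat(LLVMOneNthElementsVectorType<0, factor>, factor),
            [IntrNoMem]>;

  // llvm.vector.deinterleaveN: one wide vector in, N narrow vectors out.
  def int_vector_deinterleave#factor
      : DefaultAttrsIntrinsic<
            !listsplat(LLVMOneNthElementsVectorType<0, factor>, factor),
            [llvm_anyvector_ty],
            [IntrNoMem]>;
}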

Stacked on #139893

From 915c27d908e4a6ba43b35c957bf7aacbf408dc91 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 12 May 2025 07:08:16 +0300
Subject: [PATCH 1/6] [IR] Add llvm.vector.(de)interleave4/6/8

---
 llvm/docs/LangRef.rst                         |    10 +-
 llvm/include/llvm/IR/Intrinsics.h             |    30 +-
 llvm/include/llvm/IR/Intrinsics.td            |    66 +
 .../SelectionDAG/SelectionDAGBuilder.cpp      |    18 +
 llvm/lib/IR/Intrinsics.cpp                    |    28 +-
 .../RISCV/rvv/vector-deinterleave-fixed.ll    |   502 +-
 .../CodeGen/RISCV/rvv/vector-deinterleave.ll  |  1753 +-
 .../RISCV/rvv/vector-interleave-fixed.ll      |   787 +-
 .../CodeGen/RISCV/rvv/vector-interleave.ll    | 13856 +++++++++++-----
 9 files changed, 12995 insertions(+), 4055 deletions(-)

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 7296bb84b7d95..c0bc0a10ed537 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -20158,7 +20158,7 @@ Arguments:
 
 The argument to this intrinsic must be a vector.
 
-'``llvm.vector.deinterleave2/3/5/7``' Intrinsic
+'``llvm.vector.deinterleave2/3/4/5/6/7/8``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
@@ -20176,8 +20176,8 @@ This is an overloaded intrinsic.
 Overview:
 """""""""
 
-The '``llvm.vector.deinterleave2/3/5/7``' intrinsics deinterleave adjacent lanes
-into 2, 3, 5, and 7 separate vectors, respectively, and return them as the
+The '``llvm.vector.deinterleave2/3/4/5/6/7/8``' intrinsics deinterleave adjacent lanes
+into 2 through 8 separate vectors, respectively, and return them as the
 result.
 
 This intrinsic works for both fixed and scalable vectors. While this intrinsic
@@ -20199,7 +20199,7 @@ Arguments:
 The argument is a vector whose type corresponds to the logical concatenation of
 the aggregated result types.
 
-'``llvm.vector.interleave2/3/5/7``' Intrinsic
+'``llvm.vector.interleave2/3/4/5/6/7/8``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Syntax:
@@ -20217,7 +20217,7 @@ This is an overloaded intrinsic.
 Overview:
 """""""""
 
-The '``llvm.vector.interleave2/3/5/7``' intrinsic constructs a vector
+The '``llvm.vector.interleave2/3/4/5/6/7/8``' intrinsics construct a vector
 by interleaving all the input vectors.
 
 This intrinsic works for both fixed and scalable vectors. While this intrinsic
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 6fb1bf9359b9a..b64784909fc25 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -153,8 +153,11 @@ namespace Intrinsic {
       TruncArgument,
       HalfVecArgument,
       OneThirdVecArgument,
+      OneFourthVecArgument,
       OneFifthVecArgument,
+      OneSixthVecArgument,
       OneSeventhVecArgument,
+      OneEighthVecArgument,
       SameVecWidthArgument,
       VecOfAnyPtrsToElt,
       VecElementArgument,
@@ -167,8 +170,11 @@ namespace Intrinsic {
     } Kind;
 
-    // These three have to be contiguous.
+    // These six have to be contiguous.
-    static_assert(OneFifthVecArgument == OneThirdVecArgument + 1 &&
-                  OneSeventhVecArgument == OneFifthVecArgument + 1);
+    static_assert(OneFourthVecArgument == OneThirdVecArgument + 1 &&
+                  OneFifthVecArgument == OneFourthVecArgument + 1 &&
+                  OneSixthVecArgument == OneFifthVecArgument + 1 &&
+                  OneSeventhVecArgument == OneSixthVecArgument + 1 &&
+                  OneEighthVecArgument == OneSeventhVecArgument + 1);
     union {
       unsigned Integer_Width;
       unsigned Float_Width;
@@ -188,19 +194,23 @@ namespace Intrinsic {
     unsigned getArgumentNumber() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
              Kind == TruncArgument || Kind == HalfVecArgument ||
-             Kind == OneThirdVecArgument || Kind == OneFifthVecArgument ||
-             Kind == OneSeventhVecArgument || Kind == SameVecWidthArgument ||
-             Kind == VecElementArgument || Kind == Subdivide2Argument ||
-             Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
+             Kind == OneThirdVecArgument || Kind == OneFourthVecArgument ||
+             Kind == OneFifthVecArgument || Kind == OneSixthVecArgument ||
+             Kind == OneSeventhVecArgument || Kind == OneEighthVecArgument ||
+             Kind == SameVecWidthArgument || Kind == VecElementArgument ||
+             Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
+             Kind == VecOfBitcastsToInt);
       return Argument_Info >> 3;
     }
     ArgKind getArgumentKind() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
              Kind == TruncArgument || Kind == HalfVecArgument ||
-             Kind == OneThirdVecArgument || Kind == OneFifthVecArgument ||
-             Kind == OneSeventhVecArgument || Kind == SameVecWidthArgument ||
-             Kind == VecElementArgument || Kind == Subdivide2Argument ||
-             Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
+             Kind == OneThirdVecArgument || Kind == OneFourthVecArgument ||
+             Kind == OneFifthVecArgument || Kind == OneSixthVecArgument ||
+             Kind == OneSeventhVecArgument || Kind == OneEighthVecArgument ||
+             Kind == SameVecWidthArgument || Kind == VecElementArgument ||
+             Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
+             Kind == VecOfBitcastsToInt);
       return (ArgKind)(Argument_Info & 7);
     }
 
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 8d26961eebbf3..3994a543f9dcf 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -340,6 +340,9 @@ def IIT_ONE_FIFTH_VEC_ARG : IIT_Base<63>;
 def IIT_ONE_SEVENTH_VEC_ARG : IIT_Base<64>;
 def IIT_V2048: IIT_Vec<2048, 65>;
 def IIT_V4096: IIT_Vec<4096, 66>;
+def IIT_ONE_FOURTH_VEC_ARG : IIT_Base<67>;
+def IIT_ONE_SIXTH_VEC_ARG : IIT_Base<68>;
+def IIT_ONE_EIGHTH_VEC_ARG : IIT_Base<69>;
 }
 
 defvar IIT_all_FixedTypes = !filter(iit, IIT_all,
@@ -483,12 +486,21 @@ class LLVMHalfElementsVectorType<int num>
 class LLVMOneThirdElementsVectorType<int num>
   : LLVMMatchType<num, IIT_ONE_THIRD_VEC_ARG>;
 
+class LLVMOneFourthElementsVectorType<int num>
+  : LLVMMatchType<num, IIT_ONE_FOURTH_VEC_ARG>;
+
 class LLVMOneFifthElementsVectorType<int num>
   : LLVMMatchType<num, IIT_ONE_FIFTH_VEC_ARG>;
 
+class LLVMOneSixthElementsVectorType<int num>
+  : LLVMMatchType<num, IIT_ONE_SIXTH_VEC_ARG>;
+
 class LLVMOneSeventhElementsVectorType<int num>
   : LLVMMatchType<num, IIT_ONE_SEVENTH_VEC_ARG>;
 
+class LLVMOneEighthElementsVectorType<int num>
+  : LLVMMatchType<num, IIT_ONE_EIGHTH_VEC_ARG>;
+
 // Match the type of another intrinsic parameter that is expected to be a
 // vector type (i.e. <N x iM>) but with each element subdivided to
 // form a vector with more elements that are smaller than the original.
@@ -2776,6 +2788,20 @@ def int_vector_deinterleave3 : DefaultAttrsIntrinsic<[LLVMOneThirdElementsVector
                                                      [llvm_anyvector_ty],
                                                      [IntrNoMem]>;
 
+def int_vector_interleave4   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                                                     [LLVMOneFourthElementsVectorType<0>,
+                                                      LLVMOneFourthElementsVectorType<0>,
+                                                      LLVMOneFourthElementsVectorType<0>,
+                                                      LLVMOneFourthElementsVectorType<0>],
+                                                     [IntrNoMem]>;
+
+def int_vector_deinterleave4 : DefaultAttrsIntrinsic<[LLVMOneFourthElementsVectorType<0>,
+                                                      LLVMOneFourthElementsVectorType<0>,
+                                                      LLVMOneFourthElementsVectorType<0>,
+                                                      LLVMOneFourthElementsVectorType<0>],
+                                                     [llvm_anyvector_ty],
+                                                     [IntrNoMem]>;
+
 def int_vector_interleave5   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                                                      [LLVMOneFifthElementsVectorType<0>,
                                                       LLVMOneFifthElementsVectorType<0>,
@@ -2792,6 +2818,24 @@ def int_vector_deinterleave5 : DefaultAttrsIntrinsic<[LLVMOneFifthElementsVector
                                                      [llvm_anyvector_ty],
                                                      [IntrNoMem]>;
 
+def int_vector_interleave6   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                                                     [LLVMOneSixthElementsVectorType<0>,
+                                                      LLVMOneSixthElementsVectorType<0>,
+                                                      LLVMOneSixthElementsVectorType<0>,
+                                                      LLVMOneSixthElementsVectorType<0>,
+                                                      LLVMOneSixthElementsVectorType<0>,
+                                                      LLVMOneSixthElementsVectorType<0>],
+                                                     [IntrNoMem]>;
+
+def int_vector_deinterleave6 : DefaultAttrsIntrinsic<[LLVMOneSixthElementsVectorType<0>,
+                                                      LLVMOneSixthElementsVectorType<0>,
+                                                      LLVMOneSixthElementsVectorType<0>,
+                                                      LLVMOneSixthElementsVectorType<0>,
+                                                      LLVMOneSixthElementsVectorType<0>,
+                                                      LLVMOneSixthElementsVectorType<0>],
+                                                     [llvm_anyvector_ty],
+                                                     [IntrNoMem]>;
+
 def int_vector_interleave7   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                                                      [LLVMOneSeventhElementsVectorType<0>,
                                                       LLVMOneSeventhElementsVectorType<0>,
@@ -2812,6 +2856,28 @@ def int_vector_deinterleave7 : DefaultAttrsIntrinsic<[LLVMOneSeventhElementsVect
                                                      [llvm_anyvector_ty],
                                                      [IntrNoMem]>;
 
+def int_vector_interleave8   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                                                     [LLVMOneEighthElementsVectorType<0>,
+                                                      LLVMOneEighthElementsVectorType<0>,
+                                                      LLVMOneEighthElementsVectorType<0>,
+                                                      LLVMOneEighthElementsVectorType<0>,
+                                                      LLVMOneEighthElementsVectorType<0>,
+                                                      LLVMOneEighthElementsVectorType<0>,
+                                                      LLVMOneEighthElementsVectorType<0>,
+                                                      LLVMOneEighthElementsVectorType<0>],
+                                                     [IntrNoMem]>;
+
+def int_vector_deinterleave8 : DefaultAttrsIntrinsic<[LLVMOneEighthElementsVectorType<0>,
+                                                      LLVMOneEighthElementsVectorType<0>,
+                                                      LLVMOneEighthElementsVectorType<0>,
+                                                      LLVMOneEighthElementsVectorType<0>,
+                                                      LLVMOneEighthElementsVectorType<0>,
+                                                      LLVMOneEighthElementsVectorType<0>,
+                                                      LLVMOneEighthElementsVectorType<0>,
+                                                      LLVMOneEighthElementsVectorType<0>],
+                                                     [llvm_anyvector_ty],
+                                                     [IntrNoMem]>;
+
 //===-------------- Intrinsics to perform partial reduction ---------------===//
 
 def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMMatchType<0>],
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 9d138d364bad7..10ee75a83a267 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -8181,24 +8181,42 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
   case Intrinsic::vector_interleave3:
     visitVectorInterleave(I, 3);
     return;
+  case Intrinsic::vector_interleave4:
+    visitVectorInterleave(I, 4);
+    return;
   case Intrinsic::vector_interleave5:
     visitVectorInterleave(I, 5);
     return;
+  case Intrinsic::vector_interleave6:
+    visitVectorInterleave(I, 6);
+    return;
   case Intrinsic::vector_interleave7:
     visitVectorInterleave(I, 7);
     return;
+  case Intrinsic::vector_interleave8:
+    visitVectorInterleave(I, 8);
+    return;
   case Intrinsic::vector_deinterleave2:
     visitVectorDeinterleave(I, 2);
     return;
   case Intrinsic::vector_deinterleave3:
     visitVectorDeinterleave(I, 3);
     return;
+  case Intrinsic::vector_deinterleave4:
+    visitVectorDeinterleave(I, 4);
+    return;
   case Intrinsic::vector_deinterleave5:
     visitVectorDeinterleave(I, 5);
     return;
+  case Intrinsic::vector_deinterleave6:
+    visitVectorDeinterleave(I, 6);
+    return;
   case Intrinsic::vector_deinterleave7:
     visitVectorDeinterleave(I, 7);
     return;
+  case Intrinsic::vector_deinterleave8:
+    visitVectorDeinterleave(I, 8);
+    return;
   case Intrinsic::experimental_vector_compress:
     setValue(&I, DAG.getNode(ISD::VECTOR_COMPRESS, sdl,
                              getValue(I.getArgOperand(0)).getValueType(),
diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp
index dabb5fe006b3c..28f7523476774 100644
--- a/llvm/lib/IR/Intrinsics.cpp
+++ b/llvm/lib/IR/Intrinsics.cpp
@@ -378,18 +378,36 @@ DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
         IITDescriptor::get(IITDescriptor::OneThirdVecArgument, ArgInfo));
     return;
   }
+  case IIT_ONE_FOURTH_VEC_ARG: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(
+        IITDescriptor::get(IITDescriptor::OneFourthVecArgument, ArgInfo));
+    return;
+  }
   case IIT_ONE_FIFTH_VEC_ARG: {
     unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
     OutputTable.push_back(
         IITDescriptor::get(IITDescriptor::OneFifthVecArgument, ArgInfo));
     return;
   }
+  case IIT_ONE_SIXTH_VEC_ARG: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(
+        IITDescriptor::get(IITDescriptor::OneSixthVecArgument, ArgInfo));
+    return;
+  }
   case IIT_ONE_SEVENTH_VEC_ARG: {
     unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
     OutputTable.push_back(
         IITDescriptor::get(IITDescriptor::OneSeventhVecArgument, ArgInfo));
     return;
   }
+  case IIT_ONE_EIGHTH_VEC_ARG: {
+    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    OutputTable.push_back(
+        IITDescriptor::get(IITDescriptor::OneEighthVecArgument, ArgInfo));
+    return;
+  }
   case IIT_SAME_VEC_WIDTH_ARG: {
     unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
     OutputTable.push_back(
@@ -584,11 +602,14 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
     return VectorType::getHalfElementsVectorType(
         cast<VectorType>(Tys[D.getArgumentNumber()]));
   case IITDescriptor::OneThirdVecArgument:
+  case IITDescriptor::OneFourthVecArgument:
   case IITDescriptor::OneFifthVecArgument:
+  case IITDescriptor::OneSixthVecArgument:
   case IITDescriptor::OneSeventhVecArgument:
+  case IITDescriptor::OneEighthVecArgument:
     return VectorType::getOneNthElementsVectorType(
         cast<VectorType>(Tys[D.getArgumentNumber()]),
-        3 + (D.Kind - IITDescriptor::OneThirdVecArgument) * 2);
+        3 + (D.Kind - IITDescriptor::OneThirdVecArgument));
   case IITDescriptor::SameVecWidthArgument: {
     Type *EltTy = DecodeFixedType(Infos, Tys, Context);
     Type *Ty = Tys[D.getArgumentNumber()];
@@ -974,15 +995,18 @@ matchIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor> &Infos,
            VectorType::getHalfElementsVectorType(
                cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
   case IITDescriptor::OneThirdVecArgument:
+  case IITDescriptor::OneFourthVecArgument:
   case IITDescriptor::OneFifthVecArgument:
+  case IITDescriptor::OneSixthVecArgument:
   case IITDescriptor::OneSeventhVecArgument:
+  case IITDescriptor::OneEighthVecArgument:
     // If this is a forward reference, defer the check for later.
     if (D.getArgumentNumber() >= ArgTys.size())
       return IsDeferredCheck || DeferCheck(Ty);
     return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
            VectorType::getOneNthElementsVectorType(
                cast<VectorType>(ArgTys[D.getArgumentNumber()]),
-               3 + (D.Kind - IITDescriptor::OneThirdVecArgument) * 2) != Ty;
+               3 + (D.Kind - IITDescriptor::OneThirdVecArgument)) != Ty;
   case IITDescriptor::SameVecWidthArgument: {
     if (D.getArgumentNumber() >= ArgTys.size()) {
       // Defer check and subsequent check for the vector element type.
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index f6b5a35aa06d6..a3ad0b26efd4d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -223,6 +223,41 @@ define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x
 	   ret {<2 x i32>, <2 x i32>, <2 x i32>} %res
 }
 
+define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave4_v2i32_v8i32(<8 x i32> %v) {
+; CHECK-LABEL: vector_deinterleave4_v2i32_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v8, 6
+; CHECK-NEXT:    vslidedown.vi v12, v8, 4
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v9, v8, 2
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v12, v10, a0
+; CHECK-NEXT:    vslideup.vx v8, v9, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vmv.v.v v9, v12
+; CHECK-NEXT:    vs2r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vlseg4e32.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    ret
+	   %res = call {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @llvm.vector.deinterleave4.v8i32(<8 x i32> %v)
+	   ret {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} %res
+}
 
 define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave5_v2i16_v10i16(<10 x i16> %v) {
 ; CHECK-LABEL: vector_deinterleave5_v2i16_v10i16:
@@ -265,6 +300,49 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterle
 	   ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res
 }
 
+define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave6_v2i16_v12i16(<12 x i16> %v) {
+; CHECK-LABEL: vector_deinterleave6_v2i16_v12i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetivli zero, 2, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v14, v8, 6
+; CHECK-NEXT:    vslidedown.vi v15, v8, 4
+; CHECK-NEXT:    vslidedown.vi v16, v8, 2
+; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v8, 10
+; CHECK-NEXT:    vslidedown.vi v12, v8, 8
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a2, a1, a1
+; CHECK-NEXT:    add a3, a0, a0
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v15, v14, a1
+; CHECK-NEXT:    vslideup.vx v8, v16, a1
+; CHECK-NEXT:    vslideup.vx v12, v10, a1
+; CHECK-NEXT:    vsetvli zero, a3, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v15, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vmv1r.v v9, v12
+; CHECK-NEXT:    vs2r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vlseg6e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    ret
+	   %res = call {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @llvm.vector.deinterleave6.v12i16(<12 x i16> %v)
+	   ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res
+}
+
 define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave7_v14i8_v2i8(<14 x i8> %v) {
 ; RV32-LABEL: vector_deinterleave7_v14i8_v2i8:
 ; RV32:       # %bb.0:
@@ -542,6 +620,300 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @v
 	   ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res
 }
 
+define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave8_v16i8_v2i8(<16 x i8> %v) {
+; RV32-LABEL: vector_deinterleave8_v16i8_v2i8:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -48
+; RV32-NEXT:    .cfi_def_cfa_offset 48
+; RV32-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s4, 24(sp) # 4-byte Folded Spill
+; RV32-NEXT:    .cfi_offset ra, -4
+; RV32-NEXT:    .cfi_offset s0, -8
+; RV32-NEXT:    .cfi_offset s1, -12
+; RV32-NEXT:    .cfi_offset s2, -16
+; RV32-NEXT:    .cfi_offset s3, -20
+; RV32-NEXT:    .cfi_offset s4, -24
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a1, a0, 1
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 3 * vlenb
+; RV32-NEXT:    addi a0, sp, 16
+; RV32-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT:    csrr s1, vlenb
+; RV32-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 10
+; RV32-NEXT:    vslidedown.vi v9, v8, 8
+; RV32-NEXT:    srli s0, s1, 3
+; RV32-NEXT:    srli s2, s1, 2
+; RV32-NEXT:    add s3, s0, s0
+; RV32-NEXT:    add s4, s2, s0
+; RV32-NEXT:    vsetvli zero, s3, e8, mf2, tu, ma
+; RV32-NEXT:    vslideup.vx v9, v10, s0
+; RV32-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v10, v8, 12
+; RV32-NEXT:    vsetvli zero, s4, e8, mf2, tu, ma
+; RV32-NEXT:    vslideup.vx v9, v10, s2
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV32-NEXT:    li a1, 3
+; RV32-NEXT:    mv a0, s0
+; RV32-NEXT:    call __mulsi3
+; RV32-NEXT:    add a1, a0, s0
+; RV32-NEXT:    addi a2, sp, 16
+; RV32-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; RV32-NEXT:    vslidedown.vi v8, v12, 14
+; RV32-NEXT:    vslidedown.vi v9, v12, 2
+; RV32-NEXT:    vmv1r.v v10, v12
+; RV32-NEXT:    vslidedown.vi v11, v12, 4
+; RV32-NEXT:    vslidedown.vi v12, v12, 6
+; RV32-NEXT:    srli s1, s1, 1
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    add a2, sp, a2
+; RV32-NEXT:    addi a2, a2, 16
+; RV32-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; RV32-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT:    vslideup.vx v13, v8, a0
+; RV32-NEXT:    vsetvli zero, s3, e8, mf2, tu, ma
+; RV32-NEXT:    vslideup.vx v10, v9, s0
+; RV32-NEXT:    add a2, s1, s1
+; RV32-NEXT:    vsetvli zero, s4, e8, mf2, tu, ma
+; RV32-NEXT:    vslideup.vx v10, v11, s2
+; RV32-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; RV32-NEXT:    vslideup.vx v10, v12, a0
+; RV32-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; RV32-NEXT:    vslideup.vx v10, v13, s1
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 1
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 16
+; RV32-NEXT:    vs1r.v v10, (a0)
+; RV32-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV32-NEXT:    vlseg8e8.v v8, (a0)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a1, a0, 1
+; RV32-NEXT:    add a0, a1, a0
+; RV32-NEXT:    add sp, sp, a0
+; RV32-NEXT:    .cfi_def_cfa sp, 48
+; RV32-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s2, 32(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s3, 28(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s4, 24(sp) # 4-byte Folded Reload
+; RV32-NEXT:    .cfi_restore ra
+; RV32-NEXT:    .cfi_restore s0
+; RV32-NEXT:    .cfi_restore s1
+; RV32-NEXT:    .cfi_restore s2
+; RV32-NEXT:    .cfi_restore s3
+; RV32-NEXT:    .cfi_restore s4
+; RV32-NEXT:    addi sp, sp, 48
+; RV32-NEXT:    .cfi_def_cfa_offset 0
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_deinterleave8_v16i8_v2i8:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -64
+; RV64-NEXT:    .cfi_def_cfa_offset 64
+; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s2, 32(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s4, 16(sp) # 8-byte Folded Spill
+; RV64-NEXT:    .cfi_offset ra, -8
+; RV64-NEXT:    .cfi_offset s0, -16
+; RV64-NEXT:    .cfi_offset s1, -24
+; RV64-NEXT:    .cfi_offset s2, -32
+; RV64-NEXT:    .cfi_offset s3, -40
+; RV64-NEXT:    .cfi_offset s4, -48
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a1, a0, 1
+; RV64-NEXT:    add a0, a1, a0
+; RV64-NEXT:    sub sp, sp, a0
+; RV64-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 3 * vlenb
+; RV64-NEXT:    addi a0, sp, 16
+; RV64-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT:    csrr s1, vlenb
+; RV64-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 10
+; RV64-NEXT:    vslidedown.vi v9, v8, 8
+; RV64-NEXT:    srli s0, s1, 3
+; RV64-NEXT:    srli s2, s1, 2
+; RV64-NEXT:    add s3, s0, s0
+; RV64-NEXT:    add s4, s2, s0
+; RV64-NEXT:    vsetvli zero, s3, e8, mf2, tu, ma
+; RV64-NEXT:    vslideup.vx v9, v10, s0
+; RV64-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v10, v8, 12
+; RV64-NEXT:    vsetvli zero, s4, e8, mf2, tu, ma
+; RV64-NEXT:    vslideup.vx v9, v10, s2
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; RV64-NEXT:    li a1, 3
+; RV64-NEXT:    mv a0, s0
+; RV64-NEXT:    call __muldi3
+; RV64-NEXT:    add a1, a0, s0
+; RV64-NEXT:    addi a2, sp, 16
+; RV64-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; RV64-NEXT:    vslidedown.vi v8, v12, 14
+; RV64-NEXT:    vslidedown.vi v9, v12, 2
+; RV64-NEXT:    vmv1r.v v10, v12
+; RV64-NEXT:    vslidedown.vi v11, v12, 4
+; RV64-NEXT:    vslidedown.vi v12, v12, 6
+; RV64-NEXT:    srli s1, s1, 1
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    add a2, sp, a2
+; RV64-NEXT:    addi a2, a2, 16
+; RV64-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; RV64-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT:    vslideup.vx v13, v8, a0
+; RV64-NEXT:    vsetvli zero, s3, e8, mf2, tu, ma
+; RV64-NEXT:    vslideup.vx v10, v9, s0
+; RV64-NEXT:    add a2, s1, s1
+; RV64-NEXT:    vsetvli zero, s4, e8, mf2, tu, ma
+; RV64-NEXT:    vslideup.vx v10, v11, s2
+; RV64-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; RV64-NEXT:    vslideup.vx v10, v12, a0
+; RV64-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; RV64-NEXT:    vslideup.vx v10, v13, s1
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 1
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 16
+; RV64-NEXT:    vs1r.v v10, (a0)
+; RV64-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; RV64-NEXT:    vlseg8e8.v v8, (a0)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a1, a0, 1
+; RV64-NEXT:    add a0, a1, a0
+; RV64-NEXT:    add sp, sp, a0
+; RV64-NEXT:    .cfi_def_cfa sp, 64
+; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s2, 32(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s3, 24(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s4, 16(sp) # 8-byte Folded Reload
+; RV64-NEXT:    .cfi_restore ra
+; RV64-NEXT:    .cfi_restore s0
+; RV64-NEXT:    .cfi_restore s1
+; RV64-NEXT:    .cfi_restore s2
+; RV64-NEXT:    .cfi_restore s3
+; RV64-NEXT:    .cfi_restore s4
+; RV64-NEXT:    addi sp, sp, 64
+; RV64-NEXT:    .cfi_def_cfa_offset 0
+; RV64-NEXT:    ret
+;
+; ZIP-LABEL: vector_deinterleave8_v16i8_v2i8:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    addi sp, sp, -64
+; ZIP-NEXT:    .cfi_def_cfa_offset 64
+; ZIP-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    sd s2, 32(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    sd s4, 16(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    .cfi_offset ra, -8
+; ZIP-NEXT:    .cfi_offset s0, -16
+; ZIP-NEXT:    .cfi_offset s1, -24
+; ZIP-NEXT:    .cfi_offset s2, -32
+; ZIP-NEXT:    .cfi_offset s3, -40
+; ZIP-NEXT:    .cfi_offset s4, -48
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a1, a0, 1
+; ZIP-NEXT:    add a0, a1, a0
+; ZIP-NEXT:    sub sp, sp, a0
+; ZIP-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 3 * vlenb
+; ZIP-NEXT:    addi a0, sp, 16
+; ZIP-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
+; ZIP-NEXT:    csrr s1, vlenb
+; ZIP-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; ZIP-NEXT:    vslidedown.vi v10, v8, 10
+; ZIP-NEXT:    vslidedown.vi v9, v8, 8
+; ZIP-NEXT:    srli s0, s1, 3
+; ZIP-NEXT:    srli s2, s1, 2
+; ZIP-NEXT:    add s3, s0, s0
+; ZIP-NEXT:    add s4, s2, s0
+; ZIP-NEXT:    vsetvli zero, s3, e8, mf2, tu, ma
+; ZIP-NEXT:    vslideup.vx v9, v10, s0
+; ZIP-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; ZIP-NEXT:    vslidedown.vi v10, v8, 12
+; ZIP-NEXT:    vsetvli zero, s4, e8, mf2, tu, ma
+; ZIP-NEXT:    vslideup.vx v9, v10, s2
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    add a0, sp, a0
+; ZIP-NEXT:    addi a0, a0, 16
+; ZIP-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
+; ZIP-NEXT:    li a1, 3
+; ZIP-NEXT:    mv a0, s0
+; ZIP-NEXT:    call __muldi3
+; ZIP-NEXT:    add a1, a0, s0
+; ZIP-NEXT:    addi a2, sp, 16
+; ZIP-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
+; ZIP-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; ZIP-NEXT:    vslidedown.vi v8, v12, 14
+; ZIP-NEXT:    vslidedown.vi v9, v12, 2
+; ZIP-NEXT:    vmv1r.v v10, v12
+; ZIP-NEXT:    vslidedown.vi v11, v12, 4
+; ZIP-NEXT:    vslidedown.vi v12, v12, 6
+; ZIP-NEXT:    srli s1, s1, 1
+; ZIP-NEXT:    csrr a2, vlenb
+; ZIP-NEXT:    add a2, sp, a2
+; ZIP-NEXT:    addi a2, a2, 16
+; ZIP-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
+; ZIP-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; ZIP-NEXT:    vslideup.vx v13, v8, a0
+; ZIP-NEXT:    vsetvli zero, s3, e8, mf2, tu, ma
+; ZIP-NEXT:    vslideup.vx v10, v9, s0
+; ZIP-NEXT:    add a2, s1, s1
+; ZIP-NEXT:    vsetvli zero, s4, e8, mf2, tu, ma
+; ZIP-NEXT:    vslideup.vx v10, v11, s2
+; ZIP-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; ZIP-NEXT:    vslideup.vx v10, v12, a0
+; ZIP-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; ZIP-NEXT:    vslideup.vx v10, v13, s1
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a0, a0, 1
+; ZIP-NEXT:    add a0, sp, a0
+; ZIP-NEXT:    addi a0, a0, 16
+; ZIP-NEXT:    vs1r.v v10, (a0)
+; ZIP-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; ZIP-NEXT:    vlseg8e8.v v8, (a0)
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a1, a0, 1
+; ZIP-NEXT:    add a0, a1, a0
+; ZIP-NEXT:    add sp, sp, a0
+; ZIP-NEXT:    .cfi_def_cfa sp, 64
+; ZIP-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    ld s2, 32(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    ld s3, 24(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    ld s4, 16(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    .cfi_restore ra
+; ZIP-NEXT:    .cfi_restore s0
+; ZIP-NEXT:    .cfi_restore s1
+; ZIP-NEXT:    .cfi_restore s2
+; ZIP-NEXT:    .cfi_restore s3
+; ZIP-NEXT:    .cfi_restore s4
+; ZIP-NEXT:    addi sp, sp, 64
+; ZIP-NEXT:    .cfi_def_cfa_offset 0
+; ZIP-NEXT:    ret
+	   %res = call {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @llvm.vector.deinterleave8.v16i8(<16 x i8> %v)
+	   ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res
+}
 
 ; Floats
 
@@ -695,8 +1067,8 @@ define {<4 x double>, <4 x double>} @vector_deinterleave_v4f64_v8f64(<8 x double
 ret {<4 x double>, <4 x double>} %retval
 }
 
-define {<2 x float>, <2 x float>, <2 x float>} @vector_deinterleave3_v632_v2f32(<6 x float> %v) {
-; CHECK-LABEL: vector_deinterleave3_v632_v2f32:
+define {<2 x float>, <2 x float>, <2 x float>} @vector_deinterleave3_v6f32_v2f32(<6 x float> %v) {
+; CHECK-LABEL: vector_deinterleave3_v6f32_v2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
@@ -729,6 +1101,41 @@ define {<2 x float>, <2 x float>, <2 x float>} @vector_deinterleave3_v632_v2f32(
 	   ret {<2 x float>, <2 x float>, <2 x float>} %res
 }
 
+define {<2 x float>, <2 x float>, <2 x float>, <2 x float>} @vector_deinterleave4_v8f32_v2f32(<8 x float> %v) {
+; CHECK-LABEL: vector_deinterleave4_v8f32_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v8, 6
+; CHECK-NEXT:    vslidedown.vi v12, v8, 4
+; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v9, v8, 2
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v12, v10, a0
+; CHECK-NEXT:    vslideup.vx v8, v9, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vmv.v.v v9, v12
+; CHECK-NEXT:    vs2r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vlseg4e32.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    ret
+	   %res = call {<2 x float>, <2 x float>, <2 x float>, <2 x float>} @llvm.vector.deinterleave4.v8f32(<8 x float> %v)
+	   ret {<2 x float>, <2 x float>, <2 x float>, <2 x float>} %res
+}
 
 define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_deinterleave5_v10f16_v2f16(<10 x half> %v) {
 ; CHECK-LABEL: vector_deinterleave5_v10f16_v2f16:
@@ -771,6 +1178,49 @@ define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_dein
 	   ret {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} %res
 }
 
+define {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @vector_deinterleave6_v12f16_v2f16(<12 x half> %v) {
+; CHECK-LABEL: vector_deinterleave6_v12f16_v2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetivli zero, 2, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v14, v8, 6
+; CHECK-NEXT:    vslidedown.vi v15, v8, 4
+; CHECK-NEXT:    vslidedown.vi v16, v8, 2
+; CHECK-NEXT:    vsetivli zero, 2, e16, m2, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v8, 10
+; CHECK-NEXT:    vslidedown.vi v12, v8, 8
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a2, a1, a1
+; CHECK-NEXT:    add a3, a0, a0
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v15, v14, a1
+; CHECK-NEXT:    vslideup.vx v8, v16, a1
+; CHECK-NEXT:    vslideup.vx v12, v10, a1
+; CHECK-NEXT:    vsetvli zero, a3, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v15, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vmv1r.v v9, v12
+; CHECK-NEXT:    vs2r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vlseg6e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    ret
+	   %res = call {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} @llvm.vector.deinterleave6.v12f16(<12 x half> %v)
+	   ret {<2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>} %res
+}
+
 define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @vector_deinterleave7_v7f16_v1f16(<7 x half> %v) {
 ; CHECK-LABEL: vector_deinterleave7_v7f16_v1f16:
 ; CHECK:       # %bb.0:
@@ -817,3 +1267,51 @@ define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>,
 	   %res = call {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @llvm.vector.deinterleave7.v7f16(<7 x half> %v)
 	   ret {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} %res
 }
+
+define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @vector_deinterleave8_v8f16_v1f16(<8 x half> %v) {
+; CHECK-LABEL: vector_deinterleave8_v8f16_v1f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v8, 7
+; CHECK-NEXT:    vslidedown.vi v11, v8, 6
+; CHECK-NEXT:    vslidedown.vi v12, v8, 5
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    vslidedown.vi v9, v8, 4
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a2, a1, a1
+; CHECK-NEXT:    add a3, a0, a0
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v11, v10, a1
+; CHECK-NEXT:    vslideup.vx v9, v12, a1
+; CHECK-NEXT:    vsetvli zero, a3, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v11, a0
+; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v10, v8, 3
+; CHECK-NEXT:    vslidedown.vi v11, v8, 2
+; CHECK-NEXT:    vslidedown.vi v12, v8, 1
+; CHECK-NEXT:    vsetvli zero, a2, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v11, v10, a1
+; CHECK-NEXT:    vslideup.vx v8, v12, a1
+; CHECK-NEXT:    vsetvli zero, a3, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v11, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs2r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vlseg8e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    ret
+	   %res = call {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @llvm.vector.deinterleave8.v8f16(<8 x half> %v)
+	   ret {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} %res
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
index e316c022727ab..6a08f5a28a295 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave.ll
@@ -468,6 +468,131 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_dein
   ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
 }
 
+define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv64i1(<vscale x 64 x i1> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv64i1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmv1r.v v8, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
+; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a1
+; CHECK-NEXT:    srli a1, a0, 1
+; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v18, v10, 1, v0
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v8, a1
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    sub a0, a0, a1
+; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v20, v10, 1, v0
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v8, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v22, v10, 1, v0
+; CHECK-NEXT:    vs8r.v v16, (a0)
+; CHECK-NEXT:    vlseg4e8.v v8, (a0)
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmsne.vi v8, v10, 0
+; CHECK-NEXT:    vmsne.vi v9, v12, 0
+; CHECK-NEXT:    vmsne.vi v10, v14, 0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave4.nxv64i1(<vscale x 64 x i1> %vec)
+  ret {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} %retval
+}
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv64i8(<vscale x 64 x i8> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv64i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vlseg4e8.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %vec)
+  ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv32i16(<vscale x 32 x i16> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv32i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vlseg4e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave4.nxv32i16(<vscale x 32 x i16> %vec)
+  ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv16i32(<vscale x 16 x i32> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv16i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vlseg4e32.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> %vec)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv8i64(<vscale x 8 x i64> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv8i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vlseg4e64.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> %vec)
+  ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
 define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv80i1(<vscale x 80 x i1> %vec) nounwind {
 ; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv80i1:
 ; CHECK:       # %bb.0:
@@ -700,6 +825,240 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2
   ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
 }
 
+define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv96i1(<vscale x 96 x i1> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv96i1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
+; CHECK-NEXT:    srli a2, a0, 2
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a2
+; CHECK-NEXT:    srli a3, a0, 1
+; CHECK-NEXT:    vsetvli a4, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v18, v10, 1, v0
+; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v9, a3
+; CHECK-NEXT:    srli a3, a0, 3
+; CHECK-NEXT:    slli a3, a3, 1
+; CHECK-NEXT:    sub a0, a0, a3
+; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v20, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v26, v10, 1, v0
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v9, a0
+; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v24, v10, 1, v0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v8, a2
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v28, v10, 1, v0
+; CHECK-NEXT:    vs8r.v v24, (a0)
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vlseg6e8.v v16, (a1)
+; CHECK-NEXT:    vlseg6e8.v v10, (a0)
+; CHECK-NEXT:    vmv2r.v v8, v16
+; CHECK-NEXT:    vmv2r.v v22, v18
+; CHECK-NEXT:    vmv2r.v v24, v20
+; CHECK-NEXT:    vmv1r.v v9, v10
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmv1r.v v10, v17
+; CHECK-NEXT:    vmsne.vi v8, v10, 0
+; CHECK-NEXT:    vmv1r.v v23, v12
+; CHECK-NEXT:    vmsne.vi v9, v22, 0
+; CHECK-NEXT:    vmv1r.v v12, v19
+; CHECK-NEXT:    vmsne.vi v10, v12, 0
+; CHECK-NEXT:    vmv1r.v v25, v14
+; CHECK-NEXT:    vmsne.vi v11, v24, 0
+; CHECK-NEXT:    vmv1r.v v14, v21
+; CHECK-NEXT:    vmsne.vi v12, v14, 0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave6.nxv96i1(<vscale x 96 x i1> %vec)
+  ret {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} %retval
+}
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv96i8(<vscale x 96 x i8> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv96i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v18
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vmv2r.v v24, v14
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv2r.v v26, v16
+; CHECK-NEXT:    vs8r.v v24, (a1)
+; CHECK-NEXT:    vlseg6e8.v v24, (a0)
+; CHECK-NEXT:    vlseg6e8.v v18, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v24
+; CHECK-NEXT:    vmv1r.v v9, v18
+; CHECK-NEXT:    vmv1r.v v18, v25
+; CHECK-NEXT:    vmv2r.v v12, v26
+; CHECK-NEXT:    vmv1r.v v13, v20
+; CHECK-NEXT:    vmv1r.v v20, v27
+; CHECK-NEXT:    vmv2r.v v16, v28
+; CHECK-NEXT:    vmv1r.v v17, v22
+; CHECK-NEXT:    vmv1r.v v22, v29
+; CHECK-NEXT:    vmv2r.v v10, v18
+; CHECK-NEXT:    vmv2r.v v14, v20
+; CHECK-NEXT:    vmv2r.v v18, v22
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave6.nxv96i8(<vscale x 96 x i8> %vec)
+  ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv48i16(<vscale x 48 x i16> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv48i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v18
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vmv2r.v v24, v14
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv2r.v v26, v16
+; CHECK-NEXT:    vs8r.v v24, (a1)
+; CHECK-NEXT:    vlseg6e16.v v24, (a0)
+; CHECK-NEXT:    vlseg6e16.v v18, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v24
+; CHECK-NEXT:    vmv1r.v v9, v18
+; CHECK-NEXT:    vmv1r.v v18, v25
+; CHECK-NEXT:    vmv2r.v v12, v26
+; CHECK-NEXT:    vmv1r.v v13, v20
+; CHECK-NEXT:    vmv1r.v v20, v27
+; CHECK-NEXT:    vmv2r.v v16, v28
+; CHECK-NEXT:    vmv1r.v v17, v22
+; CHECK-NEXT:    vmv1r.v v22, v29
+; CHECK-NEXT:    vmv2r.v v10, v18
+; CHECK-NEXT:    vmv2r.v v14, v20
+; CHECK-NEXT:    vmv2r.v v18, v22
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave6.nxv48i16(<vscale x 48 x i16> %vec)
+  ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv24i32(<vscale x 24 x i32> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv24i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v18
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vmv2r.v v24, v14
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv2r.v v26, v16
+; CHECK-NEXT:    vs8r.v v24, (a1)
+; CHECK-NEXT:    vlseg6e32.v v24, (a0)
+; CHECK-NEXT:    vlseg6e32.v v18, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v24
+; CHECK-NEXT:    vmv1r.v v9, v18
+; CHECK-NEXT:    vmv1r.v v18, v25
+; CHECK-NEXT:    vmv2r.v v12, v26
+; CHECK-NEXT:    vmv1r.v v13, v20
+; CHECK-NEXT:    vmv1r.v v20, v27
+; CHECK-NEXT:    vmv2r.v v16, v28
+; CHECK-NEXT:    vmv1r.v v17, v22
+; CHECK-NEXT:    vmv1r.v v22, v29
+; CHECK-NEXT:    vmv2r.v v10, v18
+; CHECK-NEXT:    vmv2r.v v14, v20
+; CHECK-NEXT:    vmv2r.v v18, v22
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave6.nxv24i32(<vscale x 24 x i32> %vec)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv12i64(<vscale x 12 x i64> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv12i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v18
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vmv2r.v v24, v14
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv2r.v v26, v16
+; CHECK-NEXT:    vs8r.v v24, (a1)
+; CHECK-NEXT:    vlseg6e64.v v24, (a0)
+; CHECK-NEXT:    vlseg6e64.v v18, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v24
+; CHECK-NEXT:    vmv1r.v v9, v18
+; CHECK-NEXT:    vmv1r.v v18, v25
+; CHECK-NEXT:    vmv2r.v v12, v26
+; CHECK-NEXT:    vmv1r.v v13, v20
+; CHECK-NEXT:    vmv1r.v v20, v27
+; CHECK-NEXT:    vmv2r.v v16, v28
+; CHECK-NEXT:    vmv1r.v v17, v22
+; CHECK-NEXT:    vmv1r.v v22, v29
+; CHECK-NEXT:    vmv2r.v v10, v18
+; CHECK-NEXT:    vmv2r.v v14, v20
+; CHECK-NEXT:    vmv2r.v v18, v22
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave6.nxv12i64(<vscale x 12 x i64> %vec)
+  ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
 define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv112i1(<vscale x 112 x i1> %vec) nounwind {
 ; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv112i1:
 ; CHECK:       # %bb.0:
@@ -971,26 +1330,277 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2
   ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
 }
 
-; Floats
-
-define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv4bf16(<vscale x 4 x bfloat> %vec) {
-; V-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
-; V:       # %bb.0:
-; V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; V-NEXT:    vnsrl.wi v10, v8, 0
-; V-NEXT:    vnsrl.wi v9, v8, 16
-; V-NEXT:    vmv1r.v v8, v10
-; V-NEXT:    ret
-;
-; ZIP-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
-; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZIP-NEXT:    ri.vunzip2a.vv v10, v8, v9
-; ZIP-NEXT:    ri.vunzip2b.vv v9, v8, v11
-; ZIP-NEXT:    vmv.v.v v8, v10
-; ZIP-NEXT:    ret
-%retval = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave2.nxv4bf16(<vscale x 4 x bfloat> %vec)
-ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %retval
+define {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv128i1(<vscale x 128 x i1> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv128i1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmv1r.v v9, v0
+; CHECK-NEXT:    vmv.v.i v10, 0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vmerge.vim v16, v10, 1, v0
+; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    srli a2, a0, 1
+; CHECK-NEXT:    srli a3, a0, 3
+; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v0, a1
+; CHECK-NEXT:    slli a3, a3, 1
+; CHECK-NEXT:    vsetvli a4, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v18, v10, 1, v0
+; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v9, a2
+; CHECK-NEXT:    sub a0, a0, a3
+; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v20, v10, 1, v0
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v9, a0
+; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v22, v10, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v24, v10, 1, v0
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v8, a1
+; CHECK-NEXT:    addi a1, sp, 16
+; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v26, v10, 1, v0
+; CHECK-NEXT:    vsetvli a3, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v8, a2
+; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v28, v10, 1, v0
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v0, v8, a0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    vsetvli a2, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmerge.vim v30, v10, 1, v0
+; CHECK-NEXT:    vs8r.v v24, (a0)
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vlseg8e8.v v18, (a1)
+; CHECK-NEXT:    vlseg8e8.v v10, (a0)
+; CHECK-NEXT:    vmv2r.v v8, v18
+; CHECK-NEXT:    vmv2r.v v26, v20
+; CHECK-NEXT:    vmv2r.v v28, v22
+; CHECK-NEXT:    vmv2r.v v30, v24
+; CHECK-NEXT:    vmv1r.v v9, v10
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmsne.vi v0, v8, 0
+; CHECK-NEXT:    vmv1r.v v10, v19
+; CHECK-NEXT:    vmsne.vi v8, v10, 0
+; CHECK-NEXT:    vmv1r.v v27, v12
+; CHECK-NEXT:    vmsne.vi v9, v26, 0
+; CHECK-NEXT:    vmv1r.v v12, v21
+; CHECK-NEXT:    vmsne.vi v10, v12, 0
+; CHECK-NEXT:    vmv1r.v v29, v14
+; CHECK-NEXT:    vmsne.vi v11, v28, 0
+; CHECK-NEXT:    vmv1r.v v14, v23
+; CHECK-NEXT:    vmsne.vi v12, v14, 0
+; CHECK-NEXT:    vmv1r.v v31, v16
+; CHECK-NEXT:    vmsne.vi v13, v30, 0
+; CHECK-NEXT:    vmv1r.v v16, v25
+; CHECK-NEXT:    vmsne.vi v14, v16, 0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave8.nxv128i1(<vscale x 128 x i1> %vec)
+  ret {<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>} %retval
+}
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv128i8(<vscale x 128 x i8> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv128i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    vsetvli a2, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vlseg8e8.v v0, (a0)
+; CHECK-NEXT:    vlseg8e8.v v22, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v0
+; CHECK-NEXT:    vmv1r.v v9, v22
+; CHECK-NEXT:    vmv1r.v v22, v1
+; CHECK-NEXT:    vmv2r.v v12, v2
+; CHECK-NEXT:    vmv1r.v v13, v24
+; CHECK-NEXT:    vmv1r.v v24, v3
+; CHECK-NEXT:    vmv2r.v v16, v4
+; CHECK-NEXT:    vmv1r.v v17, v26
+; CHECK-NEXT:    vmv1r.v v26, v5
+; CHECK-NEXT:    vmv2r.v v20, v6
+; CHECK-NEXT:    vmv1r.v v21, v28
+; CHECK-NEXT:    vmv1r.v v28, v7
+; CHECK-NEXT:    vmv2r.v v10, v22
+; CHECK-NEXT:    vmv2r.v v14, v24
+; CHECK-NEXT:    vmv2r.v v18, v26
+; CHECK-NEXT:    vmv2r.v v22, v28
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave8.nxv128i8(<vscale x 128 x i8> %vec)
+  ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv64i16(<vscale x 64 x i16> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv64i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vlseg8e16.v v0, (a0)
+; CHECK-NEXT:    vlseg8e16.v v22, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v0
+; CHECK-NEXT:    vmv1r.v v9, v22
+; CHECK-NEXT:    vmv1r.v v22, v1
+; CHECK-NEXT:    vmv2r.v v12, v2
+; CHECK-NEXT:    vmv1r.v v13, v24
+; CHECK-NEXT:    vmv1r.v v24, v3
+; CHECK-NEXT:    vmv2r.v v16, v4
+; CHECK-NEXT:    vmv1r.v v17, v26
+; CHECK-NEXT:    vmv1r.v v26, v5
+; CHECK-NEXT:    vmv2r.v v20, v6
+; CHECK-NEXT:    vmv1r.v v21, v28
+; CHECK-NEXT:    vmv1r.v v28, v7
+; CHECK-NEXT:    vmv2r.v v10, v22
+; CHECK-NEXT:    vmv2r.v v14, v24
+; CHECK-NEXT:    vmv2r.v v18, v26
+; CHECK-NEXT:    vmv2r.v v22, v28
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave8.nxv64i16(<vscale x 64 x i16> %vec)
+  ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv32i32(<vscale x 32 x i32> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv32i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vlseg8e32.v v0, (a0)
+; CHECK-NEXT:    vlseg8e32.v v22, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v0
+; CHECK-NEXT:    vmv1r.v v9, v22
+; CHECK-NEXT:    vmv1r.v v22, v1
+; CHECK-NEXT:    vmv2r.v v12, v2
+; CHECK-NEXT:    vmv1r.v v13, v24
+; CHECK-NEXT:    vmv1r.v v24, v3
+; CHECK-NEXT:    vmv2r.v v16, v4
+; CHECK-NEXT:    vmv1r.v v17, v26
+; CHECK-NEXT:    vmv1r.v v26, v5
+; CHECK-NEXT:    vmv2r.v v20, v6
+; CHECK-NEXT:    vmv1r.v v21, v28
+; CHECK-NEXT:    vmv1r.v v28, v7
+; CHECK-NEXT:    vmv2r.v v10, v22
+; CHECK-NEXT:    vmv2r.v v14, v24
+; CHECK-NEXT:    vmv2r.v v18, v26
+; CHECK-NEXT:    vmv2r.v v22, v28
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave8.nxv32i32(<vscale x 32 x i32> %vec)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv16i64(<vscale x 16 x i64> %vec) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv16i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vlseg8e64.v v0, (a0)
+; CHECK-NEXT:    vlseg8e64.v v22, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v0
+; CHECK-NEXT:    vmv1r.v v9, v22
+; CHECK-NEXT:    vmv1r.v v22, v1
+; CHECK-NEXT:    vmv2r.v v12, v2
+; CHECK-NEXT:    vmv1r.v v13, v24
+; CHECK-NEXT:    vmv1r.v v24, v3
+; CHECK-NEXT:    vmv2r.v v16, v4
+; CHECK-NEXT:    vmv1r.v v17, v26
+; CHECK-NEXT:    vmv1r.v v26, v5
+; CHECK-NEXT:    vmv2r.v v20, v6
+; CHECK-NEXT:    vmv1r.v v21, v28
+; CHECK-NEXT:    vmv1r.v v28, v7
+; CHECK-NEXT:    vmv2r.v v10, v22
+; CHECK-NEXT:    vmv2r.v v14, v24
+; CHECK-NEXT:    vmv2r.v v18, v26
+; CHECK-NEXT:    vmv2r.v v22, v28
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave8.nxv16i64(<vscale x 16 x i64> %vec)
+  ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
+; Floats
+
+define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv4bf16(<vscale x 4 x bfloat> %vec) {
+; V-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
+; V:       # %bb.0:
+; V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; V-NEXT:    vnsrl.wi v10, v8, 0
+; V-NEXT:    vnsrl.wi v9, v8, 16
+; V-NEXT:    vmv1r.v v8, v10
+; V-NEXT:    ret
+;
+; ZIP-LABEL: vector_deinterleave_nxv2bf16_nxv4bf16:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT:    ri.vunzip2a.vv v10, v8, v9
+; ZIP-NEXT:    ri.vunzip2b.vv v9, v8, v11
+; ZIP-NEXT:    vmv.v.v v8, v10
+; ZIP-NEXT:    ret
+%retval = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave2.nxv4bf16(<vscale x 4 x bfloat> %vec)
+ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %retval
 }
 
 define {<vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv4f16(<vscale x 4 x half> %vec) {
@@ -1550,35 +2160,48 @@ define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @ve
   ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %res
 }
 
-define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv10f16(<vscale x 10 x half> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv10f16:
+define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv8f16(<vscale x 8 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs2r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vlseg4e16.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v11, v9, a0
-; CHECK-NEXT:    vslideup.vx v9, v11, a0
-; CHECK-NEXT:    vslidedown.vx v11, v8, a0
-; CHECK-NEXT:    vslideup.vx v8, v11, a0
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave4.nxv8f16(<vscale x 8 x half> %arg)
+  ret {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} %res
+}
+
+define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv16f16(<vscale x 16 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv16f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs4r.v v8, (a0)
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vlseg5e16.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vlseg4e16.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-  %res = call {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave5.nxv10f16(<vscale x 10 x half> %arg)
-  ret {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} %res
+  %res = call {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave4.nxv16f16(<vscale x 16 x half> %arg)
+  ret {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} %res
 }
 
-define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv20f16(<vscale x 20 x half> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv20f16:
+define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv32f16(<vscale x 32 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv32f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -1586,86 +2209,59 @@ define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0)
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vlseg5e16.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vlseg4e16.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-  %res = call {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave5.nxv20f16(<vscale x 20 x half> %arg)
-  ret {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} %res
+  %res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave4.nxv32f16(<vscale x 32 x half> %arg)
+  ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res
 }
 
-define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv40f16(<vscale x 40 x half> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv40f16:
+define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv8bf16(<vscale x 8 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v26, v15
-; CHECK-NEXT:    vmv1r.v v27, v16
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v24, v13
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 3
-; CHECK-NEXT:    add a1, sp, a1
-; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vmv1r.v v25, v14
-; CHECK-NEXT:    vs8r.v v8, (a0)
-; CHECK-NEXT:    vmv1r.v v28, v17
-; CHECK-NEXT:    vs8r.v v24, (a1)
-; CHECK-NEXT:    vlseg5e16.v v12, (a0)
-; CHECK-NEXT:    vlseg5e16.v v18, (a1)
-; CHECK-NEXT:    vmv2r.v v8, v12
-; CHECK-NEXT:    vmv1r.v v9, v18
-; CHECK-NEXT:    vmv1r.v v18, v13
-; CHECK-NEXT:    vmv2r.v v12, v14
-; CHECK-NEXT:    vmv1r.v v13, v20
-; CHECK-NEXT:    vmv1r.v v20, v15
-; CHECK-NEXT:    vmv1r.v v17, v22
-; CHECK-NEXT:    vmv2r.v v10, v18
-; CHECK-NEXT:    vmv2r.v v14, v20
+; CHECK-NEXT:    vs2r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vlseg4e16.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-  %res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave5.nxv40f16(<vscale x 40 x half> %arg)
-  ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res
+  %res = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave4.nxv8bf16(<vscale x 8 x bfloat> %arg)
+  ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %res
 }
 
-define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv10bf16(<vscale x 10 x bfloat> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv10bf16:
+define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv16bf16(<vscale x 16 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv16bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    srli a0, a0, 2
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vslidedown.vx v11, v9, a0
-; CHECK-NEXT:    vslideup.vx v9, v11, a0
-; CHECK-NEXT:    vslidedown.vx v11, v8, a0
-; CHECK-NEXT:    vslideup.vx v8, v11, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs4r.v v8, (a0)
-; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vlseg5e16.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vlseg4e16.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-  %res = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave5.nxv10bf16(<vscale x 10 x bfloat> %arg)
-  ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %res
+  %res = call {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @llvm.vector.deinterleave4.nxv16bf16(<vscale x 16 x bfloat> %arg)
+  ret {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} %res
 }
 
-define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv20bf16(<vscale x 20 x bfloat> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv20bf16:
+define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @vector_deinterleave_nxv8bf16_nxv32bf16(<vscale x 32 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv32bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -1673,7 +2269,241 @@ define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vs
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0)
-; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a1, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vlseg4e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave4.nxv32bf16(<vscale x 32 x bfloat> %arg)
+  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
+}
+
+define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @vector_deinterleave_nxv1f32_nxv4f32(<vscale x 4 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f32_nxv4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs2r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vlseg4e32.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @llvm.vector.deinterleave4.nxv4f32(<vscale x 4 x float> %arg)
+  ret {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} %res
+}
+
+define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv8f32(<vscale x 8 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs4r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vlseg4e32.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave4.nxv8f32(<vscale x 8 x float> %arg)
+  ret {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} %res
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv16f32(<vscale x 16 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv16f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vlseg4e32.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave4.nxv16f32(<vscale x 16 x float> %arg)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
+}
+
+define {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @vector_deinterleave_nxv1f64_nxv4f64(<vscale x 4 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f64_nxv4f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs4r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vlseg4e64.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @llvm.vector.deinterleave4.nxv4f64(<vscale x 4 x double> %arg)
+  ret {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} %res
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv8f64(<vscale x 8 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vlseg4e64.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave4.nxv8f64(<vscale x 8 x double> %arg)
+  ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %res
+}
+
+define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv10f16(<vscale x 10 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv10f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v11, v9, a0
+; CHECK-NEXT:    vslideup.vx v9, v11, a0
+; CHECK-NEXT:    vslidedown.vx v11, v8, a0
+; CHECK-NEXT:    vslideup.vx v8, v11, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs4r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vlseg5e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave5.nxv10f16(<vscale x 10 x half> %arg)
+  ret {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} %res
+}
+
+define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv20f16(<vscale x 20 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv20f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vlseg5e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave5.nxv20f16(<vscale x 20 x half> %arg)
+  ret {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} %res
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv40f16(<vscale x 40 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv40f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v26, v15
+; CHECK-NEXT:    vmv1r.v v27, v16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vmv1r.v v25, v14
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv1r.v v28, v17
+; CHECK-NEXT:    vs8r.v v24, (a1)
+; CHECK-NEXT:    vlseg5e16.v v12, (a0)
+; CHECK-NEXT:    vlseg5e16.v v18, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v9, v18
+; CHECK-NEXT:    vmv1r.v v18, v13
+; CHECK-NEXT:    vmv2r.v v12, v14
+; CHECK-NEXT:    vmv1r.v v13, v20
+; CHECK-NEXT:    vmv1r.v v20, v15
+; CHECK-NEXT:    vmv1r.v v17, v22
+; CHECK-NEXT:    vmv2r.v v10, v18
+; CHECK-NEXT:    vmv2r.v v14, v20
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave5.nxv40f16(<vscale x 40 x half> %arg)
+  ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res
+}
+
+define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv10bf16(<vscale x 10 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv10bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v11, v9, a0
+; CHECK-NEXT:    vslideup.vx v9, v11, a0
+; CHECK-NEXT:    vslidedown.vx v11, v8, a0
+; CHECK-NEXT:    vslideup.vx v8, v11, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs4r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vlseg5e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave5.nxv10bf16(<vscale x 10 x bfloat> %arg)
+  ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %res
+}
+
+define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv20bf16(<vscale x 20 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv20bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
 ; CHECK-NEXT:    vlseg5e16.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
@@ -1720,12 +2550,351 @@ define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vs
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-  %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave5.nxv40bf16(<vscale x 40 x bfloat> %arg)
-  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
+  %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave5.nxv40bf16(<vscale x 40 x bfloat> %arg)
+  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
+}
+
+define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @vector_deinterleave_nxv1f32_nxv5f32(<vscale x 5 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f32_nxv5f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v11, v9, a0
+; CHECK-NEXT:    vslideup.vx v9, v11, a0
+; CHECK-NEXT:    vslidedown.vx v11, v8, a0
+; CHECK-NEXT:    vslideup.vx v8, v11, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs4r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vlseg5e32.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @llvm.vector.deinterleave5.nxv5f32(<vscale x 5 x float> %arg)
+  ret {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} %res
+}
+
+define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv10f32(<vscale x 10 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv10f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vlseg5e32.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave5.nxv10f32(<vscale x 10 x float> %arg)
+  ret {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} %res
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv20f32(<vscale x 20 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv20f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v26, v15
+; CHECK-NEXT:    vmv1r.v v27, v16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vmv1r.v v25, v14
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv1r.v v28, v17
+; CHECK-NEXT:    vs8r.v v24, (a1)
+; CHECK-NEXT:    vlseg5e32.v v12, (a0)
+; CHECK-NEXT:    vlseg5e32.v v18, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v9, v18
+; CHECK-NEXT:    vmv1r.v v18, v13
+; CHECK-NEXT:    vmv2r.v v12, v14
+; CHECK-NEXT:    vmv1r.v v13, v20
+; CHECK-NEXT:    vmv1r.v v20, v15
+; CHECK-NEXT:    vmv1r.v v17, v22
+; CHECK-NEXT:    vmv2r.v v10, v18
+; CHECK-NEXT:    vmv2r.v v14, v20
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave5.nxv20f32(<vscale x 20 x float> %arg)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
+}
+
+define {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @vector_deinterleave_nxv1f64_nxv5f64(<vscale x 5 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f64_nxv5f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vlseg5e64.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @llvm.vector.deinterleave5.nxv5f64(<vscale x 5 x double> %arg)
+  ret {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} %res
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv10f64(<vscale x 10 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv10f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vmv1r.v v26, v15
+; CHECK-NEXT:    vmv1r.v v27, v16
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vmv1r.v v25, v14
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv1r.v v28, v17
+; CHECK-NEXT:    vs8r.v v24, (a1)
+; CHECK-NEXT:    vlseg5e64.v v12, (a0)
+; CHECK-NEXT:    vlseg5e64.v v18, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vmv1r.v v9, v18
+; CHECK-NEXT:    vmv1r.v v18, v13
+; CHECK-NEXT:    vmv2r.v v12, v14
+; CHECK-NEXT:    vmv1r.v v13, v20
+; CHECK-NEXT:    vmv1r.v v20, v15
+; CHECK-NEXT:    vmv1r.v v17, v22
+; CHECK-NEXT:    vmv2r.v v10, v18
+; CHECK-NEXT:    vmv2r.v v14, v20
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave5.nxv10f64(<vscale x 10 x double> %arg)
+  ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %res
+}
+
+define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv12f16(<vscale x 12 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv12f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v11, v9, a0
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v11, a0
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v11, v8, a0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v11, a0
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v11, v10, a0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v11, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs4r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vlseg6e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave6.nxv12f16(<vscale x 12 x half> %arg)
+  ret {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} %res
+}
+
+define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv24f16(<vscale x 24 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv24f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vlseg6e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave6.nxv24f16(<vscale x 24 x half> %arg)
+  ret {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} %res
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv48f16(<vscale x 48 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv48f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v18
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vmv2r.v v24, v14
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv2r.v v26, v16
+; CHECK-NEXT:    vs8r.v v24, (a1)
+; CHECK-NEXT:    vlseg6e16.v v24, (a0)
+; CHECK-NEXT:    vlseg6e16.v v18, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v24
+; CHECK-NEXT:    vmv1r.v v9, v18
+; CHECK-NEXT:    vmv1r.v v18, v25
+; CHECK-NEXT:    vmv2r.v v12, v26
+; CHECK-NEXT:    vmv1r.v v13, v20
+; CHECK-NEXT:    vmv1r.v v20, v27
+; CHECK-NEXT:    vmv2r.v v16, v28
+; CHECK-NEXT:    vmv1r.v v17, v22
+; CHECK-NEXT:    vmv1r.v v22, v29
+; CHECK-NEXT:    vmv2r.v v10, v18
+; CHECK-NEXT:    vmv2r.v v14, v20
+; CHECK-NEXT:    vmv2r.v v18, v22
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave6.nxv48f16(<vscale x 48 x half> %arg)
+  ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res
+}
+
+define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv12bf16(<vscale x 12 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv12bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v11, v9, a0
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v11, a0
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v11, v8, a0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v11, a0
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v11, v10, a0
+; CHECK-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v11, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs4r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vlseg6e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave6.nxv12bf16(<vscale x 12 x bfloat> %arg)
+  ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %res
+}
+
+define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv24bf16(<vscale x 24 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv24bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vlseg6e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @llvm.vector.deinterleave6.nxv24bf16(<vscale x 24 x bfloat> %arg)
+  ret {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} %res
+}
+
+define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @vector_deinterleave_nxv8bf16_nxv48bf16(<vscale x 48 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv48bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v18
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vmv2r.v v24, v14
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vmv2r.v v26, v16
+; CHECK-NEXT:    vs8r.v v24, (a1)
+; CHECK-NEXT:    vlseg6e16.v v24, (a0)
+; CHECK-NEXT:    vlseg6e16.v v18, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v24
+; CHECK-NEXT:    vmv1r.v v9, v18
+; CHECK-NEXT:    vmv1r.v v18, v25
+; CHECK-NEXT:    vmv2r.v v12, v26
+; CHECK-NEXT:    vmv1r.v v13, v20
+; CHECK-NEXT:    vmv1r.v v20, v27
+; CHECK-NEXT:    vmv2r.v v16, v28
+; CHECK-NEXT:    vmv1r.v v17, v22
+; CHECK-NEXT:    vmv1r.v v22, v29
+; CHECK-NEXT:    vmv2r.v v10, v18
+; CHECK-NEXT:    vmv2r.v v14, v20
+; CHECK-NEXT:    vmv2r.v v18, v22
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave6.nxv48bf16(<vscale x 48 x bfloat> %arg)
+  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
 }
 
-define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @vector_deinterleave_nxv1f32_nxv5f32(<vscale x 5 x float> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv1f32_nxv5f32:
+define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @vector_deinterleave_nxv1f32_nxv6f32(<vscale x 6 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f32_nxv6f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -1735,24 +2904,32 @@ define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscal
 ; CHECK-NEXT:    srli a0, a0, 3
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v11, v9, a0
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v11, a0
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vx v11, v8, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v11, a0
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vx v11, v10, a0
+; CHECK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v11, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs4r.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vlseg5e32.v v8, (a0)
+; CHECK-NEXT:    vlseg6e32.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-  %res = call {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @llvm.vector.deinterleave5.nxv5f32(<vscale x 5 x float> %arg)
-  ret {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} %res
+  %res = call {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @llvm.vector.deinterleave6.nxv6f32(<vscale x 6 x float> %arg)
+  ret {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} %res
 }
 
-define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv10f32(<vscale x 10 x float> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv10f32:
+define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv12f32(<vscale x 12 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv12f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -1761,58 +2938,59 @@ define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscal
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg5e32.v v8, (a0)
+; CHECK-NEXT:    vlseg6e32.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-  %res = call {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave5.nxv10f32(<vscale x 10 x float> %arg)
-  ret {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} %res
+  %res = call {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave6.nxv12f32(<vscale x 12 x float> %arg)
+  ret {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} %res
 }
 
-define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv20f32(<vscale x 20 x float> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv20f32:
+define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv24f32(<vscale x 24 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv24f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v26, v15
-; CHECK-NEXT:    vmv1r.v v27, v16
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v18
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vmv1r.v v25, v14
+; CHECK-NEXT:    vmv2r.v v24, v14
 ; CHECK-NEXT:    vs8r.v v8, (a0)
-; CHECK-NEXT:    vmv1r.v v28, v17
+; CHECK-NEXT:    vmv2r.v v26, v16
 ; CHECK-NEXT:    vs8r.v v24, (a1)
-; CHECK-NEXT:    vlseg5e32.v v12, (a0)
-; CHECK-NEXT:    vlseg5e32.v v18, (a1)
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vlseg6e32.v v24, (a0)
+; CHECK-NEXT:    vlseg6e32.v v18, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v24
 ; CHECK-NEXT:    vmv1r.v v9, v18
-; CHECK-NEXT:    vmv1r.v v18, v13
-; CHECK-NEXT:    vmv2r.v v12, v14
+; CHECK-NEXT:    vmv1r.v v18, v25
+; CHECK-NEXT:    vmv2r.v v12, v26
 ; CHECK-NEXT:    vmv1r.v v13, v20
-; CHECK-NEXT:    vmv1r.v v20, v15
+; CHECK-NEXT:    vmv1r.v v20, v27
+; CHECK-NEXT:    vmv2r.v v16, v28
 ; CHECK-NEXT:    vmv1r.v v17, v22
+; CHECK-NEXT:    vmv1r.v v22, v29
 ; CHECK-NEXT:    vmv2r.v v10, v18
 ; CHECK-NEXT:    vmv2r.v v14, v20
+; CHECK-NEXT:    vmv2r.v v18, v22
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-  %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave5.nxv20f32(<vscale x 20 x float> %arg)
-  ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
+  %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave6.nxv24f32(<vscale x 24 x float> %arg)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
 }
 
-define {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @vector_deinterleave_nxv1f64_nxv5f64(<vscale x 5 x double> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv1f64_nxv5f64:
+define {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @vector_deinterleave_nxv1f64_nxv6f64(<vscale x 6 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f64_nxv6f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -1821,54 +2999,55 @@ define {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vs
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    vs8r.v v8, (a0)
 ; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vlseg5e64.v v8, (a0)
+; CHECK-NEXT:    vlseg6e64.v v8, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-  %res = call {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @llvm.vector.deinterleave5.nxv5f64(<vscale x 5 x double> %arg)
-  ret {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} %res
+  %res = call {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @llvm.vector.deinterleave6.nxv6f64(<vscale x 6 x double> %arg)
+  ret {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} %res
 }
 
-define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv10f64(<vscale x 10 x double> %arg) nounwind {
-; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv10f64:
+define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv12f64(<vscale x 12 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv12f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vmv1r.v v26, v15
-; CHECK-NEXT:    vmv1r.v v27, v16
 ; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v18
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    slli a1, a1, 3
 ; CHECK-NEXT:    add a1, sp, a1
 ; CHECK-NEXT:    addi a1, a1, 16
-; CHECK-NEXT:    vmv1r.v v25, v14
+; CHECK-NEXT:    vmv2r.v v24, v14
 ; CHECK-NEXT:    vs8r.v v8, (a0)
-; CHECK-NEXT:    vmv1r.v v28, v17
+; CHECK-NEXT:    vmv2r.v v26, v16
 ; CHECK-NEXT:    vs8r.v v24, (a1)
-; CHECK-NEXT:    vlseg5e64.v v12, (a0)
-; CHECK-NEXT:    vlseg5e64.v v18, (a1)
-; CHECK-NEXT:    vmv2r.v v8, v12
+; CHECK-NEXT:    vlseg6e64.v v24, (a0)
+; CHECK-NEXT:    vlseg6e64.v v18, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v24
 ; CHECK-NEXT:    vmv1r.v v9, v18
-; CHECK-NEXT:    vmv1r.v v18, v13
-; CHECK-NEXT:    vmv2r.v v12, v14
+; CHECK-NEXT:    vmv1r.v v18, v25
+; CHECK-NEXT:    vmv2r.v v12, v26
 ; CHECK-NEXT:    vmv1r.v v13, v20
-; CHECK-NEXT:    vmv1r.v v20, v15
+; CHECK-NEXT:    vmv1r.v v20, v27
+; CHECK-NEXT:    vmv2r.v v16, v28
 ; CHECK-NEXT:    vmv1r.v v17, v22
+; CHECK-NEXT:    vmv1r.v v22, v29
 ; CHECK-NEXT:    vmv2r.v v10, v18
 ; CHECK-NEXT:    vmv2r.v v14, v20
+; CHECK-NEXT:    vmv2r.v v18, v22
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 4
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
-  %res = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave5.nxv10f64(<vscale x 10 x double> %arg)
-  ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %res
+  %res = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave6.nxv12f64(<vscale x 12 x double> %arg)
+  ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %res
 }
 
 define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv14f16(<vscale x 14 x half> %arg) nounwind {
@@ -2221,3 +3400,311 @@ define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vs
   %res = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave7.nxv14f64(<vscale x 14 x double> %arg)
   ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %res
 }
+
+define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv16f16(<vscale x 16 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv16f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs4r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vlseg8e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave8.nxv16f16(<vscale x 16 x half> %arg)
+  ret {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} %res
+}
+
+define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv32f16(<vscale x 32 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv32f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vlseg8e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave8.nxv32f16(<vscale x 32 x half> %arg)
+  ret {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} %res
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv64f16(<vscale x 64 x half> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv64f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vlseg8e16.v v0, (a0)
+; CHECK-NEXT:    vlseg8e16.v v22, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v0
+; CHECK-NEXT:    vmv1r.v v9, v22
+; CHECK-NEXT:    vmv1r.v v22, v1
+; CHECK-NEXT:    vmv2r.v v12, v2
+; CHECK-NEXT:    vmv1r.v v13, v24
+; CHECK-NEXT:    vmv1r.v v24, v3
+; CHECK-NEXT:    vmv2r.v v16, v4
+; CHECK-NEXT:    vmv1r.v v17, v26
+; CHECK-NEXT:    vmv1r.v v26, v5
+; CHECK-NEXT:    vmv2r.v v20, v6
+; CHECK-NEXT:    vmv1r.v v21, v28
+; CHECK-NEXT:    vmv1r.v v28, v7
+; CHECK-NEXT:    vmv2r.v v10, v22
+; CHECK-NEXT:    vmv2r.v v14, v24
+; CHECK-NEXT:    vmv2r.v v18, v26
+; CHECK-NEXT:    vmv2r.v v22, v28
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave8.nxv64f16(<vscale x 64 x half> %arg)
+  ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %res
+}
+
+define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv16bf16(<vscale x 16 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv16bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs4r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vlseg8e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave8.nxv16bf16(<vscale x 16 x bfloat> %arg)
+  ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %res
+}
+
+define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv32bf16(<vscale x 32 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv32bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vlseg8e16.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @llvm.vector.deinterleave8.nxv32bf16(<vscale x 32 x bfloat> %arg)
+  ret {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} %res
+}
+
+define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @vector_deinterleave_nxv8bf16_nxv64bf16(<vscale x 64 x bfloat> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv64bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vlseg8e16.v v0, (a0)
+; CHECK-NEXT:    vlseg8e16.v v22, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v0
+; CHECK-NEXT:    vmv1r.v v9, v22
+; CHECK-NEXT:    vmv1r.v v22, v1
+; CHECK-NEXT:    vmv2r.v v12, v2
+; CHECK-NEXT:    vmv1r.v v13, v24
+; CHECK-NEXT:    vmv1r.v v24, v3
+; CHECK-NEXT:    vmv2r.v v16, v4
+; CHECK-NEXT:    vmv1r.v v17, v26
+; CHECK-NEXT:    vmv1r.v v26, v5
+; CHECK-NEXT:    vmv2r.v v20, v6
+; CHECK-NEXT:    vmv1r.v v21, v28
+; CHECK-NEXT:    vmv1r.v v28, v7
+; CHECK-NEXT:    vmv2r.v v10, v22
+; CHECK-NEXT:    vmv2r.v v14, v24
+; CHECK-NEXT:    vmv2r.v v18, v26
+; CHECK-NEXT:    vmv2r.v v22, v28
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave8.nxv64bf16(<vscale x 64 x bfloat> %arg)
+  ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %res
+}
+
+define {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @vector_deinterleave_nxv1f32_nxv8f32(<vscale x 8 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f32_nxv8f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs4r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vlseg8e32.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} @llvm.vector.deinterleave8.nxv8f32(<vscale x 8 x float> %arg)
+  ret {<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x float>} %res
+}
+
+define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv16f32(<vscale x 16 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv16f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vlseg8e32.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave8.nxv16f32(<vscale x 16 x float> %arg)
+  ret {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} %res
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv32f32(<vscale x 32 x float> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv32f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vlseg8e32.v v0, (a0)
+; CHECK-NEXT:    vlseg8e32.v v22, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v0
+; CHECK-NEXT:    vmv1r.v v9, v22
+; CHECK-NEXT:    vmv1r.v v22, v1
+; CHECK-NEXT:    vmv2r.v v12, v2
+; CHECK-NEXT:    vmv1r.v v13, v24
+; CHECK-NEXT:    vmv1r.v v24, v3
+; CHECK-NEXT:    vmv2r.v v16, v4
+; CHECK-NEXT:    vmv1r.v v17, v26
+; CHECK-NEXT:    vmv1r.v v26, v5
+; CHECK-NEXT:    vmv2r.v v20, v6
+; CHECK-NEXT:    vmv1r.v v21, v28
+; CHECK-NEXT:    vmv1r.v v28, v7
+; CHECK-NEXT:    vmv2r.v v10, v22
+; CHECK-NEXT:    vmv2r.v v14, v24
+; CHECK-NEXT:    vmv2r.v v18, v26
+; CHECK-NEXT:    vmv2r.v v22, v28
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave8.nxv32f32(<vscale x 32 x float> %arg)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
+}
+
+define {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @vector_deinterleave_nxv1f64_nxv8f64(<vscale x 8 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv1f64_nxv8f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vlseg8e64.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} @llvm.vector.deinterleave8.nxv8f64(<vscale x 8 x double> %arg)
+  ret {<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x double>} %res
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv16f64(<vscale x 16 x double> %arg) nounwind {
+; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv16f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    vs8r.v v8, (a0)
+; CHECK-NEXT:    vs8r.v v16, (a1)
+; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vlseg8e64.v v0, (a0)
+; CHECK-NEXT:    vlseg8e64.v v22, (a1)
+; CHECK-NEXT:    vmv2r.v v8, v0
+; CHECK-NEXT:    vmv1r.v v9, v22
+; CHECK-NEXT:    vmv1r.v v22, v1
+; CHECK-NEXT:    vmv2r.v v12, v2
+; CHECK-NEXT:    vmv1r.v v13, v24
+; CHECK-NEXT:    vmv1r.v v24, v3
+; CHECK-NEXT:    vmv2r.v v16, v4
+; CHECK-NEXT:    vmv1r.v v17, v26
+; CHECK-NEXT:    vmv1r.v v26, v5
+; CHECK-NEXT:    vmv2r.v v20, v6
+; CHECK-NEXT:    vmv1r.v v21, v28
+; CHECK-NEXT:    vmv1r.v v28, v7
+; CHECK-NEXT:    vmv2r.v v10, v22
+; CHECK-NEXT:    vmv2r.v v14, v24
+; CHECK-NEXT:    vmv2r.v v18, v26
+; CHECK-NEXT:    vmv2r.v v22, v28
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave8.nxv16f64(<vscale x 16 x double> %arg)
+  ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %res
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
index 279779dc49667..faf7903c21614 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
@@ -261,6 +261,108 @@ define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2
 	   ret <6 x i32> %res
 }
 
+define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
+; CHECK-LABEL: vector_interleave4_v8i32_v2i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a1, a1, 1
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsseg4e32.v v8, (a0)
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vle32.v v10, (a3)
+; CHECK-NEXT:    vle32.v v9, (a2)
+; CHECK-NEXT:    vle32.v v11, (a1)
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v10, v11, 2
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave4_v8i32_v2i32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    .cfi_def_cfa_offset 16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a1, a1, 1
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vsseg4e32.v v8, (a0)
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vle32.v v10, (a3)
+; ZVBB-NEXT:    vle32.v v9, (a2)
+; ZVBB-NEXT:    vle32.v v11, (a1)
+; ZVBB-NEXT:    vle32.v v8, (a0)
+; ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vi v10, v11, 2
+; ZVBB-NEXT:    vslideup.vi v8, v9, 2
+; ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; ZVBB-NEXT:    vslideup.vi v8, v10, 4
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    .cfi_def_cfa sp, 16
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    .cfi_def_cfa_offset 0
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave4_v8i32_v2i32:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    addi sp, sp, -16
+; ZIP-NEXT:    .cfi_def_cfa_offset 16
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a0, a0, 1
+; ZIP-NEXT:    sub sp, sp, a0
+; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZIP-NEXT:    addi a0, sp, 16
+; ZIP-NEXT:    csrr a1, vlenb
+; ZIP-NEXT:    srli a1, a1, 1
+; ZIP-NEXT:    add a2, a0, a1
+; ZIP-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
+; ZIP-NEXT:    vsseg4e32.v v8, (a0)
+; ZIP-NEXT:    add a3, a2, a1
+; ZIP-NEXT:    add a1, a3, a1
+; ZIP-NEXT:    vle32.v v10, (a3)
+; ZIP-NEXT:    vle32.v v9, (a2)
+; ZIP-NEXT:    vle32.v v11, (a1)
+; ZIP-NEXT:    vle32.v v8, (a0)
+; ZIP-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; ZIP-NEXT:    vslideup.vi v10, v11, 2
+; ZIP-NEXT:    vslideup.vi v8, v9, 2
+; ZIP-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; ZIP-NEXT:    vslideup.vi v8, v10, 4
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a0, a0, 1
+; ZIP-NEXT:    add sp, sp, a0
+; ZIP-NEXT:    .cfi_def_cfa sp, 16
+; ZIP-NEXT:    addi sp, sp, 16
+; ZIP-NEXT:    .cfi_def_cfa_offset 0
+; ZIP-NEXT:    ret
+	   %res = call <8 x i32> @llvm.vector.interleave4.v8i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d)
+	   ret <8 x i32> %res
+}
 
 define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e) {
 ; CHECK-LABEL: vector_interleave5_v10i16_v2i16:
@@ -377,6 +479,130 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
 	   ret <10 x i16> %res
 }
 
+define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e, <2 x i16> %f) {
+; CHECK-LABEL: vector_interleave6_v12i16_v2i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vsetvli a4, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg6e16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a2)
+; CHECK-NEXT:    add a2, a3, a1
+; CHECK-NEXT:    vle16.v v11, (a2)
+; CHECK-NEXT:    add a2, a2, a1
+; CHECK-NEXT:    vle16.v v12, (a3)
+; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v13, (a1)
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v12, v11, 2
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vslideup.vi v10, v13, 2
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 4
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 8
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave6_v12i16_v2i16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    .cfi_def_cfa_offset 16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a1, a1, 2
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e16, mf4, ta, ma
+; ZVBB-NEXT:    vsseg6e16.v v8, (a0)
+; ZVBB-NEXT:    vle16.v v9, (a2)
+; ZVBB-NEXT:    add a2, a3, a1
+; ZVBB-NEXT:    vle16.v v11, (a2)
+; ZVBB-NEXT:    add a2, a2, a1
+; ZVBB-NEXT:    vle16.v v12, (a3)
+; ZVBB-NEXT:    add a1, a2, a1
+; ZVBB-NEXT:    vle16.v v10, (a2)
+; ZVBB-NEXT:    vle16.v v8, (a0)
+; ZVBB-NEXT:    vle16.v v13, (a1)
+; ZVBB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVBB-NEXT:    vslideup.vi v12, v11, 2
+; ZVBB-NEXT:    vslideup.vi v8, v9, 2
+; ZVBB-NEXT:    vslideup.vi v10, v13, 2
+; ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vi v8, v12, 4
+; ZVBB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVBB-NEXT:    vslideup.vi v8, v10, 8
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    .cfi_def_cfa sp, 16
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    .cfi_def_cfa_offset 0
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave6_v12i16_v2i16:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    addi sp, sp, -16
+; ZIP-NEXT:    .cfi_def_cfa_offset 16
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a0, a0, 1
+; ZIP-NEXT:    sub sp, sp, a0
+; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZIP-NEXT:    addi a0, sp, 16
+; ZIP-NEXT:    csrr a1, vlenb
+; ZIP-NEXT:    srli a1, a1, 2
+; ZIP-NEXT:    add a2, a0, a1
+; ZIP-NEXT:    add a3, a2, a1
+; ZIP-NEXT:    vsetvli a4, zero, e16, mf4, ta, ma
+; ZIP-NEXT:    vsseg6e16.v v8, (a0)
+; ZIP-NEXT:    vle16.v v9, (a2)
+; ZIP-NEXT:    add a2, a3, a1
+; ZIP-NEXT:    vle16.v v11, (a2)
+; ZIP-NEXT:    add a2, a2, a1
+; ZIP-NEXT:    vle16.v v12, (a3)
+; ZIP-NEXT:    add a1, a2, a1
+; ZIP-NEXT:    vle16.v v10, (a2)
+; ZIP-NEXT:    vle16.v v8, (a0)
+; ZIP-NEXT:    vle16.v v13, (a1)
+; ZIP-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZIP-NEXT:    vslideup.vi v12, v11, 2
+; ZIP-NEXT:    vslideup.vi v8, v9, 2
+; ZIP-NEXT:    vslideup.vi v10, v13, 2
+; ZIP-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZIP-NEXT:    vslideup.vi v8, v12, 4
+; ZIP-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZIP-NEXT:    vslideup.vi v8, v10, 8
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a0, a0, 1
+; ZIP-NEXT:    add sp, sp, a0
+; ZIP-NEXT:    .cfi_def_cfa sp, 16
+; ZIP-NEXT:    addi sp, sp, 16
+; ZIP-NEXT:    .cfi_def_cfa_offset 0
+; ZIP-NEXT:    ret
+	   %res = call <12 x i16> @llvm.vector.interleave6.v12i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e, <2 x i16> %f)
+	   ret <12 x i16> %res
+}
+
 define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g) {
 ; CHECK-LABEL: vector_interleave7_v14i8_v2i8:
 ; CHECK:       # %bb.0:
@@ -507,6 +733,144 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
 	   ret <14 x i8> %res
 }
 
+define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g, <2 x i8> %h) {
+; CHECK-LABEL: vector_interleave8_v16i8_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a1, a1, 3
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    add a4, a3, a1
+; CHECK-NEXT:    add a5, a4, a1
+; CHECK-NEXT:    add a6, a5, a1
+; CHECK-NEXT:    vsetvli a7, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vsseg8e8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v9, (a6)
+; CHECK-NEXT:    add a6, a6, a1
+; CHECK-NEXT:    vle8.v v10, (a5)
+; CHECK-NEXT:    vle8.v v11, (a6)
+; CHECK-NEXT:    add a1, a6, a1
+; CHECK-NEXT:    vle8.v v12, (a2)
+; CHECK-NEXT:    vle8.v v8, (a0)
+; CHECK-NEXT:    vle8.v v13, (a3)
+; CHECK-NEXT:    vle8.v v14, (a4)
+; CHECK-NEXT:    vle8.v v15, (a1)
+; CHECK-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v10, v9, 2
+; CHECK-NEXT:    vslideup.vi v8, v12, 2
+; CHECK-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v10, v11, 4
+; CHECK-NEXT:    vslideup.vi v8, v13, 4
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v10, v15, 6
+; CHECK-NEXT:    vslideup.vi v8, v14, 6
+; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 8
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave8_v16i8_v2i8:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    .cfi_def_cfa_offset 16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a1, a1, 3
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    add a4, a3, a1
+; ZVBB-NEXT:    add a5, a4, a1
+; ZVBB-NEXT:    add a6, a5, a1
+; ZVBB-NEXT:    vsetvli a7, zero, e8, mf8, ta, ma
+; ZVBB-NEXT:    vsseg8e8.v v8, (a0)
+; ZVBB-NEXT:    vle8.v v9, (a6)
+; ZVBB-NEXT:    add a6, a6, a1
+; ZVBB-NEXT:    vle8.v v10, (a5)
+; ZVBB-NEXT:    vle8.v v11, (a6)
+; ZVBB-NEXT:    add a1, a6, a1
+; ZVBB-NEXT:    vle8.v v12, (a2)
+; ZVBB-NEXT:    vle8.v v8, (a0)
+; ZVBB-NEXT:    vle8.v v13, (a3)
+; ZVBB-NEXT:    vle8.v v14, (a4)
+; ZVBB-NEXT:    vle8.v v15, (a1)
+; ZVBB-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; ZVBB-NEXT:    vslideup.vi v10, v9, 2
+; ZVBB-NEXT:    vslideup.vi v8, v12, 2
+; ZVBB-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; ZVBB-NEXT:    vslideup.vi v10, v11, 4
+; ZVBB-NEXT:    vslideup.vi v8, v13, 4
+; ZVBB-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; ZVBB-NEXT:    vslideup.vi v10, v15, 6
+; ZVBB-NEXT:    vslideup.vi v8, v14, 6
+; ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vi v8, v10, 8
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    .cfi_def_cfa sp, 16
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    .cfi_def_cfa_offset 0
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave8_v16i8_v2i8:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    addi sp, sp, -16
+; ZIP-NEXT:    .cfi_def_cfa_offset 16
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    sub sp, sp, a0
+; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
+; ZIP-NEXT:    addi a0, sp, 16
+; ZIP-NEXT:    csrr a1, vlenb
+; ZIP-NEXT:    srli a1, a1, 3
+; ZIP-NEXT:    add a2, a0, a1
+; ZIP-NEXT:    add a3, a2, a1
+; ZIP-NEXT:    add a4, a3, a1
+; ZIP-NEXT:    add a5, a4, a1
+; ZIP-NEXT:    add a6, a5, a1
+; ZIP-NEXT:    vsetvli a7, zero, e8, mf8, ta, ma
+; ZIP-NEXT:    vsseg8e8.v v8, (a0)
+; ZIP-NEXT:    vle8.v v9, (a6)
+; ZIP-NEXT:    add a6, a6, a1
+; ZIP-NEXT:    vle8.v v10, (a5)
+; ZIP-NEXT:    vle8.v v11, (a6)
+; ZIP-NEXT:    add a1, a6, a1
+; ZIP-NEXT:    vle8.v v12, (a2)
+; ZIP-NEXT:    vle8.v v8, (a0)
+; ZIP-NEXT:    vle8.v v13, (a3)
+; ZIP-NEXT:    vle8.v v14, (a4)
+; ZIP-NEXT:    vle8.v v15, (a1)
+; ZIP-NEXT:    vsetivli zero, 4, e8, mf2, tu, ma
+; ZIP-NEXT:    vslideup.vi v10, v9, 2
+; ZIP-NEXT:    vslideup.vi v8, v12, 2
+; ZIP-NEXT:    vsetivli zero, 6, e8, mf2, tu, ma
+; ZIP-NEXT:    vslideup.vi v10, v11, 4
+; ZIP-NEXT:    vslideup.vi v8, v13, 4
+; ZIP-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; ZIP-NEXT:    vslideup.vi v10, v15, 6
+; ZIP-NEXT:    vslideup.vi v8, v14, 6
+; ZIP-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
+; ZIP-NEXT:    vslideup.vi v8, v10, 8
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    add sp, sp, a0
+; ZIP-NEXT:    .cfi_def_cfa sp, 16
+; ZIP-NEXT:    addi sp, sp, 16
+; ZIP-NEXT:    .cfi_def_cfa_offset 0
+; ZIP-NEXT:    ret
+	   %res = call <16 x i8> @llvm.vector.interleave8.v16i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g, <2 x i8> %h)
+	   ret <16 x i8> %res
+}
 
 ; Floats
 
@@ -689,19 +1053,113 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double>
 ; ZVBB-NEXT:    vmv.v.v v8, v10
 ; ZVBB-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_v4f64_v2f64:
+; ZIP-LABEL: vector_interleave_v4f64_v2f64:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
+; ZIP-NEXT:    vmv1r.v v12, v9
+; ZIP-NEXT:    ri.vzip2a.vv v10, v8, v12
+; ZIP-NEXT:    vmv.v.v v8, v10
+; ZIP-NEXT:    ret
+	   %res = call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
+	   ret <4 x double> %res
+}
+
+define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+; CHECK-LABEL: vector_interleave3_v6f32_v2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a1, a1, 1
+; CHECK-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsseg3e32.v v8, (a0)
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    vle32.v v9, (a2)
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    vle32.v v10, (a1)
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave3_v6f32_v2f32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    .cfi_def_cfa_offset 16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a1, a1, 1
+; ZVBB-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vsseg3e32.v v8, (a0)
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    vle32.v v9, (a2)
+; ZVBB-NEXT:    vle32.v v8, (a0)
+; ZVBB-NEXT:    add a1, a2, a1
+; ZVBB-NEXT:    vle32.v v10, (a1)
+; ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vi v8, v9, 2
+; ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; ZVBB-NEXT:    vslideup.vi v8, v10, 4
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    .cfi_def_cfa sp, 16
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    .cfi_def_cfa_offset 0
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave3_v6f32_v2f32:
 ; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
-; ZIP-NEXT:    vmv1r.v v12, v9
-; ZIP-NEXT:    ri.vzip2a.vv v10, v8, v12
-; ZIP-NEXT:    vmv.v.v v8, v10
+; ZIP-NEXT:    addi sp, sp, -16
+; ZIP-NEXT:    .cfi_def_cfa_offset 16
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a0, a0, 1
+; ZIP-NEXT:    sub sp, sp, a0
+; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZIP-NEXT:    addi a0, sp, 16
+; ZIP-NEXT:    csrr a1, vlenb
+; ZIP-NEXT:    srli a1, a1, 1
+; ZIP-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
+; ZIP-NEXT:    vsseg3e32.v v8, (a0)
+; ZIP-NEXT:    add a2, a0, a1
+; ZIP-NEXT:    vle32.v v9, (a2)
+; ZIP-NEXT:    vle32.v v8, (a0)
+; ZIP-NEXT:    add a1, a2, a1
+; ZIP-NEXT:    vle32.v v10, (a1)
+; ZIP-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; ZIP-NEXT:    vslideup.vi v8, v9, 2
+; ZIP-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; ZIP-NEXT:    vslideup.vi v8, v10, 4
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a0, a0, 1
+; ZIP-NEXT:    add sp, sp, a0
+; ZIP-NEXT:    .cfi_def_cfa sp, 16
+; ZIP-NEXT:    addi sp, sp, 16
+; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
-	   %res = call <4 x double> @llvm.vector.interleave2.v4f64(<2 x double> %a, <2 x double> %b)
-	   ret <4 x double> %res
+	   %res = call <6 x float> @llvm.vector.interleave3.v6f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
+	   ret <6 x float> %res
 }
 
-define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
-; CHECK-LABEL: vector_interleave3_v632_v2f32:
+define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) {
+; CHECK-LABEL: vector_interleave4_v8f32_v2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
@@ -712,14 +1170,17 @@ define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 1
-; CHECK-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsseg3e32.v v8, (a0)
 ; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsseg4e32.v v8, (a0)
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vle32.v v10, (a3)
 ; CHECK-NEXT:    vle32.v v9, (a2)
+; CHECK-NEXT:    vle32.v v11, (a1)
 ; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    add a1, a2, a1
-; CHECK-NEXT:    vle32.v v10, (a1)
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v10, v11, 2
 ; CHECK-NEXT:    vslideup.vi v8, v9, 2
 ; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; CHECK-NEXT:    vslideup.vi v8, v10, 4
@@ -731,7 +1192,7 @@ define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b
 ; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave3_v632_v2f32:
+; ZVBB-LABEL: vector_interleave4_v8f32_v2f32:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    .cfi_def_cfa_offset 16
@@ -742,14 +1203,17 @@ define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a1, a1, 1
-; ZVBB-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
-; ZVBB-NEXT:    vsseg3e32.v v8, (a0)
 ; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vsseg4e32.v v8, (a0)
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vle32.v v10, (a3)
 ; ZVBB-NEXT:    vle32.v v9, (a2)
+; ZVBB-NEXT:    vle32.v v11, (a1)
 ; ZVBB-NEXT:    vle32.v v8, (a0)
-; ZVBB-NEXT:    add a1, a2, a1
-; ZVBB-NEXT:    vle32.v v10, (a1)
 ; ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vi v10, v11, 2
 ; ZVBB-NEXT:    vslideup.vi v8, v9, 2
 ; ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; ZVBB-NEXT:    vslideup.vi v8, v10, 4
@@ -761,7 +1225,7 @@ define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b
 ; ZVBB-NEXT:    .cfi_def_cfa_offset 0
 ; ZVBB-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave3_v632_v2f32:
+; ZIP-LABEL: vector_interleave4_v8f32_v2f32:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -16
 ; ZIP-NEXT:    .cfi_def_cfa_offset 16
@@ -772,14 +1236,17 @@ define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b
 ; ZIP-NEXT:    addi a0, sp, 16
 ; ZIP-NEXT:    csrr a1, vlenb
 ; ZIP-NEXT:    srli a1, a1, 1
-; ZIP-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
-; ZIP-NEXT:    vsseg3e32.v v8, (a0)
 ; ZIP-NEXT:    add a2, a0, a1
+; ZIP-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
+; ZIP-NEXT:    vsseg4e32.v v8, (a0)
+; ZIP-NEXT:    add a3, a2, a1
+; ZIP-NEXT:    add a1, a3, a1
+; ZIP-NEXT:    vle32.v v10, (a3)
 ; ZIP-NEXT:    vle32.v v9, (a2)
+; ZIP-NEXT:    vle32.v v11, (a1)
 ; ZIP-NEXT:    vle32.v v8, (a0)
-; ZIP-NEXT:    add a1, a2, a1
-; ZIP-NEXT:    vle32.v v10, (a1)
 ; ZIP-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; ZIP-NEXT:    vslideup.vi v10, v11, 2
 ; ZIP-NEXT:    vslideup.vi v8, v9, 2
 ; ZIP-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
 ; ZIP-NEXT:    vslideup.vi v8, v10, 4
@@ -790,11 +1257,10 @@ define <6 x float> @vector_interleave3_v632_v2f32(<2 x float> %a, <2 x float> %b
 ; ZIP-NEXT:    addi sp, sp, 16
 ; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
-	   %res = call <6 x float> @llvm.vector.interleave3.v6f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
-	   ret <6 x float> %res
+	   %res = call <8 x float> @llvm.vector.interleave4.v8f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d)
+	   ret <8 x float> %res
 }
 
-
 define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e) {
 ; CHECK-LABEL: vector_interleave5_v10f16_v2f16:
 ; CHECK:       # %bb.0:
@@ -910,6 +1376,130 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b
 	   ret <10 x half> %res
 }
 
+define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e, <2 x half> %f) {
+; CHECK-LABEL: vector_interleave6_v12f16_v2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vsetvli a4, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg6e16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a2)
+; CHECK-NEXT:    add a2, a3, a1
+; CHECK-NEXT:    vle16.v v11, (a2)
+; CHECK-NEXT:    add a2, a2, a1
+; CHECK-NEXT:    vle16.v v12, (a3)
+; CHECK-NEXT:    add a1, a2, a1
+; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v13, (a1)
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v12, v11, 2
+; CHECK-NEXT:    vslideup.vi v8, v9, 2
+; CHECK-NEXT:    vslideup.vi v10, v13, 2
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v12, 4
+; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 8
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave6_v12f16_v2f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    .cfi_def_cfa_offset 16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a1, a1, 2
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e16, mf4, ta, ma
+; ZVBB-NEXT:    vsseg6e16.v v8, (a0)
+; ZVBB-NEXT:    vle16.v v9, (a2)
+; ZVBB-NEXT:    add a2, a3, a1
+; ZVBB-NEXT:    vle16.v v11, (a2)
+; ZVBB-NEXT:    add a2, a2, a1
+; ZVBB-NEXT:    vle16.v v12, (a3)
+; ZVBB-NEXT:    add a1, a2, a1
+; ZVBB-NEXT:    vle16.v v10, (a2)
+; ZVBB-NEXT:    vle16.v v8, (a0)
+; ZVBB-NEXT:    vle16.v v13, (a1)
+; ZVBB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVBB-NEXT:    vslideup.vi v12, v11, 2
+; ZVBB-NEXT:    vslideup.vi v8, v9, 2
+; ZVBB-NEXT:    vslideup.vi v10, v13, 2
+; ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vi v8, v12, 4
+; ZVBB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVBB-NEXT:    vslideup.vi v8, v10, 8
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    .cfi_def_cfa sp, 16
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    .cfi_def_cfa_offset 0
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave6_v12f16_v2f16:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    addi sp, sp, -16
+; ZIP-NEXT:    .cfi_def_cfa_offset 16
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a0, a0, 1
+; ZIP-NEXT:    sub sp, sp, a0
+; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZIP-NEXT:    addi a0, sp, 16
+; ZIP-NEXT:    csrr a1, vlenb
+; ZIP-NEXT:    srli a1, a1, 2
+; ZIP-NEXT:    add a2, a0, a1
+; ZIP-NEXT:    add a3, a2, a1
+; ZIP-NEXT:    vsetvli a4, zero, e16, mf4, ta, ma
+; ZIP-NEXT:    vsseg6e16.v v8, (a0)
+; ZIP-NEXT:    vle16.v v9, (a2)
+; ZIP-NEXT:    add a2, a3, a1
+; ZIP-NEXT:    vle16.v v11, (a2)
+; ZIP-NEXT:    add a2, a2, a1
+; ZIP-NEXT:    vle16.v v12, (a3)
+; ZIP-NEXT:    add a1, a2, a1
+; ZIP-NEXT:    vle16.v v10, (a2)
+; ZIP-NEXT:    vle16.v v8, (a0)
+; ZIP-NEXT:    vle16.v v13, (a1)
+; ZIP-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZIP-NEXT:    vslideup.vi v12, v11, 2
+; ZIP-NEXT:    vslideup.vi v8, v9, 2
+; ZIP-NEXT:    vslideup.vi v10, v13, 2
+; ZIP-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZIP-NEXT:    vslideup.vi v8, v12, 4
+; ZIP-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZIP-NEXT:    vslideup.vi v8, v10, 8
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a0, a0, 1
+; ZIP-NEXT:    add sp, sp, a0
+; ZIP-NEXT:    .cfi_def_cfa sp, 16
+; ZIP-NEXT:    addi sp, sp, 16
+; ZIP-NEXT:    .cfi_def_cfa_offset 0
+; ZIP-NEXT:    ret
+	   %res = call <12 x half> @llvm.vector.interleave6.v12f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e, <2 x half> %f)
+	   ret <12 x half> %res
+}
+
 define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g) {
 ; CHECK-LABEL: vector_interleave7_v7f16_v1f16:
 ; CHECK:       # %bb.0:
@@ -1045,3 +1635,148 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b,
 	   %res = call <7 x half> @llvm.vector.interleave7.v7f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g)
 	   ret <7 x half> %res
 }
+
+define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g, <1 x half> %h) {
+; CHECK-LABEL: vector_interleave8_v8f16_v1f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    add a4, a3, a1
+; CHECK-NEXT:    add a5, a4, a1
+; CHECK-NEXT:    add a6, a5, a1
+; CHECK-NEXT:    vsetvli a7, zero, e16, mf4, ta, ma
+; CHECK-NEXT:    vsseg8e16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v9, (a6)
+; CHECK-NEXT:    add a6, a6, a1
+; CHECK-NEXT:    vle16.v v10, (a5)
+; CHECK-NEXT:    vle16.v v11, (a6)
+; CHECK-NEXT:    add a1, a6, a1
+; CHECK-NEXT:    vle16.v v12, (a2)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vle16.v v13, (a3)
+; CHECK-NEXT:    vle16.v v14, (a4)
+; CHECK-NEXT:    vle16.v v15, (a1)
+; CHECK-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v10, v9, 1
+; CHECK-NEXT:    vslideup.vi v8, v12, 1
+; CHECK-NEXT:    vsetivli zero, 3, e16, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vi v10, v11, 2
+; CHECK-NEXT:    vslideup.vi v8, v13, 2
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vi v10, v15, 3
+; CHECK-NEXT:    vslideup.vi v8, v14, 3
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vi v8, v10, 4
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    .cfi_def_cfa sp, 16
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    .cfi_def_cfa_offset 0
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave8_v8f16_v1f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    .cfi_def_cfa_offset 16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a1, a1, 2
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    add a4, a3, a1
+; ZVBB-NEXT:    add a5, a4, a1
+; ZVBB-NEXT:    add a6, a5, a1
+; ZVBB-NEXT:    vsetvli a7, zero, e16, mf4, ta, ma
+; ZVBB-NEXT:    vsseg8e16.v v8, (a0)
+; ZVBB-NEXT:    vle16.v v9, (a6)
+; ZVBB-NEXT:    add a6, a6, a1
+; ZVBB-NEXT:    vle16.v v10, (a5)
+; ZVBB-NEXT:    vle16.v v11, (a6)
+; ZVBB-NEXT:    add a1, a6, a1
+; ZVBB-NEXT:    vle16.v v12, (a2)
+; ZVBB-NEXT:    vle16.v v8, (a0)
+; ZVBB-NEXT:    vle16.v v13, (a3)
+; ZVBB-NEXT:    vle16.v v14, (a4)
+; ZVBB-NEXT:    vle16.v v15, (a1)
+; ZVBB-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
+; ZVBB-NEXT:    vslideup.vi v10, v9, 1
+; ZVBB-NEXT:    vslideup.vi v8, v12, 1
+; ZVBB-NEXT:    vsetivli zero, 3, e16, mf2, tu, ma
+; ZVBB-NEXT:    vslideup.vi v10, v11, 2
+; ZVBB-NEXT:    vslideup.vi v8, v13, 2
+; ZVBB-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVBB-NEXT:    vslideup.vi v10, v15, 3
+; ZVBB-NEXT:    vslideup.vi v8, v14, 3
+; ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vi v8, v10, 4
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    .cfi_def_cfa sp, 16
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    .cfi_def_cfa_offset 0
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave8_v8f16_v1f16:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    addi sp, sp, -16
+; ZIP-NEXT:    .cfi_def_cfa_offset 16
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a0, a0, 1
+; ZIP-NEXT:    sub sp, sp, a0
+; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
+; ZIP-NEXT:    addi a0, sp, 16
+; ZIP-NEXT:    csrr a1, vlenb
+; ZIP-NEXT:    srli a1, a1, 2
+; ZIP-NEXT:    add a2, a0, a1
+; ZIP-NEXT:    add a3, a2, a1
+; ZIP-NEXT:    add a4, a3, a1
+; ZIP-NEXT:    add a5, a4, a1
+; ZIP-NEXT:    add a6, a5, a1
+; ZIP-NEXT:    vsetvli a7, zero, e16, mf4, ta, ma
+; ZIP-NEXT:    vsseg8e16.v v8, (a0)
+; ZIP-NEXT:    vle16.v v9, (a6)
+; ZIP-NEXT:    add a6, a6, a1
+; ZIP-NEXT:    vle16.v v10, (a5)
+; ZIP-NEXT:    vle16.v v11, (a6)
+; ZIP-NEXT:    add a1, a6, a1
+; ZIP-NEXT:    vle16.v v12, (a2)
+; ZIP-NEXT:    vle16.v v8, (a0)
+; ZIP-NEXT:    vle16.v v13, (a3)
+; ZIP-NEXT:    vle16.v v14, (a4)
+; ZIP-NEXT:    vle16.v v15, (a1)
+; ZIP-NEXT:    vsetivli zero, 2, e16, mf2, tu, ma
+; ZIP-NEXT:    vslideup.vi v10, v9, 1
+; ZIP-NEXT:    vslideup.vi v8, v12, 1
+; ZIP-NEXT:    vsetivli zero, 3, e16, mf2, tu, ma
+; ZIP-NEXT:    vslideup.vi v10, v11, 2
+; ZIP-NEXT:    vslideup.vi v8, v13, 2
+; ZIP-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZIP-NEXT:    vslideup.vi v10, v15, 3
+; ZIP-NEXT:    vslideup.vi v8, v14, 3
+; ZIP-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZIP-NEXT:    vslideup.vi v8, v10, 4
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a0, a0, 1
+; ZIP-NEXT:    add sp, sp, a0
+; ZIP-NEXT:    .cfi_def_cfa sp, 16
+; ZIP-NEXT:    addi sp, sp, 16
+; ZIP-NEXT:    .cfi_def_cfa_offset 0
+; ZIP-NEXT:    ret
+	   %res = call <8 x half> @llvm.vector.interleave8.v8f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g, <1 x half> %h)
+	   ret <8 x half> %res
+}
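
(For readers skimming the generated checks: the fixed-vector tests above all reduce to the same IR shape, shown here as a standalone sketch. The intrinsic name and vector types are copied verbatim from the test bodies; the function name and the lane-numbering comment are illustrative only.)

    ; Sketch: interleave eight single-element vectors into one 8-element vector.
    ; In general, result lane N*i + j takes lane i of operand j for interleaveN;
    ; with <1 x half> operands this is simply [a, b, c, d, e, f, g, h].
    define <8 x half> @interleave8_sketch(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g, <1 x half> %h) {
      %res = call <8 x half> @llvm.vector.interleave8.v8f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g, <1 x half> %h)
      ret <8 x half> %res
    }
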
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
index 7347000bf5c71..77723609a60c7 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll
@@ -786,6 +786,313 @@ define <vscale x 6 x i64> @vector_interleave_nxv6i64_nxv2i64(<vscale x 2 x i64>
   ret <vscale x 6 x i64> %res
 }
 
+define <vscale x 64 x i1> @vector_interleave_nxv64i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d) nounwind {
+; CHECK-LABEL: vector_interleave_nxv64i1_nxv16i1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmv1r.v v11, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmv.v.i v12, 0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vmerge.vim v16, v12, 1, v0
+; CHECK-NEXT:    slli a2, a1, 1
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v14, v12, 1, v0
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v18, v12, 1, v0
+; CHECK-NEXT:    add a4, a3, a2
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v20, v12, 1, v0
+; CHECK-NEXT:    add a2, a4, a2
+; CHECK-NEXT:    vsseg4e8.v v14, (a0)
+; CHECK-NEXT:    vl2r.v v8, (a2)
+; CHECK-NEXT:    srli a2, a1, 2
+; CHECK-NEXT:    srli a1, a1, 1
+; CHECK-NEXT:    vl2r.v v10, (a4)
+; CHECK-NEXT:    add a4, a2, a2
+; CHECK-NEXT:    vl2r.v v12, (a3)
+; CHECK-NEXT:    vl2r.v v14, (a0)
+; CHECK-NEXT:    vmsne.vi v16, v8, 0
+; CHECK-NEXT:    vmsne.vi v8, v10, 0
+; CHECK-NEXT:    vmsne.vi v9, v12, 0
+; CHECK-NEXT:    vmsne.vi v0, v14, 0
+; CHECK-NEXT:    vsetvli zero, a4, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v16, a2
+; CHECK-NEXT:    vslideup.vx v0, v9, a2
+; CHECK-NEXT:    add a0, a1, a1
+; CHECK-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v0, v8, a1
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64i1_nxv16i1:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT:    vmv1r.v v11, v0
+; ZVBB-NEXT:    vmv1r.v v0, v8
+; ZVBB-NEXT:    vmv.v.i v12, 0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    vmerge.vim v16, v12, 1, v0
+; ZVBB-NEXT:    slli a2, a1, 1
+; ZVBB-NEXT:    vmv1r.v v0, v11
+; ZVBB-NEXT:    vmerge.vim v14, v12, 1, v0
+; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    vmv1r.v v0, v9
+; ZVBB-NEXT:    vmerge.vim v18, v12, 1, v0
+; ZVBB-NEXT:    add a4, a3, a2
+; ZVBB-NEXT:    vmv1r.v v0, v10
+; ZVBB-NEXT:    vmerge.vim v20, v12, 1, v0
+; ZVBB-NEXT:    add a2, a4, a2
+; ZVBB-NEXT:    vsseg4e8.v v14, (a0)
+; ZVBB-NEXT:    vl2r.v v8, (a2)
+; ZVBB-NEXT:    srli a2, a1, 2
+; ZVBB-NEXT:    srli a1, a1, 1
+; ZVBB-NEXT:    vl2r.v v10, (a4)
+; ZVBB-NEXT:    add a4, a2, a2
+; ZVBB-NEXT:    vl2r.v v12, (a3)
+; ZVBB-NEXT:    vl2r.v v14, (a0)
+; ZVBB-NEXT:    vmsne.vi v16, v8, 0
+; ZVBB-NEXT:    vmsne.vi v8, v10, 0
+; ZVBB-NEXT:    vmsne.vi v9, v12, 0
+; ZVBB-NEXT:    vmsne.vi v0, v14, 0
+; ZVBB-NEXT:    vsetvli zero, a4, e8, mf2, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v16, a2
+; ZVBB-NEXT:    vslideup.vx v0, v9, a2
+; ZVBB-NEXT:    add a0, a1, a1
+; ZVBB-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v0, v8, a1
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 64 x i1> @llvm.vector.interleave4.nxv64i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d)
+  ret <vscale x 64 x i1> %res
+}
+
+define <vscale x 64 x i8> @vector_interleave_nxv64i8_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d) nounwind {
+;
+; CHECK-LABEL: vector_interleave_nxv64i8_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vsseg4e8.v v8, (a0)
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vl2r.v v12, (a3)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl2r.v v14, (a1)
+; CHECK-NEXT:    vl2r.v v8, (a0)
+; CHECK-NEXT:    vl2r.v v10, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64i8_nxv16i8:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 1
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
+; ZVBB-NEXT:    vsseg4e8.v v8, (a0)
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vl2r.v v12, (a3)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl2r.v v14, (a1)
+; ZVBB-NEXT:    vl2r.v v8, (a0)
+; ZVBB-NEXT:    vl2r.v v10, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv64i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d)
+  ret <vscale x 64 x i8> %res
+}
+
+define <vscale x 32 x i8> @vector_interleave_nxv32i8_nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, <vscale x 8 x i8> %c, <vscale x 8 x i8> %d) nounwind {
+; CHECK-LABEL: vector_interleave_nxv32i8_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vsseg4e8.v v8, (a0)
+; CHECK-NEXT:    vl1r.v v10, (a3)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl1r.v v11, (a1)
+; CHECK-NEXT:    vl1r.v v8, (a0)
+; CHECK-NEXT:    vl1r.v v9, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv32i8_nxv8i8:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
+; ZVBB-NEXT:    vsseg4e8.v v8, (a0)
+; ZVBB-NEXT:    vl1r.v v10, (a3)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl1r.v v11, (a1)
+; ZVBB-NEXT:    vl1r.v v8, (a0)
+; ZVBB-NEXT:    vl1r.v v9, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 32 x i8> @llvm.vector.interleave4.nxv32i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, <vscale x 8 x i8> %c, <vscale x 8 x i8> %d)
+  ret <vscale x 32 x i8> %res
+}
+
+define <vscale x 16 x i32> @vector_interleave_nxv16i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d) nounwind {
+;
+; CHECK-LABEL: vector_interleave_nxv16i32_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    vsetvli a3, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vsseg4e32.v v8, (a0)
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vl2re32.v v12, (a3)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl2re32.v v14, (a1)
+; CHECK-NEXT:    vl2re32.v v8, (a0)
+; CHECK-NEXT:    vl2re32.v v10, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16i32_nxv4i32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 1
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    vsetvli a3, zero, e32, m2, ta, ma
+; ZVBB-NEXT:    vsseg4e32.v v8, (a0)
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vl2re32.v v12, (a3)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl2re32.v v14, (a1)
+; ZVBB-NEXT:    vl2re32.v v8, (a0)
+; ZVBB-NEXT:    vl2re32.v v10, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d)
+  ret <vscale x 16 x i32> %res
+}
+
+define <vscale x 8 x i64> @vector_interleave_nxv8i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d) nounwind {
+;
+; CHECK-LABEL: vector_interleave_nxv8i64_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vsseg4e64.v v8, (a0)
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vl2re64.v v12, (a3)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl2re64.v v14, (a1)
+; CHECK-NEXT:    vl2re64.v v8, (a0)
+; CHECK-NEXT:    vl2re64.v v10, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8i64_nxv2i64:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 1
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
+; ZVBB-NEXT:    vsseg4e64.v v8, (a0)
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vl2re64.v v12, (a3)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl2re64.v v14, (a1)
+; ZVBB-NEXT:    vl2re64.v v8, (a0)
+; ZVBB-NEXT:    vl2re64.v v10, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d)
+  ret <vscale x 8 x i64> %res
+}
+
 define <vscale x 80 x i1> @vector_interleave_nxv80i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e) nounwind {
 ; CHECK-LABEL: vector_interleave_nxv80i1_nxv16i1:
 ; CHECK:       # %bb.0:
@@ -2009,1449 +2316,1552 @@ define <vscale x 10 x i64> @vector_interleave_nxv10i64_nxv2i64(<vscale x 2 x i64
   ret <vscale x 10 x i64> %res
 }
 
-define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f, <vscale x 16 x i1> %g) nounwind {
-; CHECK-LABEL: vector_interleave_nxv112i1_nxv16i1:
+define <vscale x 96 x i1> @vector_interleave_nxv96i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f) nounwind {
+; CHECK-LABEL: vector_interleave_nxv96i1_nxv16i1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 14
+; CHECK-NEXT:    li a1, 12
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmv.v.i v14, 0
-; CHECK-NEXT:    addi a4, sp, 16
+; CHECK-NEXT:    vmv.v.i v20, 0
+; CHECK-NEXT:    vmerge.vim v14, v20, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v22, v20, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmv1r.v v16, v23
+; CHECK-NEXT:    vmerge.vim v8, v20, 1, v0
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 3
-; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add a0, sp, a0
 ; CHECK-NEXT:    addi a0, a0, 16
-; CHECK-NEXT:    csrr a2, vlenb
-; CHECK-NEXT:    vmerge.vim v16, v14, 1, v0
-; CHECK-NEXT:    vmv1r.v v0, v8
-; CHECK-NEXT:    vmerge.vim v22, v14, 1, v0
-; CHECK-NEXT:    add a3, a4, a2
-; CHECK-NEXT:    srli a1, a2, 2
-; CHECK-NEXT:    add a5, a0, a2
-; CHECK-NEXT:    vmv4r.v v24, v16
-; CHECK-NEXT:    vmv1r.v v0, v9
-; CHECK-NEXT:    vmerge.vim v18, v14, 1, v0
-; CHECK-NEXT:    add a6, a3, a2
-; CHECK-NEXT:    vmv1r.v v25, v22
+; CHECK-NEXT:    vmv1r.v v17, v9
 ; CHECK-NEXT:    vmv1r.v v0, v10
-; CHECK-NEXT:    vmerge.vim v8, v14, 1, v0
-; CHECK-NEXT:    vmv1r.v v26, v18
+; CHECK-NEXT:    vmerge.vim v24, v20, 1, v0
+; CHECK-NEXT:    addi a5, sp, 16
+; CHECK-NEXT:    vmv1r.v v18, v25
 ; CHECK-NEXT:    vmv1r.v v0, v11
-; CHECK-NEXT:    vmerge.vim v20, v14, 1, v0
-; CHECK-NEXT:    vmv1r.v v27, v8
+; CHECK-NEXT:    vmerge.vim v26, v20, 1, v0
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vmv1r.v v19, v27
 ; CHECK-NEXT:    vmv1r.v v0, v12
-; CHECK-NEXT:    vmerge.vim v10, v14, 1, v0
-; CHECK-NEXT:    vmv1r.v v28, v20
-; CHECK-NEXT:    vmv1r.v v18, v23
+; CHECK-NEXT:    vmerge.vim v10, v20, 1, v0
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    vmv1r.v v20, v11
+; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vsseg6e8.v v15, (a0)
+; CHECK-NEXT:    vmv1r.v v15, v22
+; CHECK-NEXT:    add a4, a5, a2
+; CHECK-NEXT:    vmv1r.v v16, v8
+; CHECK-NEXT:    srli a1, a2, 2
+; CHECK-NEXT:    vmv1r.v v17, v24
+; CHECK-NEXT:    add a6, a4, a2
+; CHECK-NEXT:    vmv1r.v v18, v26
+; CHECK-NEXT:    add a7, a3, a2
+; CHECK-NEXT:    vmv1r.v v19, v10
+; CHECK-NEXT:    vsseg6e8.v v14, (a5)
+; CHECK-NEXT:    vl1r.v v8, (a0)
+; CHECK-NEXT:    add a0, a6, a2
+; CHECK-NEXT:    vl1r.v v10, (a6)
+; CHECK-NEXT:    add a6, a7, a2
+; CHECK-NEXT:    vl1r.v v12, (a5)
+; CHECK-NEXT:    add a5, a0, a2
+; CHECK-NEXT:    vl1r.v v14, (a7)
 ; CHECK-NEXT:    add a7, a6, a2
-; CHECK-NEXT:    vmv1r.v v29, v10
-; CHECK-NEXT:    vmv1r.v v20, v9
-; CHECK-NEXT:    vmv1r.v v0, v13
-; CHECK-NEXT:    vmerge.vim v30, v14, 1, v0
-; CHECK-NEXT:    vmv1r.v v22, v11
-; CHECK-NEXT:    vsetvli t0, zero, e8, m1, ta, ma
-; CHECK-NEXT:    vsseg7e8.v v24, (a4)
-; CHECK-NEXT:    vmv1r.v v23, v31
-; CHECK-NEXT:    vsseg7e8.v v17, (a0)
-; CHECK-NEXT:    vl1r.v v8, (a6)
-; CHECK-NEXT:    add a6, a7, a2
-; CHECK-NEXT:    vl1r.v v10, (a4)
-; CHECK-NEXT:    add a4, a6, a2
-; CHECK-NEXT:    vl1r.v v12, (a6)
-; CHECK-NEXT:    add a6, a4, a2
-; CHECK-NEXT:    vl1r.v v14, (a6)
-; CHECK-NEXT:    add a6, a5, a2
 ; CHECK-NEXT:    vl1r.v v16, (a5)
-; CHECK-NEXT:    add a5, a6, a2
-; CHECK-NEXT:    vl1r.v v18, (a5)
 ; CHECK-NEXT:    add a5, a5, a2
-; CHECK-NEXT:    vl1r.v v9, (a7)
-; CHECK-NEXT:    add a7, a5, a2
-; CHECK-NEXT:    vl1r.v v20, (a7)
+; CHECK-NEXT:    vl1r.v v18, (a7)
 ; CHECK-NEXT:    add a7, a7, a2
 ; CHECK-NEXT:    srli a2, a2, 1
-; CHECK-NEXT:    vl1r.v v11, (a3)
+; CHECK-NEXT:    vl1r.v v9, (a3)
 ; CHECK-NEXT:    add a3, a1, a1
+; CHECK-NEXT:    vl1r.v v17, (a5)
+; CHECK-NEXT:    add a5, a2, a2
+; CHECK-NEXT:    vl1r.v v11, (a0)
 ; CHECK-NEXT:    vl1r.v v13, (a4)
-; CHECK-NEXT:    add a4, a2, a2
-; CHECK-NEXT:    vl1r.v v15, (a0)
-; CHECK-NEXT:    vl1r.v v19, (a5)
-; CHECK-NEXT:    vl1r.v v17, (a6)
-; CHECK-NEXT:    vl1r.v v21, (a7)
+; CHECK-NEXT:    vl1r.v v19, (a7)
+; CHECK-NEXT:    vl1r.v v15, (a6)
 ; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; CHECK-NEXT:    vmsne.vi v22, v8, 0
-; CHECK-NEXT:    vmsne.vi v0, v10, 0
-; CHECK-NEXT:    vmsne.vi v9, v12, 0
-; CHECK-NEXT:    vmsne.vi v10, v14, 0
-; CHECK-NEXT:    vmsne.vi v11, v18, 0
-; CHECK-NEXT:    vmsne.vi v8, v16, 0
-; CHECK-NEXT:    vmsne.vi v12, v20, 0
+; CHECK-NEXT:    vmsne.vi v20, v8, 0
+; CHECK-NEXT:    vmsne.vi v9, v16, 0
+; CHECK-NEXT:    vmsne.vi v16, v10, 0
+; CHECK-NEXT:    vmsne.vi v0, v12, 0
+; CHECK-NEXT:    vmsne.vi v10, v18, 0
+; CHECK-NEXT:    vmsne.vi v8, v14, 0
 ; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
-; CHECK-NEXT:    vslideup.vx v0, v22, a1
-; CHECK-NEXT:    vslideup.vx v9, v10, a1
-; CHECK-NEXT:    vslideup.vx v8, v11, a1
-; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v20, a1
+; CHECK-NEXT:    vslideup.vx v0, v16, a1
+; CHECK-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v0, v9, a2
-; CHECK-NEXT:    vslideup.vx v8, v12, a2
+; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v10, a1
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 14
+; CHECK-NEXT:    li a1, 12
 ; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv112i1_nxv16i1:
+; ZVBB-LABEL: vector_interleave_nxv96i1_nxv16i1:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    li a1, 14
+; ZVBB-NEXT:    li a1, 12
 ; ZVBB-NEXT:    mul a0, a0, a1
 ; ZVBB-NEXT:    sub sp, sp, a0
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; ZVBB-NEXT:    vmv.v.i v14, 0
-; ZVBB-NEXT:    addi a4, sp, 16
+; ZVBB-NEXT:    vmv.v.i v20, 0
+; ZVBB-NEXT:    vmerge.vim v14, v20, 1, v0
+; ZVBB-NEXT:    vmv1r.v v0, v8
+; ZVBB-NEXT:    vmerge.vim v22, v20, 1, v0
+; ZVBB-NEXT:    vmv1r.v v0, v9
+; ZVBB-NEXT:    vmv1r.v v16, v23
+; ZVBB-NEXT:    vmerge.vim v8, v20, 1, v0
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 3
-; ZVBB-NEXT:    sub a0, a1, a0
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
 ; ZVBB-NEXT:    add a0, sp, a0
 ; ZVBB-NEXT:    addi a0, a0, 16
-; ZVBB-NEXT:    csrr a2, vlenb
-; ZVBB-NEXT:    vmerge.vim v16, v14, 1, v0
-; ZVBB-NEXT:    vmv1r.v v0, v8
-; ZVBB-NEXT:    vmerge.vim v22, v14, 1, v0
-; ZVBB-NEXT:    add a3, a4, a2
-; ZVBB-NEXT:    srli a1, a2, 2
-; ZVBB-NEXT:    add a5, a0, a2
-; ZVBB-NEXT:    vmv4r.v v24, v16
-; ZVBB-NEXT:    vmv1r.v v0, v9
-; ZVBB-NEXT:    vmerge.vim v18, v14, 1, v0
-; ZVBB-NEXT:    add a6, a3, a2
-; ZVBB-NEXT:    vmv1r.v v25, v22
+; ZVBB-NEXT:    vmv1r.v v17, v9
 ; ZVBB-NEXT:    vmv1r.v v0, v10
-; ZVBB-NEXT:    vmerge.vim v8, v14, 1, v0
-; ZVBB-NEXT:    vmv1r.v v26, v18
+; ZVBB-NEXT:    vmerge.vim v24, v20, 1, v0
+; ZVBB-NEXT:    addi a5, sp, 16
+; ZVBB-NEXT:    vmv1r.v v18, v25
 ; ZVBB-NEXT:    vmv1r.v v0, v11
-; ZVBB-NEXT:    vmerge.vim v20, v14, 1, v0
-; ZVBB-NEXT:    vmv1r.v v27, v8
+; ZVBB-NEXT:    vmerge.vim v26, v20, 1, v0
+; ZVBB-NEXT:    csrr a2, vlenb
+; ZVBB-NEXT:    vmv1r.v v19, v27
 ; ZVBB-NEXT:    vmv1r.v v0, v12
-; ZVBB-NEXT:    vmerge.vim v10, v14, 1, v0
-; ZVBB-NEXT:    vmv1r.v v28, v20
-; ZVBB-NEXT:    vmv1r.v v18, v23
-; ZVBB-NEXT:    add a7, a6, a2
-; ZVBB-NEXT:    vmv1r.v v29, v10
-; ZVBB-NEXT:    vmv1r.v v20, v9
-; ZVBB-NEXT:    vmv1r.v v0, v13
-; ZVBB-NEXT:    vmerge.vim v30, v14, 1, v0
-; ZVBB-NEXT:    vmv1r.v v22, v11
-; ZVBB-NEXT:    vsetvli t0, zero, e8, m1, ta, ma
-; ZVBB-NEXT:    vsseg7e8.v v24, (a4)
-; ZVBB-NEXT:    vmv1r.v v23, v31
-; ZVBB-NEXT:    vsseg7e8.v v17, (a0)
-; ZVBB-NEXT:    vl1r.v v8, (a6)
-; ZVBB-NEXT:    add a6, a7, a2
-; ZVBB-NEXT:    vl1r.v v10, (a4)
-; ZVBB-NEXT:    add a4, a6, a2
-; ZVBB-NEXT:    vl1r.v v12, (a6)
+; ZVBB-NEXT:    vmerge.vim v10, v20, 1, v0
+; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    vmv1r.v v20, v11
+; ZVBB-NEXT:    vsetvli a1, zero, e8, m1, ta, ma
+; ZVBB-NEXT:    vsseg6e8.v v15, (a0)
+; ZVBB-NEXT:    vmv1r.v v15, v22
+; ZVBB-NEXT:    add a4, a5, a2
+; ZVBB-NEXT:    vmv1r.v v16, v8
+; ZVBB-NEXT:    srli a1, a2, 2
+; ZVBB-NEXT:    vmv1r.v v17, v24
 ; ZVBB-NEXT:    add a6, a4, a2
-; ZVBB-NEXT:    vl1r.v v14, (a6)
-; ZVBB-NEXT:    add a6, a5, a2
+; ZVBB-NEXT:    vmv1r.v v18, v26
+; ZVBB-NEXT:    add a7, a3, a2
+; ZVBB-NEXT:    vmv1r.v v19, v10
+; ZVBB-NEXT:    vsseg6e8.v v14, (a5)
+; ZVBB-NEXT:    vl1r.v v8, (a0)
+; ZVBB-NEXT:    add a0, a6, a2
+; ZVBB-NEXT:    vl1r.v v10, (a6)
+; ZVBB-NEXT:    add a6, a7, a2
+; ZVBB-NEXT:    vl1r.v v12, (a5)
+; ZVBB-NEXT:    add a5, a0, a2
+; ZVBB-NEXT:    vl1r.v v14, (a7)
+; ZVBB-NEXT:    add a7, a6, a2
 ; ZVBB-NEXT:    vl1r.v v16, (a5)
-; ZVBB-NEXT:    add a5, a6, a2
-; ZVBB-NEXT:    vl1r.v v18, (a5)
 ; ZVBB-NEXT:    add a5, a5, a2
-; ZVBB-NEXT:    vl1r.v v9, (a7)
-; ZVBB-NEXT:    add a7, a5, a2
-; ZVBB-NEXT:    vl1r.v v20, (a7)
+; ZVBB-NEXT:    vl1r.v v18, (a7)
 ; ZVBB-NEXT:    add a7, a7, a2
 ; ZVBB-NEXT:    srli a2, a2, 1
-; ZVBB-NEXT:    vl1r.v v11, (a3)
+; ZVBB-NEXT:    vl1r.v v9, (a3)
 ; ZVBB-NEXT:    add a3, a1, a1
+; ZVBB-NEXT:    vl1r.v v17, (a5)
+; ZVBB-NEXT:    add a5, a2, a2
+; ZVBB-NEXT:    vl1r.v v11, (a0)
 ; ZVBB-NEXT:    vl1r.v v13, (a4)
-; ZVBB-NEXT:    add a4, a2, a2
-; ZVBB-NEXT:    vl1r.v v15, (a0)
-; ZVBB-NEXT:    vl1r.v v19, (a5)
-; ZVBB-NEXT:    vl1r.v v17, (a6)
-; ZVBB-NEXT:    vl1r.v v21, (a7)
+; ZVBB-NEXT:    vl1r.v v19, (a7)
+; ZVBB-NEXT:    vl1r.v v15, (a6)
 ; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
-; ZVBB-NEXT:    vmsne.vi v22, v8, 0
-; ZVBB-NEXT:    vmsne.vi v0, v10, 0
-; ZVBB-NEXT:    vmsne.vi v9, v12, 0
-; ZVBB-NEXT:    vmsne.vi v10, v14, 0
-; ZVBB-NEXT:    vmsne.vi v11, v18, 0
-; ZVBB-NEXT:    vmsne.vi v8, v16, 0
-; ZVBB-NEXT:    vmsne.vi v12, v20, 0
+; ZVBB-NEXT:    vmsne.vi v20, v8, 0
+; ZVBB-NEXT:    vmsne.vi v9, v16, 0
+; ZVBB-NEXT:    vmsne.vi v16, v10, 0
+; ZVBB-NEXT:    vmsne.vi v0, v12, 0
+; ZVBB-NEXT:    vmsne.vi v10, v18, 0
+; ZVBB-NEXT:    vmsne.vi v8, v14, 0
 ; ZVBB-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
-; ZVBB-NEXT:    vslideup.vx v0, v22, a1
-; ZVBB-NEXT:    vslideup.vx v9, v10, a1
-; ZVBB-NEXT:    vslideup.vx v8, v11, a1
-; ZVBB-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v9, v20, a1
+; ZVBB-NEXT:    vslideup.vx v0, v16, a1
+; ZVBB-NEXT:    vsetvli zero, a5, e8, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v0, v9, a2
-; ZVBB-NEXT:    vslideup.vx v8, v12, a2
+; ZVBB-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v10, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    li a1, 14
+; ZVBB-NEXT:    li a1, 12
 ; ZVBB-NEXT:    mul a0, a0, a1
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 112 x i1> @llvm.vector.interleave7.nxv112i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f, <vscale x 16 x i1> %g)
-  ret <vscale x 112 x i1> %res
+  %res = call <vscale x 96 x i1> @llvm.vector.interleave6.nxv96i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f)
+  ret <vscale x 96 x i1> %res
 }
 
-
-define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f, <vscale x 16 x i8> %g) nounwind {
+define <vscale x 96 x i8> @vector_interleave_nxv96i8_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f) nounwind {
 ;
-; RV32-LABEL: vector_interleave_nxv112i8_nxv16i8:
+; RV32-LABEL: vector_interleave_nxv96i8_nxv16i8:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -80
 ; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    addi s0, sp, 80
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 28
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV32-NEXT:    vmv2r.v v26, v20
-; RV32-NEXT:    addi a0, sp, 64
-; RV32-NEXT:    vmv2r.v v24, v16
+; RV32-NEXT:    vmv2r.v v20, v14
+; RV32-NEXT:    vmv2r.v v22, v12
+; RV32-NEXT:    vmv2r.v v24, v10
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 3
-; RV32-NEXT:    sub a1, a2, a1
+; RV32-NEXT:    li a0, 6
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 64
-; RV32-NEXT:    vmv2r.v v22, v12
+; RV32-NEXT:    vmv1r.v v10, v25
+; RV32-NEXT:    vmv1r.v v11, v23
+; RV32-NEXT:    vmv1r.v v12, v21
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vmv1r.v v13, v17
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    vmv2r.v v20, v8
-; RV32-NEXT:    vmv1r.v v1, v20
-; RV32-NEXT:    vmv1r.v v3, v22
-; RV32-NEXT:    vmv1r.v v5, v24
-; RV32-NEXT:    vmv1r.v v7, v26
+; RV32-NEXT:    vmv1r.v v14, v19
+; RV32-NEXT:    vsseg6e8.v v9, (a1)
+; RV32-NEXT:    vmv1r.v v9, v24
+; RV32-NEXT:    add a5, a1, a2
+; RV32-NEXT:    vmv1r.v v10, v22
 ; RV32-NEXT:    add a3, a0, a2
-; RV32-NEXT:    vmv1r.v v2, v10
-; RV32-NEXT:    add a4, a1, a2
-; RV32-NEXT:    slli a5, a2, 2
-; RV32-NEXT:    vmv1r.v v4, v14
-; RV32-NEXT:    slli a6, a2, 4
-; RV32-NEXT:    add a7, a4, a2
-; RV32-NEXT:    vmv1r.v v6, v18
-; RV32-NEXT:    sub a5, a6, a5
-; RV32-NEXT:    vmv1r.v v22, v11
-; RV32-NEXT:    add a6, a7, a2
-; RV32-NEXT:    vmv1r.v v24, v15
-; RV32-NEXT:    vsseg7e8.v v1, (a0)
-; RV32-NEXT:    vmv1r.v v26, v19
-; RV32-NEXT:    vsseg7e8.v v21, (a1)
-; RV32-NEXT:    vl1r.v v18, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1r.v v19, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1r.v v20, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1r.v v21, (a6)
-; RV32-NEXT:    add a6, a3, a2
-; RV32-NEXT:    vl1r.v v10, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1r.v v11, (a6)
-; RV32-NEXT:    vl1r.v v8, (a0)
-; RV32-NEXT:    vl1r.v v16, (a4)
-; RV32-NEXT:    vl1r.v v9, (a3)
-; RV32-NEXT:    vl1r.v v17, (a7)
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a3, 14
-; RV32-NEXT:    mul a0, a0, a3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 64
-; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vmv1r.v v11, v20
+; RV32-NEXT:    add a4, a3, a2
+; RV32-NEXT:    vmv1r.v v12, v16
+; RV32-NEXT:    add a6, a5, a2
+; RV32-NEXT:    vmv1r.v v13, v18
+; RV32-NEXT:    vsseg6e8.v v8, (a0)
+; RV32-NEXT:    vl1r.v v14, (a1)
+; RV32-NEXT:    add a1, a6, a2
+; RV32-NEXT:    vl1r.v v15, (a5)
+; RV32-NEXT:    add a5, a1, a2
+; RV32-NEXT:    vl1r.v v18, (a5)
+; RV32-NEXT:    add a5, a5, a2
+; RV32-NEXT:    vl1r.v v19, (a5)
+; RV32-NEXT:    add a5, a4, a2
+; RV32-NEXT:    vl1r.v v16, (a6)
+; RV32-NEXT:    add a6, a5, a2
 ; RV32-NEXT:    vl1r.v v12, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    vl1r.v v13, (a6)
-; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    li a7, 12
+; RV32-NEXT:    mul a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 64
+; RV32-NEXT:    vl1r.v v17, (a1)
+; RV32-NEXT:    vl1r.v v10, (a4)
+; RV32-NEXT:    vl1r.v v11, (a5)
+; RV32-NEXT:    vl1r.v v8, (a0)
+; RV32-NEXT:    vl1r.v v9, (a3)
 ; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, a0, a2
-; RV32-NEXT:    vl1r.v v14, (a6)
-; RV32-NEXT:    vl1r.v v15, (a1)
-; RV32-NEXT:    add a5, a0, a5
-; RV32-NEXT:    vs2r.v v20, (a5)
+; RV32-NEXT:    add a2, a6, a2
 ; RV32-NEXT:    vs4r.v v16, (a2)
-; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    vs8r.v v8, (a6)
 ; RV32-NEXT:    vl8r.v v16, (a2)
-; RV32-NEXT:    vl8r.v v8, (a0)
+; RV32-NEXT:    vl8r.v v8, (a6)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: vector_interleave_nxv112i8_nxv16i8:
+; RV64-LABEL: vector_interleave_nxv96i8_nxv16i8:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -80
 ; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    addi s0, sp, 80
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    li a1, 28
+; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; RV64-NEXT:    vmv2r.v v26, v20
-; RV64-NEXT:    addi a0, sp, 64
-; RV64-NEXT:    vmv2r.v v24, v16
+; RV64-NEXT:    vmv2r.v v20, v14
+; RV64-NEXT:    vmv2r.v v22, v12
+; RV64-NEXT:    vmv2r.v v24, v10
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    sub a1, a2, a1
+; RV64-NEXT:    li a0, 6
+; RV64-NEXT:    mul a1, a1, a0
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 64
-; RV64-NEXT:    vmv2r.v v22, v12
+; RV64-NEXT:    vmv1r.v v10, v25
+; RV64-NEXT:    vmv1r.v v11, v23
+; RV64-NEXT:    vmv1r.v v12, v21
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vmv1r.v v13, v17
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    vmv2r.v v20, v8
-; RV64-NEXT:    vmv1r.v v1, v20
-; RV64-NEXT:    vmv1r.v v3, v22
-; RV64-NEXT:    vmv1r.v v5, v24
-; RV64-NEXT:    vmv1r.v v7, v26
+; RV64-NEXT:    vmv1r.v v14, v19
+; RV64-NEXT:    vsseg6e8.v v9, (a1)
+; RV64-NEXT:    vmv1r.v v9, v24
+; RV64-NEXT:    add a5, a1, a2
+; RV64-NEXT:    vmv1r.v v10, v22
 ; RV64-NEXT:    add a3, a0, a2
-; RV64-NEXT:    vmv1r.v v2, v10
-; RV64-NEXT:    add a4, a1, a2
-; RV64-NEXT:    slli a5, a2, 2
-; RV64-NEXT:    vmv1r.v v4, v14
-; RV64-NEXT:    slli a6, a2, 4
-; RV64-NEXT:    add a7, a4, a2
-; RV64-NEXT:    vmv1r.v v6, v18
-; RV64-NEXT:    sub a5, a6, a5
-; RV64-NEXT:    vmv1r.v v22, v11
-; RV64-NEXT:    add a6, a7, a2
-; RV64-NEXT:    vmv1r.v v24, v15
-; RV64-NEXT:    vsseg7e8.v v1, (a0)
-; RV64-NEXT:    vmv1r.v v26, v19
-; RV64-NEXT:    vsseg7e8.v v21, (a1)
-; RV64-NEXT:    vl1r.v v18, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1r.v v19, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1r.v v20, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1r.v v21, (a6)
-; RV64-NEXT:    add a6, a3, a2
-; RV64-NEXT:    vl1r.v v10, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1r.v v11, (a6)
-; RV64-NEXT:    vl1r.v v8, (a0)
-; RV64-NEXT:    vl1r.v v16, (a4)
-; RV64-NEXT:    vl1r.v v9, (a3)
-; RV64-NEXT:    vl1r.v v17, (a7)
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a3, 14
-; RV64-NEXT:    mul a0, a0, a3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 64
-; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vmv1r.v v11, v20
+; RV64-NEXT:    add a4, a3, a2
+; RV64-NEXT:    vmv1r.v v12, v16
+; RV64-NEXT:    add a6, a5, a2
+; RV64-NEXT:    vmv1r.v v13, v18
+; RV64-NEXT:    vsseg6e8.v v8, (a0)
+; RV64-NEXT:    vl1r.v v14, (a1)
+; RV64-NEXT:    add a1, a6, a2
+; RV64-NEXT:    vl1r.v v15, (a5)
+; RV64-NEXT:    add a5, a1, a2
+; RV64-NEXT:    vl1r.v v18, (a5)
+; RV64-NEXT:    add a5, a5, a2
+; RV64-NEXT:    vl1r.v v19, (a5)
+; RV64-NEXT:    add a5, a4, a2
+; RV64-NEXT:    vl1r.v v16, (a6)
+; RV64-NEXT:    add a6, a5, a2
 ; RV64-NEXT:    vl1r.v v12, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    vl1r.v v13, (a6)
-; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    li a7, 12
+; RV64-NEXT:    mul a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 64
+; RV64-NEXT:    vl1r.v v17, (a1)
+; RV64-NEXT:    vl1r.v v10, (a4)
+; RV64-NEXT:    vl1r.v v11, (a5)
+; RV64-NEXT:    vl1r.v v8, (a0)
+; RV64-NEXT:    vl1r.v v9, (a3)
 ; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, a0, a2
-; RV64-NEXT:    vl1r.v v14, (a6)
-; RV64-NEXT:    vl1r.v v15, (a1)
-; RV64-NEXT:    add a5, a0, a5
-; RV64-NEXT:    vs2r.v v20, (a5)
+; RV64-NEXT:    add a2, a6, a2
 ; RV64-NEXT:    vs4r.v v16, (a2)
-; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    vs8r.v v8, (a6)
 ; RV64-NEXT:    vl8r.v v16, (a2)
-; RV64-NEXT:    vl8r.v v8, (a0)
+; RV64-NEXT:    vl8r.v v8, (a6)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 80
 ; RV64-NEXT:    ret
 ;
-; ZVBB-RV32-LABEL: vector_interleave_nxv112i8_nxv16i8:
+; ZVBB-RV32-LABEL: vector_interleave_nxv96i8_nxv16i8:
 ; ZVBB-RV32:       # %bb.0:
 ; ZVBB-RV32-NEXT:    addi sp, sp, -80
 ; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
 ; ZVBB-RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
 ; ZVBB-RV32-NEXT:    addi s0, sp, 80
 ; ZVBB-RV32-NEXT:    csrr a0, vlenb
-; ZVBB-RV32-NEXT:    slli a0, a0, 5
+; ZVBB-RV32-NEXT:    li a1, 28
+; ZVBB-RV32-NEXT:    mul a0, a0, a1
 ; ZVBB-RV32-NEXT:    sub sp, sp, a0
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
 ; ZVBB-RV32-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; ZVBB-RV32-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV32-NEXT:    addi a0, sp, 64
-; ZVBB-RV32-NEXT:    vmv2r.v v24, v16
+; ZVBB-RV32-NEXT:    vmv2r.v v20, v14
+; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV32-NEXT:    vmv2r.v v24, v10
 ; ZVBB-RV32-NEXT:    csrr a1, vlenb
-; ZVBB-RV32-NEXT:    slli a2, a1, 3
-; ZVBB-RV32-NEXT:    sub a1, a2, a1
+; ZVBB-RV32-NEXT:    li a0, 6
+; ZVBB-RV32-NEXT:    mul a1, a1, a0
 ; ZVBB-RV32-NEXT:    add a1, sp, a1
 ; ZVBB-RV32-NEXT:    addi a1, a1, 64
-; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV32-NEXT:    vmv1r.v v10, v25
+; ZVBB-RV32-NEXT:    vmv1r.v v11, v23
+; ZVBB-RV32-NEXT:    vmv1r.v v12, v21
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
+; ZVBB-RV32-NEXT:    vmv1r.v v13, v17
 ; ZVBB-RV32-NEXT:    csrr a2, vlenb
-; ZVBB-RV32-NEXT:    vmv2r.v v20, v8
-; ZVBB-RV32-NEXT:    vmv1r.v v1, v20
-; ZVBB-RV32-NEXT:    vmv1r.v v3, v22
-; ZVBB-RV32-NEXT:    vmv1r.v v5, v24
-; ZVBB-RV32-NEXT:    vmv1r.v v7, v26
+; ZVBB-RV32-NEXT:    vmv1r.v v14, v19
+; ZVBB-RV32-NEXT:    vsseg6e8.v v9, (a1)
+; ZVBB-RV32-NEXT:    vmv1r.v v9, v24
+; ZVBB-RV32-NEXT:    add a5, a1, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v10, v22
 ; ZVBB-RV32-NEXT:    add a3, a0, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV32-NEXT:    add a4, a1, a2
-; ZVBB-RV32-NEXT:    slli a5, a2, 2
-; ZVBB-RV32-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV32-NEXT:    slli a6, a2, 4
-; ZVBB-RV32-NEXT:    add a7, a4, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV32-NEXT:    sub a5, a6, a5
-; ZVBB-RV32-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV32-NEXT:    add a6, a7, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV32-NEXT:    vsseg7e8.v v1, (a0)
-; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV32-NEXT:    vsseg7e8.v v21, (a1)
-; ZVBB-RV32-NEXT:    vl1r.v v18, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1r.v v19, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1r.v v20, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1r.v v21, (a6)
-; ZVBB-RV32-NEXT:    add a6, a3, a2
-; ZVBB-RV32-NEXT:    vl1r.v v10, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1r.v v11, (a6)
-; ZVBB-RV32-NEXT:    vl1r.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl1r.v v16, (a4)
-; ZVBB-RV32-NEXT:    vl1r.v v9, (a3)
-; ZVBB-RV32-NEXT:    vl1r.v v17, (a7)
-; ZVBB-RV32-NEXT:    csrr a0, vlenb
-; ZVBB-RV32-NEXT:    li a3, 14
-; ZVBB-RV32-NEXT:    mul a0, a0, a3
-; ZVBB-RV32-NEXT:    add a0, sp, a0
-; ZVBB-RV32-NEXT:    addi a0, a0, 64
-; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v11, v20
+; ZVBB-RV32-NEXT:    add a4, a3, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v12, v16
+; ZVBB-RV32-NEXT:    add a6, a5, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v13, v18
+; ZVBB-RV32-NEXT:    vsseg6e8.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1r.v v14, (a1)
+; ZVBB-RV32-NEXT:    add a1, a6, a2
+; ZVBB-RV32-NEXT:    vl1r.v v15, (a5)
+; ZVBB-RV32-NEXT:    add a5, a1, a2
+; ZVBB-RV32-NEXT:    vl1r.v v18, (a5)
+; ZVBB-RV32-NEXT:    add a5, a5, a2
+; ZVBB-RV32-NEXT:    vl1r.v v19, (a5)
+; ZVBB-RV32-NEXT:    add a5, a4, a2
+; ZVBB-RV32-NEXT:    vl1r.v v16, (a6)
+; ZVBB-RV32-NEXT:    add a6, a5, a2
 ; ZVBB-RV32-NEXT:    vl1r.v v12, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    vl1r.v v13, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    csrr a6, vlenb
+; ZVBB-RV32-NEXT:    li a7, 12
+; ZVBB-RV32-NEXT:    mul a6, a6, a7
+; ZVBB-RV32-NEXT:    add a6, sp, a6
+; ZVBB-RV32-NEXT:    addi a6, a6, 64
+; ZVBB-RV32-NEXT:    vl1r.v v17, (a1)
+; ZVBB-RV32-NEXT:    vl1r.v v10, (a4)
+; ZVBB-RV32-NEXT:    vl1r.v v11, (a5)
+; ZVBB-RV32-NEXT:    vl1r.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1r.v v9, (a3)
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
-; ZVBB-RV32-NEXT:    add a2, a0, a2
-; ZVBB-RV32-NEXT:    vl1r.v v14, (a6)
-; ZVBB-RV32-NEXT:    vl1r.v v15, (a1)
-; ZVBB-RV32-NEXT:    add a5, a0, a5
-; ZVBB-RV32-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT:    add a2, a6, a2
 ; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
-; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a6)
 ; ZVBB-RV32-NEXT:    vl8r.v v16, (a2)
-; ZVBB-RV32-NEXT:    vl8r.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8r.v v8, (a6)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
 ; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    addi sp, sp, 80
 ; ZVBB-RV32-NEXT:    ret
 ;
-; ZVBB-RV64-LABEL: vector_interleave_nxv112i8_nxv16i8:
+; ZVBB-RV64-LABEL: vector_interleave_nxv96i8_nxv16i8:
 ; ZVBB-RV64:       # %bb.0:
 ; ZVBB-RV64-NEXT:    addi sp, sp, -80
 ; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; ZVBB-RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; ZVBB-RV64-NEXT:    addi s0, sp, 80
 ; ZVBB-RV64-NEXT:    csrr a0, vlenb
-; ZVBB-RV64-NEXT:    slli a0, a0, 5
+; ZVBB-RV64-NEXT:    li a1, 28
+; ZVBB-RV64-NEXT:    mul a0, a0, a1
 ; ZVBB-RV64-NEXT:    sub sp, sp, a0
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
 ; ZVBB-RV64-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; ZVBB-RV64-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV64-NEXT:    addi a0, sp, 64
-; ZVBB-RV64-NEXT:    vmv2r.v v24, v16
+; ZVBB-RV64-NEXT:    vmv2r.v v20, v14
+; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV64-NEXT:    vmv2r.v v24, v10
 ; ZVBB-RV64-NEXT:    csrr a1, vlenb
-; ZVBB-RV64-NEXT:    slli a2, a1, 3
-; ZVBB-RV64-NEXT:    sub a1, a2, a1
+; ZVBB-RV64-NEXT:    li a0, 6
+; ZVBB-RV64-NEXT:    mul a1, a1, a0
 ; ZVBB-RV64-NEXT:    add a1, sp, a1
 ; ZVBB-RV64-NEXT:    addi a1, a1, 64
-; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV64-NEXT:    vmv1r.v v10, v25
+; ZVBB-RV64-NEXT:    vmv1r.v v11, v23
+; ZVBB-RV64-NEXT:    vmv1r.v v12, v21
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
+; ZVBB-RV64-NEXT:    vmv1r.v v13, v17
 ; ZVBB-RV64-NEXT:    csrr a2, vlenb
-; ZVBB-RV64-NEXT:    vmv2r.v v20, v8
-; ZVBB-RV64-NEXT:    vmv1r.v v1, v20
-; ZVBB-RV64-NEXT:    vmv1r.v v3, v22
-; ZVBB-RV64-NEXT:    vmv1r.v v5, v24
-; ZVBB-RV64-NEXT:    vmv1r.v v7, v26
+; ZVBB-RV64-NEXT:    vmv1r.v v14, v19
+; ZVBB-RV64-NEXT:    vsseg6e8.v v9, (a1)
+; ZVBB-RV64-NEXT:    vmv1r.v v9, v24
+; ZVBB-RV64-NEXT:    add a5, a1, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v10, v22
 ; ZVBB-RV64-NEXT:    add a3, a0, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV64-NEXT:    add a4, a1, a2
-; ZVBB-RV64-NEXT:    slli a5, a2, 2
-; ZVBB-RV64-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV64-NEXT:    slli a6, a2, 4
-; ZVBB-RV64-NEXT:    add a7, a4, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV64-NEXT:    sub a5, a6, a5
-; ZVBB-RV64-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV64-NEXT:    add a6, a7, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV64-NEXT:    vsseg7e8.v v1, (a0)
-; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV64-NEXT:    vsseg7e8.v v21, (a1)
-; ZVBB-RV64-NEXT:    vl1r.v v18, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1r.v v19, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1r.v v20, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1r.v v21, (a6)
-; ZVBB-RV64-NEXT:    add a6, a3, a2
-; ZVBB-RV64-NEXT:    vl1r.v v10, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1r.v v11, (a6)
-; ZVBB-RV64-NEXT:    vl1r.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl1r.v v16, (a4)
-; ZVBB-RV64-NEXT:    vl1r.v v9, (a3)
-; ZVBB-RV64-NEXT:    vl1r.v v17, (a7)
-; ZVBB-RV64-NEXT:    csrr a0, vlenb
-; ZVBB-RV64-NEXT:    li a3, 14
-; ZVBB-RV64-NEXT:    mul a0, a0, a3
-; ZVBB-RV64-NEXT:    add a0, sp, a0
-; ZVBB-RV64-NEXT:    addi a0, a0, 64
-; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v11, v20
+; ZVBB-RV64-NEXT:    add a4, a3, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v12, v16
+; ZVBB-RV64-NEXT:    add a6, a5, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v13, v18
+; ZVBB-RV64-NEXT:    vsseg6e8.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1r.v v14, (a1)
+; ZVBB-RV64-NEXT:    add a1, a6, a2
+; ZVBB-RV64-NEXT:    vl1r.v v15, (a5)
+; ZVBB-RV64-NEXT:    add a5, a1, a2
+; ZVBB-RV64-NEXT:    vl1r.v v18, (a5)
+; ZVBB-RV64-NEXT:    add a5, a5, a2
+; ZVBB-RV64-NEXT:    vl1r.v v19, (a5)
+; ZVBB-RV64-NEXT:    add a5, a4, a2
+; ZVBB-RV64-NEXT:    vl1r.v v16, (a6)
+; ZVBB-RV64-NEXT:    add a6, a5, a2
 ; ZVBB-RV64-NEXT:    vl1r.v v12, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    vl1r.v v13, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    csrr a6, vlenb
+; ZVBB-RV64-NEXT:    li a7, 12
+; ZVBB-RV64-NEXT:    mul a6, a6, a7
+; ZVBB-RV64-NEXT:    add a6, sp, a6
+; ZVBB-RV64-NEXT:    addi a6, a6, 64
+; ZVBB-RV64-NEXT:    vl1r.v v17, (a1)
+; ZVBB-RV64-NEXT:    vl1r.v v10, (a4)
+; ZVBB-RV64-NEXT:    vl1r.v v11, (a5)
+; ZVBB-RV64-NEXT:    vl1r.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1r.v v9, (a3)
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
-; ZVBB-RV64-NEXT:    add a2, a0, a2
-; ZVBB-RV64-NEXT:    vl1r.v v14, (a6)
-; ZVBB-RV64-NEXT:    vl1r.v v15, (a1)
-; ZVBB-RV64-NEXT:    add a5, a0, a5
-; ZVBB-RV64-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT:    add a2, a6, a2
 ; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
-; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a6)
 ; ZVBB-RV64-NEXT:    vl8r.v v16, (a2)
-; ZVBB-RV64-NEXT:    vl8r.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8r.v v8, (a6)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
 ; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    addi sp, sp, 80
 ; ZVBB-RV64-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv112i8_nxv16i8:
+; ZIP-LABEL: vector_interleave_nxv96i8_nxv16i8:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -80
 ; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; ZIP-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; ZIP-NEXT:    addi s0, sp, 80
 ; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    slli a0, a0, 5
+; ZIP-NEXT:    li a1, 28
+; ZIP-NEXT:    mul a0, a0, a1
 ; ZIP-NEXT:    sub sp, sp, a0
 ; ZIP-NEXT:    andi sp, sp, -64
 ; ZIP-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
-; ZIP-NEXT:    vmv2r.v v26, v20
-; ZIP-NEXT:    addi a0, sp, 64
-; ZIP-NEXT:    vmv2r.v v24, v16
+; ZIP-NEXT:    vmv2r.v v20, v14
+; ZIP-NEXT:    vmv2r.v v22, v12
+; ZIP-NEXT:    vmv2r.v v24, v10
 ; ZIP-NEXT:    csrr a1, vlenb
-; ZIP-NEXT:    slli a2, a1, 3
-; ZIP-NEXT:    sub a1, a2, a1
+; ZIP-NEXT:    li a0, 6
+; ZIP-NEXT:    mul a1, a1, a0
 ; ZIP-NEXT:    add a1, sp, a1
 ; ZIP-NEXT:    addi a1, a1, 64
-; ZIP-NEXT:    vmv2r.v v22, v12
+; ZIP-NEXT:    vmv1r.v v10, v25
+; ZIP-NEXT:    vmv1r.v v11, v23
+; ZIP-NEXT:    vmv1r.v v12, v21
+; ZIP-NEXT:    addi a0, sp, 64
+; ZIP-NEXT:    vmv1r.v v13, v17
 ; ZIP-NEXT:    csrr a2, vlenb
-; ZIP-NEXT:    vmv2r.v v20, v8
-; ZIP-NEXT:    vmv1r.v v1, v20
-; ZIP-NEXT:    vmv1r.v v3, v22
-; ZIP-NEXT:    vmv1r.v v5, v24
-; ZIP-NEXT:    vmv1r.v v7, v26
+; ZIP-NEXT:    vmv1r.v v14, v19
+; ZIP-NEXT:    vsseg6e8.v v9, (a1)
+; ZIP-NEXT:    vmv1r.v v9, v24
+; ZIP-NEXT:    add a5, a1, a2
+; ZIP-NEXT:    vmv1r.v v10, v22
 ; ZIP-NEXT:    add a3, a0, a2
-; ZIP-NEXT:    vmv1r.v v2, v10
-; ZIP-NEXT:    add a4, a1, a2
-; ZIP-NEXT:    slli a5, a2, 2
-; ZIP-NEXT:    vmv1r.v v4, v14
-; ZIP-NEXT:    slli a6, a2, 4
-; ZIP-NEXT:    add a7, a4, a2
-; ZIP-NEXT:    vmv1r.v v6, v18
-; ZIP-NEXT:    sub a5, a6, a5
-; ZIP-NEXT:    vmv1r.v v22, v11
-; ZIP-NEXT:    add a6, a7, a2
-; ZIP-NEXT:    vmv1r.v v24, v15
-; ZIP-NEXT:    vsseg7e8.v v1, (a0)
-; ZIP-NEXT:    vmv1r.v v26, v19
-; ZIP-NEXT:    vsseg7e8.v v21, (a1)
-; ZIP-NEXT:    vl1r.v v18, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1r.v v19, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1r.v v20, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1r.v v21, (a6)
-; ZIP-NEXT:    add a6, a3, a2
-; ZIP-NEXT:    vl1r.v v10, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1r.v v11, (a6)
-; ZIP-NEXT:    vl1r.v v8, (a0)
-; ZIP-NEXT:    vl1r.v v16, (a4)
-; ZIP-NEXT:    vl1r.v v9, (a3)
-; ZIP-NEXT:    vl1r.v v17, (a7)
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    li a3, 14
-; ZIP-NEXT:    mul a0, a0, a3
-; ZIP-NEXT:    add a0, sp, a0
-; ZIP-NEXT:    addi a0, a0, 64
-; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vmv1r.v v11, v20
+; ZIP-NEXT:    add a4, a3, a2
+; ZIP-NEXT:    vmv1r.v v12, v16
+; ZIP-NEXT:    add a6, a5, a2
+; ZIP-NEXT:    vmv1r.v v13, v18
+; ZIP-NEXT:    vsseg6e8.v v8, (a0)
+; ZIP-NEXT:    vl1r.v v14, (a1)
+; ZIP-NEXT:    add a1, a6, a2
+; ZIP-NEXT:    vl1r.v v15, (a5)
+; ZIP-NEXT:    add a5, a1, a2
+; ZIP-NEXT:    vl1r.v v18, (a5)
+; ZIP-NEXT:    add a5, a5, a2
+; ZIP-NEXT:    vl1r.v v19, (a5)
+; ZIP-NEXT:    add a5, a4, a2
+; ZIP-NEXT:    vl1r.v v16, (a6)
+; ZIP-NEXT:    add a6, a5, a2
 ; ZIP-NEXT:    vl1r.v v12, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
 ; ZIP-NEXT:    vl1r.v v13, (a6)
-; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    csrr a6, vlenb
+; ZIP-NEXT:    li a7, 12
+; ZIP-NEXT:    mul a6, a6, a7
+; ZIP-NEXT:    add a6, sp, a6
+; ZIP-NEXT:    addi a6, a6, 64
+; ZIP-NEXT:    vl1r.v v17, (a1)
+; ZIP-NEXT:    vl1r.v v10, (a4)
+; ZIP-NEXT:    vl1r.v v11, (a5)
+; ZIP-NEXT:    vl1r.v v8, (a0)
+; ZIP-NEXT:    vl1r.v v9, (a3)
 ; ZIP-NEXT:    slli a2, a2, 3
-; ZIP-NEXT:    add a2, a0, a2
-; ZIP-NEXT:    vl1r.v v14, (a6)
-; ZIP-NEXT:    vl1r.v v15, (a1)
-; ZIP-NEXT:    add a5, a0, a5
-; ZIP-NEXT:    vs2r.v v20, (a5)
+; ZIP-NEXT:    add a2, a6, a2
 ; ZIP-NEXT:    vs4r.v v16, (a2)
-; ZIP-NEXT:    vs8r.v v8, (a0)
+; ZIP-NEXT:    vs8r.v v8, (a6)
 ; ZIP-NEXT:    vl8r.v v16, (a2)
-; ZIP-NEXT:    vl8r.v v8, (a0)
+; ZIP-NEXT:    vl8r.v v8, (a6)
 ; ZIP-NEXT:    addi sp, s0, -80
 ; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    addi sp, sp, 80
 ; ZIP-NEXT:    ret
-  %res = call <vscale x 112 x i8> @llvm.vector.interleave7.nxv112i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f, <vscale x 16 x i8> %g)
-  ret <vscale x 112 x i8> %res
+  %res = call <vscale x 96 x i8> @llvm.vector.interleave6.nxv96i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f)
+  ret <vscale x 96 x i8> %res
 }
 
-
-define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i16> %d, <vscale x 8 x i16> %e, <vscale x 8 x i16> %f, <vscale x 8 x i16> %g) nounwind {
+define <vscale x 48 x i8> @vector_interleave_nxv48i8_nxv8i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, <vscale x 8 x i8> %c, <vscale x 8 x i8> %d, <vscale x 8 x i8> %e, <vscale x 8 x i8> %f) nounwind {
+; CHECK-LABEL: vector_interleave_nxv48i8_nxv8i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vsseg6e8.v v8, (a0)
+; CHECK-NEXT:    vl1r.v v10, (a3)
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    vl1r.v v11, (a3)
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    vl1r.v v8, (a0)
+; CHECK-NEXT:    vl1r.v v9, (a2)
+; CHECK-NEXT:    vl1r.v v12, (a3)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl1r.v v13, (a1)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; RV32-LABEL: vector_interleave_nxv56i16_nxv8i16:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -80
-; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
-; RV32-NEXT:    addi s0, sp, 80
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    sub sp, sp, a0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV32-NEXT:    vmv2r.v v26, v20
-; RV32-NEXT:    addi a0, sp, 64
-; RV32-NEXT:    vmv2r.v v24, v16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 3
-; RV32-NEXT:    sub a1, a2, a1
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 64
-; RV32-NEXT:    vmv2r.v v22, v12
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    vmv2r.v v20, v8
-; RV32-NEXT:    vmv1r.v v1, v20
-; RV32-NEXT:    vmv1r.v v3, v22
-; RV32-NEXT:    vmv1r.v v5, v24
-; RV32-NEXT:    vmv1r.v v7, v26
-; RV32-NEXT:    add a3, a0, a2
-; RV32-NEXT:    vmv1r.v v2, v10
-; RV32-NEXT:    add a4, a1, a2
-; RV32-NEXT:    slli a5, a2, 2
-; RV32-NEXT:    vmv1r.v v4, v14
-; RV32-NEXT:    slli a6, a2, 4
-; RV32-NEXT:    add a7, a4, a2
-; RV32-NEXT:    vmv1r.v v6, v18
-; RV32-NEXT:    sub a5, a6, a5
-; RV32-NEXT:    vmv1r.v v22, v11
-; RV32-NEXT:    add a6, a7, a2
-; RV32-NEXT:    vmv1r.v v24, v15
-; RV32-NEXT:    vsseg7e16.v v1, (a0)
-; RV32-NEXT:    vmv1r.v v26, v19
-; RV32-NEXT:    vsseg7e16.v v21, (a1)
-; RV32-NEXT:    vl1re16.v v18, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v19, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v20, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v21, (a6)
-; RV32-NEXT:    add a6, a3, a2
-; RV32-NEXT:    vl1re16.v v10, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v11, (a6)
-; RV32-NEXT:    vl1re16.v v8, (a0)
-; RV32-NEXT:    vl1re16.v v16, (a4)
-; RV32-NEXT:    vl1re16.v v9, (a3)
-; RV32-NEXT:    vl1re16.v v17, (a7)
+; ZVBB-LABEL: vector_interleave_nxv48i8_nxv8i8:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e8, m1, ta, ma
+; ZVBB-NEXT:    vsseg6e8.v v8, (a0)
+; ZVBB-NEXT:    vl1r.v v10, (a3)
+; ZVBB-NEXT:    add a3, a3, a1
+; ZVBB-NEXT:    vl1r.v v11, (a3)
+; ZVBB-NEXT:    add a3, a3, a1
+; ZVBB-NEXT:    vl1r.v v8, (a0)
+; ZVBB-NEXT:    vl1r.v v9, (a2)
+; ZVBB-NEXT:    vl1r.v v12, (a3)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl1r.v v13, (a1)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 48 x i8> @llvm.vector.interleave6.nxv48i8(<vscale x 8 x i8> %a, <vscale x 8 x i8> %b, <vscale x 8 x i8> %c, <vscale x 8 x i8> %d, <vscale x 8 x i8> %e, <vscale x 8 x i8> %f)
+  ret <vscale x 48 x i8> %res
+}
+
+define <vscale x 24 x i32> @vector_interleave_nxv24i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv24i32_nxv4i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -80
+; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi s0, sp, 80
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a3, 14
-; RV32-NEXT:    mul a0, a0, a3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 64
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v12, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v13, (a6)
+; RV32-NEXT:    li a1, 28
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv2r.v v20, v14
+; RV32-NEXT:    vmv2r.v v22, v12
+; RV32-NEXT:    vmv2r.v v24, v10
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a0, 6
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    vmv1r.v v10, v25
+; RV32-NEXT:    vmv1r.v v11, v23
+; RV32-NEXT:    vmv1r.v v12, v21
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vmv1r.v v13, v17
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    vmv1r.v v14, v19
+; RV32-NEXT:    vsseg6e32.v v9, (a1)
+; RV32-NEXT:    vmv1r.v v9, v24
+; RV32-NEXT:    add a5, a1, a2
+; RV32-NEXT:    vmv1r.v v10, v22
+; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    vmv1r.v v11, v20
+; RV32-NEXT:    add a4, a3, a2
+; RV32-NEXT:    vmv1r.v v12, v16
+; RV32-NEXT:    add a6, a5, a2
+; RV32-NEXT:    vmv1r.v v13, v18
+; RV32-NEXT:    vsseg6e32.v v8, (a0)
+; RV32-NEXT:    vl1re32.v v14, (a1)
+; RV32-NEXT:    add a1, a6, a2
+; RV32-NEXT:    vl1re32.v v15, (a5)
+; RV32-NEXT:    add a5, a1, a2
+; RV32-NEXT:    vl1re32.v v18, (a5)
+; RV32-NEXT:    add a5, a5, a2
+; RV32-NEXT:    vl1re32.v v19, (a5)
+; RV32-NEXT:    add a5, a4, a2
+; RV32-NEXT:    vl1re32.v v16, (a6)
+; RV32-NEXT:    add a6, a5, a2
+; RV32-NEXT:    vl1re32.v v12, (a6)
 ; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re32.v v13, (a6)
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    li a7, 12
+; RV32-NEXT:    mul a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 64
+; RV32-NEXT:    vl1re32.v v17, (a1)
+; RV32-NEXT:    vl1re32.v v10, (a4)
+; RV32-NEXT:    vl1re32.v v11, (a5)
+; RV32-NEXT:    vl1re32.v v8, (a0)
+; RV32-NEXT:    vl1re32.v v9, (a3)
 ; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, a0, a2
-; RV32-NEXT:    vl1re16.v v14, (a6)
-; RV32-NEXT:    vl1re16.v v15, (a1)
-; RV32-NEXT:    add a5, a0, a5
-; RV32-NEXT:    vs2r.v v20, (a5)
+; RV32-NEXT:    add a2, a6, a2
 ; RV32-NEXT:    vs4r.v v16, (a2)
-; RV32-NEXT:    vs8r.v v8, (a0)
-; RV32-NEXT:    vl8re16.v v16, (a2)
-; RV32-NEXT:    vl8re16.v v8, (a0)
+; RV32-NEXT:    vs8r.v v8, (a6)
+; RV32-NEXT:    vl8re32.v v16, (a2)
+; RV32-NEXT:    vl8re32.v v8, (a6)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; RV64-LABEL: vector_interleave_nxv24i32_nxv4i32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -80
 ; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    addi s0, sp, 80
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    li a1, 28
+; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV64-NEXT:    vmv2r.v v26, v20
-; RV64-NEXT:    addi a0, sp, 64
-; RV64-NEXT:    vmv2r.v v24, v16
+; RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV64-NEXT:    vmv2r.v v20, v14
+; RV64-NEXT:    vmv2r.v v22, v12
+; RV64-NEXT:    vmv2r.v v24, v10
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    sub a1, a2, a1
+; RV64-NEXT:    li a0, 6
+; RV64-NEXT:    mul a1, a1, a0
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 64
-; RV64-NEXT:    vmv2r.v v22, v12
+; RV64-NEXT:    vmv1r.v v10, v25
+; RV64-NEXT:    vmv1r.v v11, v23
+; RV64-NEXT:    vmv1r.v v12, v21
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vmv1r.v v13, v17
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    vmv2r.v v20, v8
-; RV64-NEXT:    vmv1r.v v1, v20
-; RV64-NEXT:    vmv1r.v v3, v22
-; RV64-NEXT:    vmv1r.v v5, v24
-; RV64-NEXT:    vmv1r.v v7, v26
+; RV64-NEXT:    vmv1r.v v14, v19
+; RV64-NEXT:    vsseg6e32.v v9, (a1)
+; RV64-NEXT:    vmv1r.v v9, v24
+; RV64-NEXT:    add a5, a1, a2
+; RV64-NEXT:    vmv1r.v v10, v22
 ; RV64-NEXT:    add a3, a0, a2
-; RV64-NEXT:    vmv1r.v v2, v10
-; RV64-NEXT:    add a4, a1, a2
-; RV64-NEXT:    slli a5, a2, 2
-; RV64-NEXT:    vmv1r.v v4, v14
-; RV64-NEXT:    slli a6, a2, 4
-; RV64-NEXT:    add a7, a4, a2
-; RV64-NEXT:    vmv1r.v v6, v18
-; RV64-NEXT:    sub a5, a6, a5
-; RV64-NEXT:    vmv1r.v v22, v11
-; RV64-NEXT:    add a6, a7, a2
-; RV64-NEXT:    vmv1r.v v24, v15
-; RV64-NEXT:    vsseg7e16.v v1, (a0)
-; RV64-NEXT:    vmv1r.v v26, v19
-; RV64-NEXT:    vsseg7e16.v v21, (a1)
-; RV64-NEXT:    vl1re16.v v18, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v19, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v20, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v21, (a6)
-; RV64-NEXT:    add a6, a3, a2
-; RV64-NEXT:    vl1re16.v v10, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v11, (a6)
-; RV64-NEXT:    vl1re16.v v8, (a0)
-; RV64-NEXT:    vl1re16.v v16, (a4)
-; RV64-NEXT:    vl1re16.v v9, (a3)
-; RV64-NEXT:    vl1re16.v v17, (a7)
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a3, 14
-; RV64-NEXT:    mul a0, a0, a3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 64
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v12, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v13, (a6)
+; RV64-NEXT:    vmv1r.v v11, v20
+; RV64-NEXT:    add a4, a3, a2
+; RV64-NEXT:    vmv1r.v v12, v16
+; RV64-NEXT:    add a6, a5, a2
+; RV64-NEXT:    vmv1r.v v13, v18
+; RV64-NEXT:    vsseg6e32.v v8, (a0)
+; RV64-NEXT:    vl1re32.v v14, (a1)
+; RV64-NEXT:    add a1, a6, a2
+; RV64-NEXT:    vl1re32.v v15, (a5)
+; RV64-NEXT:    add a5, a1, a2
+; RV64-NEXT:    vl1re32.v v18, (a5)
+; RV64-NEXT:    add a5, a5, a2
+; RV64-NEXT:    vl1re32.v v19, (a5)
+; RV64-NEXT:    add a5, a4, a2
+; RV64-NEXT:    vl1re32.v v16, (a6)
+; RV64-NEXT:    add a6, a5, a2
+; RV64-NEXT:    vl1re32.v v12, (a6)
 ; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re32.v v13, (a6)
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    li a7, 12
+; RV64-NEXT:    mul a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 64
+; RV64-NEXT:    vl1re32.v v17, (a1)
+; RV64-NEXT:    vl1re32.v v10, (a4)
+; RV64-NEXT:    vl1re32.v v11, (a5)
+; RV64-NEXT:    vl1re32.v v8, (a0)
+; RV64-NEXT:    vl1re32.v v9, (a3)
 ; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, a0, a2
-; RV64-NEXT:    vl1re16.v v14, (a6)
-; RV64-NEXT:    vl1re16.v v15, (a1)
-; RV64-NEXT:    add a5, a0, a5
-; RV64-NEXT:    vs2r.v v20, (a5)
+; RV64-NEXT:    add a2, a6, a2
 ; RV64-NEXT:    vs4r.v v16, (a2)
-; RV64-NEXT:    vs8r.v v8, (a0)
-; RV64-NEXT:    vl8re16.v v16, (a2)
-; RV64-NEXT:    vl8re16.v v8, (a0)
+; RV64-NEXT:    vs8r.v v8, (a6)
+; RV64-NEXT:    vl8re32.v v16, (a2)
+; RV64-NEXT:    vl8re32.v v8, (a6)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 80
 ; RV64-NEXT:    ret
 ;
-; ZVBB-RV32-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; ZVBB-RV32-LABEL: vector_interleave_nxv24i32_nxv4i32:
 ; ZVBB-RV32:       # %bb.0:
 ; ZVBB-RV32-NEXT:    addi sp, sp, -80
 ; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
 ; ZVBB-RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
 ; ZVBB-RV32-NEXT:    addi s0, sp, 80
 ; ZVBB-RV32-NEXT:    csrr a0, vlenb
-; ZVBB-RV32-NEXT:    slli a0, a0, 5
+; ZVBB-RV32-NEXT:    li a1, 28
+; ZVBB-RV32-NEXT:    mul a0, a0, a1
 ; ZVBB-RV32-NEXT:    sub sp, sp, a0
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
-; ZVBB-RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-RV32-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV32-NEXT:    addi a0, sp, 64
-; ZVBB-RV32-NEXT:    vmv2r.v v24, v16
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV32-NEXT:    vmv2r.v v20, v14
+; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV32-NEXT:    vmv2r.v v24, v10
 ; ZVBB-RV32-NEXT:    csrr a1, vlenb
-; ZVBB-RV32-NEXT:    slli a2, a1, 3
-; ZVBB-RV32-NEXT:    sub a1, a2, a1
+; ZVBB-RV32-NEXT:    li a0, 6
+; ZVBB-RV32-NEXT:    mul a1, a1, a0
 ; ZVBB-RV32-NEXT:    add a1, sp, a1
 ; ZVBB-RV32-NEXT:    addi a1, a1, 64
-; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV32-NEXT:    vmv1r.v v10, v25
+; ZVBB-RV32-NEXT:    vmv1r.v v11, v23
+; ZVBB-RV32-NEXT:    vmv1r.v v12, v21
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
+; ZVBB-RV32-NEXT:    vmv1r.v v13, v17
 ; ZVBB-RV32-NEXT:    csrr a2, vlenb
-; ZVBB-RV32-NEXT:    vmv2r.v v20, v8
-; ZVBB-RV32-NEXT:    vmv1r.v v1, v20
-; ZVBB-RV32-NEXT:    vmv1r.v v3, v22
-; ZVBB-RV32-NEXT:    vmv1r.v v5, v24
-; ZVBB-RV32-NEXT:    vmv1r.v v7, v26
+; ZVBB-RV32-NEXT:    vmv1r.v v14, v19
+; ZVBB-RV32-NEXT:    vsseg6e32.v v9, (a1)
+; ZVBB-RV32-NEXT:    vmv1r.v v9, v24
+; ZVBB-RV32-NEXT:    add a5, a1, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v10, v22
 ; ZVBB-RV32-NEXT:    add a3, a0, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV32-NEXT:    add a4, a1, a2
-; ZVBB-RV32-NEXT:    slli a5, a2, 2
-; ZVBB-RV32-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV32-NEXT:    slli a6, a2, 4
-; ZVBB-RV32-NEXT:    add a7, a4, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV32-NEXT:    sub a5, a6, a5
-; ZVBB-RV32-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV32-NEXT:    add a6, a7, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV32-NEXT:    vsseg7e16.v v1, (a0)
-; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV32-NEXT:    vsseg7e16.v v21, (a1)
-; ZVBB-RV32-NEXT:    vl1re16.v v18, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v19, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v20, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v21, (a6)
-; ZVBB-RV32-NEXT:    add a6, a3, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v10, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v11, (a6)
-; ZVBB-RV32-NEXT:    vl1re16.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl1re16.v v16, (a4)
-; ZVBB-RV32-NEXT:    vl1re16.v v9, (a3)
-; ZVBB-RV32-NEXT:    vl1re16.v v17, (a7)
-; ZVBB-RV32-NEXT:    csrr a0, vlenb
-; ZVBB-RV32-NEXT:    li a3, 14
-; ZVBB-RV32-NEXT:    mul a0, a0, a3
-; ZVBB-RV32-NEXT:    add a0, sp, a0
-; ZVBB-RV32-NEXT:    addi a0, a0, 64
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v12, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-RV32-NEXT:    vmv1r.v v11, v20
+; ZVBB-RV32-NEXT:    add a4, a3, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v12, v16
+; ZVBB-RV32-NEXT:    add a6, a5, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v13, v18
+; ZVBB-RV32-NEXT:    vsseg6e32.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re32.v v14, (a1)
+; ZVBB-RV32-NEXT:    add a1, a6, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v15, (a5)
+; ZVBB-RV32-NEXT:    add a5, a1, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v18, (a5)
+; ZVBB-RV32-NEXT:    add a5, a5, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v19, (a5)
+; ZVBB-RV32-NEXT:    add a5, a4, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v16, (a6)
+; ZVBB-RV32-NEXT:    add a6, a5, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v12, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v13, (a6)
+; ZVBB-RV32-NEXT:    csrr a6, vlenb
+; ZVBB-RV32-NEXT:    li a7, 12
+; ZVBB-RV32-NEXT:    mul a6, a6, a7
+; ZVBB-RV32-NEXT:    add a6, sp, a6
+; ZVBB-RV32-NEXT:    addi a6, a6, 64
+; ZVBB-RV32-NEXT:    vl1re32.v v17, (a1)
+; ZVBB-RV32-NEXT:    vl1re32.v v10, (a4)
+; ZVBB-RV32-NEXT:    vl1re32.v v11, (a5)
+; ZVBB-RV32-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re32.v v9, (a3)
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
-; ZVBB-RV32-NEXT:    add a2, a0, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v14, (a6)
-; ZVBB-RV32-NEXT:    vl1re16.v v15, (a1)
-; ZVBB-RV32-NEXT:    add a5, a0, a5
-; ZVBB-RV32-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT:    add a2, a6, a2
 ; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
-; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl8re16.v v16, (a2)
-; ZVBB-RV32-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a6)
+; ZVBB-RV32-NEXT:    vl8re32.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8re32.v v8, (a6)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
 ; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    addi sp, sp, 80
 ; ZVBB-RV32-NEXT:    ret
 ;
-; ZVBB-RV64-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; ZVBB-RV64-LABEL: vector_interleave_nxv24i32_nxv4i32:
 ; ZVBB-RV64:       # %bb.0:
 ; ZVBB-RV64-NEXT:    addi sp, sp, -80
 ; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; ZVBB-RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; ZVBB-RV64-NEXT:    addi s0, sp, 80
 ; ZVBB-RV64-NEXT:    csrr a0, vlenb
-; ZVBB-RV64-NEXT:    slli a0, a0, 5
+; ZVBB-RV64-NEXT:    li a1, 28
+; ZVBB-RV64-NEXT:    mul a0, a0, a1
 ; ZVBB-RV64-NEXT:    sub sp, sp, a0
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
-; ZVBB-RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-RV64-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV64-NEXT:    addi a0, sp, 64
-; ZVBB-RV64-NEXT:    vmv2r.v v24, v16
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV64-NEXT:    vmv2r.v v20, v14
+; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV64-NEXT:    vmv2r.v v24, v10
 ; ZVBB-RV64-NEXT:    csrr a1, vlenb
-; ZVBB-RV64-NEXT:    slli a2, a1, 3
-; ZVBB-RV64-NEXT:    sub a1, a2, a1
+; ZVBB-RV64-NEXT:    li a0, 6
+; ZVBB-RV64-NEXT:    mul a1, a1, a0
 ; ZVBB-RV64-NEXT:    add a1, sp, a1
 ; ZVBB-RV64-NEXT:    addi a1, a1, 64
-; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV64-NEXT:    vmv1r.v v10, v25
+; ZVBB-RV64-NEXT:    vmv1r.v v11, v23
+; ZVBB-RV64-NEXT:    vmv1r.v v12, v21
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
+; ZVBB-RV64-NEXT:    vmv1r.v v13, v17
 ; ZVBB-RV64-NEXT:    csrr a2, vlenb
-; ZVBB-RV64-NEXT:    vmv2r.v v20, v8
-; ZVBB-RV64-NEXT:    vmv1r.v v1, v20
-; ZVBB-RV64-NEXT:    vmv1r.v v3, v22
-; ZVBB-RV64-NEXT:    vmv1r.v v5, v24
-; ZVBB-RV64-NEXT:    vmv1r.v v7, v26
+; ZVBB-RV64-NEXT:    vmv1r.v v14, v19
+; ZVBB-RV64-NEXT:    vsseg6e32.v v9, (a1)
+; ZVBB-RV64-NEXT:    vmv1r.v v9, v24
+; ZVBB-RV64-NEXT:    add a5, a1, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v10, v22
 ; ZVBB-RV64-NEXT:    add a3, a0, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV64-NEXT:    add a4, a1, a2
-; ZVBB-RV64-NEXT:    slli a5, a2, 2
-; ZVBB-RV64-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV64-NEXT:    slli a6, a2, 4
-; ZVBB-RV64-NEXT:    add a7, a4, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV64-NEXT:    sub a5, a6, a5
-; ZVBB-RV64-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV64-NEXT:    add a6, a7, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV64-NEXT:    vsseg7e16.v v1, (a0)
-; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV64-NEXT:    vsseg7e16.v v21, (a1)
-; ZVBB-RV64-NEXT:    vl1re16.v v18, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v19, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v20, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v21, (a6)
-; ZVBB-RV64-NEXT:    add a6, a3, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v10, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v11, (a6)
-; ZVBB-RV64-NEXT:    vl1re16.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl1re16.v v16, (a4)
-; ZVBB-RV64-NEXT:    vl1re16.v v9, (a3)
-; ZVBB-RV64-NEXT:    vl1re16.v v17, (a7)
-; ZVBB-RV64-NEXT:    csrr a0, vlenb
-; ZVBB-RV64-NEXT:    li a3, 14
-; ZVBB-RV64-NEXT:    mul a0, a0, a3
-; ZVBB-RV64-NEXT:    add a0, sp, a0
-; ZVBB-RV64-NEXT:    addi a0, a0, 64
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v12, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-RV64-NEXT:    vmv1r.v v11, v20
+; ZVBB-RV64-NEXT:    add a4, a3, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v12, v16
+; ZVBB-RV64-NEXT:    add a6, a5, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v13, v18
+; ZVBB-RV64-NEXT:    vsseg6e32.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re32.v v14, (a1)
+; ZVBB-RV64-NEXT:    add a1, a6, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v15, (a5)
+; ZVBB-RV64-NEXT:    add a5, a1, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v18, (a5)
+; ZVBB-RV64-NEXT:    add a5, a5, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v19, (a5)
+; ZVBB-RV64-NEXT:    add a5, a4, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v16, (a6)
+; ZVBB-RV64-NEXT:    add a6, a5, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v12, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v13, (a6)
+; ZVBB-RV64-NEXT:    csrr a6, vlenb
+; ZVBB-RV64-NEXT:    li a7, 12
+; ZVBB-RV64-NEXT:    mul a6, a6, a7
+; ZVBB-RV64-NEXT:    add a6, sp, a6
+; ZVBB-RV64-NEXT:    addi a6, a6, 64
+; ZVBB-RV64-NEXT:    vl1re32.v v17, (a1)
+; ZVBB-RV64-NEXT:    vl1re32.v v10, (a4)
+; ZVBB-RV64-NEXT:    vl1re32.v v11, (a5)
+; ZVBB-RV64-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re32.v v9, (a3)
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
-; ZVBB-RV64-NEXT:    add a2, a0, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v14, (a6)
-; ZVBB-RV64-NEXT:    vl1re16.v v15, (a1)
-; ZVBB-RV64-NEXT:    add a5, a0, a5
-; ZVBB-RV64-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT:    add a2, a6, a2
 ; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
-; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl8re16.v v16, (a2)
-; ZVBB-RV64-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a6)
+; ZVBB-RV64-NEXT:    vl8re32.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8re32.v v8, (a6)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
 ; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    addi sp, sp, 80
 ; ZVBB-RV64-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; ZIP-LABEL: vector_interleave_nxv24i32_nxv4i32:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -80
 ; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; ZIP-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; ZIP-NEXT:    addi s0, sp, 80
 ; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    slli a0, a0, 5
+; ZIP-NEXT:    li a1, 28
+; ZIP-NEXT:    mul a0, a0, a1
 ; ZIP-NEXT:    sub sp, sp, a0
 ; ZIP-NEXT:    andi sp, sp, -64
-; ZIP-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZIP-NEXT:    vmv2r.v v26, v20
-; ZIP-NEXT:    addi a0, sp, 64
-; ZIP-NEXT:    vmv2r.v v24, v16
+; ZIP-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZIP-NEXT:    vmv2r.v v20, v14
+; ZIP-NEXT:    vmv2r.v v22, v12
+; ZIP-NEXT:    vmv2r.v v24, v10
 ; ZIP-NEXT:    csrr a1, vlenb
-; ZIP-NEXT:    slli a2, a1, 3
-; ZIP-NEXT:    sub a1, a2, a1
+; ZIP-NEXT:    li a0, 6
+; ZIP-NEXT:    mul a1, a1, a0
 ; ZIP-NEXT:    add a1, sp, a1
 ; ZIP-NEXT:    addi a1, a1, 64
-; ZIP-NEXT:    vmv2r.v v22, v12
+; ZIP-NEXT:    vmv1r.v v10, v25
+; ZIP-NEXT:    vmv1r.v v11, v23
+; ZIP-NEXT:    vmv1r.v v12, v21
+; ZIP-NEXT:    addi a0, sp, 64
+; ZIP-NEXT:    vmv1r.v v13, v17
 ; ZIP-NEXT:    csrr a2, vlenb
-; ZIP-NEXT:    vmv2r.v v20, v8
-; ZIP-NEXT:    vmv1r.v v1, v20
-; ZIP-NEXT:    vmv1r.v v3, v22
-; ZIP-NEXT:    vmv1r.v v5, v24
-; ZIP-NEXT:    vmv1r.v v7, v26
+; ZIP-NEXT:    vmv1r.v v14, v19
+; ZIP-NEXT:    vsseg6e32.v v9, (a1)
+; ZIP-NEXT:    vmv1r.v v9, v24
+; ZIP-NEXT:    add a5, a1, a2
+; ZIP-NEXT:    vmv1r.v v10, v22
 ; ZIP-NEXT:    add a3, a0, a2
-; ZIP-NEXT:    vmv1r.v v2, v10
-; ZIP-NEXT:    add a4, a1, a2
-; ZIP-NEXT:    slli a5, a2, 2
-; ZIP-NEXT:    vmv1r.v v4, v14
-; ZIP-NEXT:    slli a6, a2, 4
-; ZIP-NEXT:    add a7, a4, a2
-; ZIP-NEXT:    vmv1r.v v6, v18
-; ZIP-NEXT:    sub a5, a6, a5
-; ZIP-NEXT:    vmv1r.v v22, v11
-; ZIP-NEXT:    add a6, a7, a2
-; ZIP-NEXT:    vmv1r.v v24, v15
-; ZIP-NEXT:    vsseg7e16.v v1, (a0)
-; ZIP-NEXT:    vmv1r.v v26, v19
-; ZIP-NEXT:    vsseg7e16.v v21, (a1)
-; ZIP-NEXT:    vl1re16.v v18, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v19, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v20, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v21, (a6)
-; ZIP-NEXT:    add a6, a3, a2
-; ZIP-NEXT:    vl1re16.v v10, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v11, (a6)
-; ZIP-NEXT:    vl1re16.v v8, (a0)
-; ZIP-NEXT:    vl1re16.v v16, (a4)
-; ZIP-NEXT:    vl1re16.v v9, (a3)
-; ZIP-NEXT:    vl1re16.v v17, (a7)
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    li a3, 14
-; ZIP-NEXT:    mul a0, a0, a3
-; ZIP-NEXT:    add a0, sp, a0
-; ZIP-NEXT:    addi a0, a0, 64
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v12, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v13, (a6)
+; ZIP-NEXT:    vmv1r.v v11, v20
+; ZIP-NEXT:    add a4, a3, a2
+; ZIP-NEXT:    vmv1r.v v12, v16
+; ZIP-NEXT:    add a6, a5, a2
+; ZIP-NEXT:    vmv1r.v v13, v18
+; ZIP-NEXT:    vsseg6e32.v v8, (a0)
+; ZIP-NEXT:    vl1re32.v v14, (a1)
+; ZIP-NEXT:    add a1, a6, a2
+; ZIP-NEXT:    vl1re32.v v15, (a5)
+; ZIP-NEXT:    add a5, a1, a2
+; ZIP-NEXT:    vl1re32.v v18, (a5)
+; ZIP-NEXT:    add a5, a5, a2
+; ZIP-NEXT:    vl1re32.v v19, (a5)
+; ZIP-NEXT:    add a5, a4, a2
+; ZIP-NEXT:    vl1re32.v v16, (a6)
+; ZIP-NEXT:    add a6, a5, a2
+; ZIP-NEXT:    vl1re32.v v12, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re32.v v13, (a6)
+; ZIP-NEXT:    csrr a6, vlenb
+; ZIP-NEXT:    li a7, 12
+; ZIP-NEXT:    mul a6, a6, a7
+; ZIP-NEXT:    add a6, sp, a6
+; ZIP-NEXT:    addi a6, a6, 64
+; ZIP-NEXT:    vl1re32.v v17, (a1)
+; ZIP-NEXT:    vl1re32.v v10, (a4)
+; ZIP-NEXT:    vl1re32.v v11, (a5)
+; ZIP-NEXT:    vl1re32.v v8, (a0)
+; ZIP-NEXT:    vl1re32.v v9, (a3)
 ; ZIP-NEXT:    slli a2, a2, 3
-; ZIP-NEXT:    add a2, a0, a2
-; ZIP-NEXT:    vl1re16.v v14, (a6)
-; ZIP-NEXT:    vl1re16.v v15, (a1)
-; ZIP-NEXT:    add a5, a0, a5
-; ZIP-NEXT:    vs2r.v v20, (a5)
+; ZIP-NEXT:    add a2, a6, a2
 ; ZIP-NEXT:    vs4r.v v16, (a2)
-; ZIP-NEXT:    vs8r.v v8, (a0)
-; ZIP-NEXT:    vl8re16.v v16, (a2)
-; ZIP-NEXT:    vl8re16.v v8, (a0)
+; ZIP-NEXT:    vs8r.v v8, (a6)
+; ZIP-NEXT:    vl8re32.v v16, (a2)
+; ZIP-NEXT:    vl8re32.v v8, (a6)
 ; ZIP-NEXT:    addi sp, s0, -80
 ; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    addi sp, sp, 80
 ; ZIP-NEXT:    ret
-  %res = call <vscale x 56 x i16> @llvm.vector.interleave7.nxv56i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i16> %d, <vscale x 8 x i16> %e, <vscale x 8 x i16> %f, <vscale x 8 x i16> %g)
-  ret <vscale x 56 x i16> %res
+  %res = call <vscale x 24 x i32> @llvm.vector.interleave6.nxv24i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f)
+  ret <vscale x 24 x i32> %res
 }
 
-
-define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f, <vscale x 4 x i32> %g) nounwind {
+define <vscale x 12 x i64> @vector_interleave_nxv12i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f) nounwind {
 ;
-; RV32-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; RV32-LABEL: vector_interleave_nxv12i64_nxv2i64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -80
 ; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    addi s0, sp, 80
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    li a1, 28
+; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv2r.v v26, v20
-; RV32-NEXT:    addi a0, sp, 64
-; RV32-NEXT:    vmv2r.v v24, v16
+; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT:    vmv2r.v v20, v14
+; RV32-NEXT:    vmv2r.v v22, v12
+; RV32-NEXT:    vmv2r.v v24, v10
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 3
-; RV32-NEXT:    sub a1, a2, a1
+; RV32-NEXT:    li a0, 6
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 64
-; RV32-NEXT:    vmv2r.v v22, v12
+; RV32-NEXT:    vmv1r.v v10, v25
+; RV32-NEXT:    vmv1r.v v11, v23
+; RV32-NEXT:    vmv1r.v v12, v21
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vmv1r.v v13, v17
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    vmv2r.v v20, v8
-; RV32-NEXT:    vmv1r.v v1, v20
-; RV32-NEXT:    vmv1r.v v3, v22
-; RV32-NEXT:    vmv1r.v v5, v24
-; RV32-NEXT:    vmv1r.v v7, v26
+; RV32-NEXT:    vmv1r.v v14, v19
+; RV32-NEXT:    vsseg6e64.v v9, (a1)
+; RV32-NEXT:    vmv1r.v v9, v24
+; RV32-NEXT:    add a5, a1, a2
+; RV32-NEXT:    vmv1r.v v10, v22
 ; RV32-NEXT:    add a3, a0, a2
-; RV32-NEXT:    vmv1r.v v2, v10
-; RV32-NEXT:    add a4, a1, a2
-; RV32-NEXT:    slli a5, a2, 2
-; RV32-NEXT:    vmv1r.v v4, v14
-; RV32-NEXT:    slli a6, a2, 4
-; RV32-NEXT:    add a7, a4, a2
-; RV32-NEXT:    vmv1r.v v6, v18
-; RV32-NEXT:    sub a5, a6, a5
-; RV32-NEXT:    vmv1r.v v22, v11
-; RV32-NEXT:    add a6, a7, a2
-; RV32-NEXT:    vmv1r.v v24, v15
-; RV32-NEXT:    vsseg7e32.v v1, (a0)
-; RV32-NEXT:    vmv1r.v v26, v19
-; RV32-NEXT:    vsseg7e32.v v21, (a1)
-; RV32-NEXT:    vl1re32.v v18, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v19, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v20, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v21, (a6)
-; RV32-NEXT:    add a6, a3, a2
-; RV32-NEXT:    vl1re32.v v10, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v11, (a6)
-; RV32-NEXT:    vl1re32.v v8, (a0)
-; RV32-NEXT:    vl1re32.v v16, (a4)
-; RV32-NEXT:    vl1re32.v v9, (a3)
-; RV32-NEXT:    vl1re32.v v17, (a7)
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a3, 14
-; RV32-NEXT:    mul a0, a0, a3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 64
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v12, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v13, (a6)
+; RV32-NEXT:    vmv1r.v v11, v20
+; RV32-NEXT:    add a4, a3, a2
+; RV32-NEXT:    vmv1r.v v12, v16
+; RV32-NEXT:    add a6, a5, a2
+; RV32-NEXT:    vmv1r.v v13, v18
+; RV32-NEXT:    vsseg6e64.v v8, (a0)
+; RV32-NEXT:    vl1re64.v v14, (a1)
+; RV32-NEXT:    add a1, a6, a2
+; RV32-NEXT:    vl1re64.v v15, (a5)
+; RV32-NEXT:    add a5, a1, a2
+; RV32-NEXT:    vl1re64.v v18, (a5)
+; RV32-NEXT:    add a5, a5, a2
+; RV32-NEXT:    vl1re64.v v19, (a5)
+; RV32-NEXT:    add a5, a4, a2
+; RV32-NEXT:    vl1re64.v v16, (a6)
+; RV32-NEXT:    add a6, a5, a2
+; RV32-NEXT:    vl1re64.v v12, (a6)
 ; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re64.v v13, (a6)
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    li a7, 12
+; RV32-NEXT:    mul a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 64
+; RV32-NEXT:    vl1re64.v v17, (a1)
+; RV32-NEXT:    vl1re64.v v10, (a4)
+; RV32-NEXT:    vl1re64.v v11, (a5)
+; RV32-NEXT:    vl1re64.v v8, (a0)
+; RV32-NEXT:    vl1re64.v v9, (a3)
 ; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, a0, a2
-; RV32-NEXT:    vl1re32.v v14, (a6)
-; RV32-NEXT:    vl1re32.v v15, (a1)
-; RV32-NEXT:    add a5, a0, a5
-; RV32-NEXT:    vs2r.v v20, (a5)
+; RV32-NEXT:    add a2, a6, a2
 ; RV32-NEXT:    vs4r.v v16, (a2)
-; RV32-NEXT:    vs8r.v v8, (a0)
-; RV32-NEXT:    vl8re32.v v16, (a2)
-; RV32-NEXT:    vl8re32.v v8, (a0)
+; RV32-NEXT:    vs8r.v v8, (a6)
+; RV32-NEXT:    vl8re64.v v16, (a2)
+; RV32-NEXT:    vl8re64.v v8, (a6)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; RV64-LABEL: vector_interleave_nxv12i64_nxv2i64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -80
 ; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    addi s0, sp, 80
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    li a1, 28
+; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; RV64-NEXT:    vmv2r.v v26, v20
-; RV64-NEXT:    addi a0, sp, 64
-; RV64-NEXT:    vmv2r.v v24, v16
+; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv2r.v v20, v14
+; RV64-NEXT:    vmv2r.v v22, v12
+; RV64-NEXT:    vmv2r.v v24, v10
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    sub a1, a2, a1
+; RV64-NEXT:    li a0, 6
+; RV64-NEXT:    mul a1, a1, a0
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 64
-; RV64-NEXT:    vmv2r.v v22, v12
+; RV64-NEXT:    vmv1r.v v10, v25
+; RV64-NEXT:    vmv1r.v v11, v23
+; RV64-NEXT:    vmv1r.v v12, v21
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vmv1r.v v13, v17
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    vmv2r.v v20, v8
-; RV64-NEXT:    vmv1r.v v1, v20
-; RV64-NEXT:    vmv1r.v v3, v22
-; RV64-NEXT:    vmv1r.v v5, v24
-; RV64-NEXT:    vmv1r.v v7, v26
+; RV64-NEXT:    vmv1r.v v14, v19
+; RV64-NEXT:    vsseg6e64.v v9, (a1)
+; RV64-NEXT:    vmv1r.v v9, v24
+; RV64-NEXT:    add a5, a1, a2
+; RV64-NEXT:    vmv1r.v v10, v22
 ; RV64-NEXT:    add a3, a0, a2
-; RV64-NEXT:    vmv1r.v v2, v10
-; RV64-NEXT:    add a4, a1, a2
-; RV64-NEXT:    slli a5, a2, 2
-; RV64-NEXT:    vmv1r.v v4, v14
-; RV64-NEXT:    slli a6, a2, 4
-; RV64-NEXT:    add a7, a4, a2
-; RV64-NEXT:    vmv1r.v v6, v18
-; RV64-NEXT:    sub a5, a6, a5
-; RV64-NEXT:    vmv1r.v v22, v11
-; RV64-NEXT:    add a6, a7, a2
-; RV64-NEXT:    vmv1r.v v24, v15
-; RV64-NEXT:    vsseg7e32.v v1, (a0)
-; RV64-NEXT:    vmv1r.v v26, v19
-; RV64-NEXT:    vsseg7e32.v v21, (a1)
-; RV64-NEXT:    vl1re32.v v18, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v19, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v20, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v21, (a6)
-; RV64-NEXT:    add a6, a3, a2
-; RV64-NEXT:    vl1re32.v v10, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v11, (a6)
-; RV64-NEXT:    vl1re32.v v8, (a0)
-; RV64-NEXT:    vl1re32.v v16, (a4)
-; RV64-NEXT:    vl1re32.v v9, (a3)
-; RV64-NEXT:    vl1re32.v v17, (a7)
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a3, 14
-; RV64-NEXT:    mul a0, a0, a3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 64
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v12, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v13, (a6)
+; RV64-NEXT:    vmv1r.v v11, v20
+; RV64-NEXT:    add a4, a3, a2
+; RV64-NEXT:    vmv1r.v v12, v16
+; RV64-NEXT:    add a6, a5, a2
+; RV64-NEXT:    vmv1r.v v13, v18
+; RV64-NEXT:    vsseg6e64.v v8, (a0)
+; RV64-NEXT:    vl1re64.v v14, (a1)
+; RV64-NEXT:    add a1, a6, a2
+; RV64-NEXT:    vl1re64.v v15, (a5)
+; RV64-NEXT:    add a5, a1, a2
+; RV64-NEXT:    vl1re64.v v18, (a5)
+; RV64-NEXT:    add a5, a5, a2
+; RV64-NEXT:    vl1re64.v v19, (a5)
+; RV64-NEXT:    add a5, a4, a2
+; RV64-NEXT:    vl1re64.v v16, (a6)
+; RV64-NEXT:    add a6, a5, a2
+; RV64-NEXT:    vl1re64.v v12, (a6)
 ; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re64.v v13, (a6)
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    li a7, 12
+; RV64-NEXT:    mul a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 64
+; RV64-NEXT:    vl1re64.v v17, (a1)
+; RV64-NEXT:    vl1re64.v v10, (a4)
+; RV64-NEXT:    vl1re64.v v11, (a5)
+; RV64-NEXT:    vl1re64.v v8, (a0)
+; RV64-NEXT:    vl1re64.v v9, (a3)
 ; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, a0, a2
-; RV64-NEXT:    vl1re32.v v14, (a6)
-; RV64-NEXT:    vl1re32.v v15, (a1)
-; RV64-NEXT:    add a5, a0, a5
-; RV64-NEXT:    vs2r.v v20, (a5)
+; RV64-NEXT:    add a2, a6, a2
 ; RV64-NEXT:    vs4r.v v16, (a2)
-; RV64-NEXT:    vs8r.v v8, (a0)
-; RV64-NEXT:    vl8re32.v v16, (a2)
-; RV64-NEXT:    vl8re32.v v8, (a0)
+; RV64-NEXT:    vs8r.v v8, (a6)
+; RV64-NEXT:    vl8re64.v v16, (a2)
+; RV64-NEXT:    vl8re64.v v8, (a6)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 80
 ; RV64-NEXT:    ret
 ;
-; ZVBB-RV32-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; ZVBB-RV32-LABEL: vector_interleave_nxv12i64_nxv2i64:
 ; ZVBB-RV32:       # %bb.0:
 ; ZVBB-RV32-NEXT:    addi sp, sp, -80
 ; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
 ; ZVBB-RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
 ; ZVBB-RV32-NEXT:    addi s0, sp, 80
 ; ZVBB-RV32-NEXT:    csrr a0, vlenb
-; ZVBB-RV32-NEXT:    slli a0, a0, 5
+; ZVBB-RV32-NEXT:    li a1, 28
+; ZVBB-RV32-NEXT:    mul a0, a0, a1
 ; ZVBB-RV32-NEXT:    sub sp, sp, a0
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
-; ZVBB-RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; ZVBB-RV32-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV32-NEXT:    addi a0, sp, 64
-; ZVBB-RV32-NEXT:    vmv2r.v v24, v16
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV32-NEXT:    vmv2r.v v20, v14
+; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV32-NEXT:    vmv2r.v v24, v10
 ; ZVBB-RV32-NEXT:    csrr a1, vlenb
-; ZVBB-RV32-NEXT:    slli a2, a1, 3
-; ZVBB-RV32-NEXT:    sub a1, a2, a1
+; ZVBB-RV32-NEXT:    li a0, 6
+; ZVBB-RV32-NEXT:    mul a1, a1, a0
 ; ZVBB-RV32-NEXT:    add a1, sp, a1
 ; ZVBB-RV32-NEXT:    addi a1, a1, 64
-; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV32-NEXT:    vmv1r.v v10, v25
+; ZVBB-RV32-NEXT:    vmv1r.v v11, v23
+; ZVBB-RV32-NEXT:    vmv1r.v v12, v21
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
+; ZVBB-RV32-NEXT:    vmv1r.v v13, v17
 ; ZVBB-RV32-NEXT:    csrr a2, vlenb
-; ZVBB-RV32-NEXT:    vmv2r.v v20, v8
-; ZVBB-RV32-NEXT:    vmv1r.v v1, v20
-; ZVBB-RV32-NEXT:    vmv1r.v v3, v22
-; ZVBB-RV32-NEXT:    vmv1r.v v5, v24
-; ZVBB-RV32-NEXT:    vmv1r.v v7, v26
+; ZVBB-RV32-NEXT:    vmv1r.v v14, v19
+; ZVBB-RV32-NEXT:    vsseg6e64.v v9, (a1)
+; ZVBB-RV32-NEXT:    vmv1r.v v9, v24
+; ZVBB-RV32-NEXT:    add a5, a1, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v10, v22
 ; ZVBB-RV32-NEXT:    add a3, a0, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV32-NEXT:    add a4, a1, a2
-; ZVBB-RV32-NEXT:    slli a5, a2, 2
-; ZVBB-RV32-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV32-NEXT:    slli a6, a2, 4
-; ZVBB-RV32-NEXT:    add a7, a4, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV32-NEXT:    sub a5, a6, a5
-; ZVBB-RV32-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV32-NEXT:    add a6, a7, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV32-NEXT:    vsseg7e32.v v1, (a0)
-; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV32-NEXT:    vsseg7e32.v v21, (a1)
-; ZVBB-RV32-NEXT:    vl1re32.v v18, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v19, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v20, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v21, (a6)
-; ZVBB-RV32-NEXT:    add a6, a3, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v10, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v11, (a6)
-; ZVBB-RV32-NEXT:    vl1re32.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl1re32.v v16, (a4)
-; ZVBB-RV32-NEXT:    vl1re32.v v9, (a3)
-; ZVBB-RV32-NEXT:    vl1re32.v v17, (a7)
-; ZVBB-RV32-NEXT:    csrr a0, vlenb
-; ZVBB-RV32-NEXT:    li a3, 14
-; ZVBB-RV32-NEXT:    mul a0, a0, a3
-; ZVBB-RV32-NEXT:    add a0, sp, a0
-; ZVBB-RV32-NEXT:    addi a0, a0, 64
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v12, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v13, (a6)
+; ZVBB-RV32-NEXT:    vmv1r.v v11, v20
+; ZVBB-RV32-NEXT:    add a4, a3, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v12, v16
+; ZVBB-RV32-NEXT:    add a6, a5, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v13, v18
+; ZVBB-RV32-NEXT:    vsseg6e64.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re64.v v14, (a1)
+; ZVBB-RV32-NEXT:    add a1, a6, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v15, (a5)
+; ZVBB-RV32-NEXT:    add a5, a1, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v18, (a5)
+; ZVBB-RV32-NEXT:    add a5, a5, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v19, (a5)
+; ZVBB-RV32-NEXT:    add a5, a4, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v16, (a6)
+; ZVBB-RV32-NEXT:    add a6, a5, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v12, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v13, (a6)
+; ZVBB-RV32-NEXT:    csrr a6, vlenb
+; ZVBB-RV32-NEXT:    li a7, 12
+; ZVBB-RV32-NEXT:    mul a6, a6, a7
+; ZVBB-RV32-NEXT:    add a6, sp, a6
+; ZVBB-RV32-NEXT:    addi a6, a6, 64
+; ZVBB-RV32-NEXT:    vl1re64.v v17, (a1)
+; ZVBB-RV32-NEXT:    vl1re64.v v10, (a4)
+; ZVBB-RV32-NEXT:    vl1re64.v v11, (a5)
+; ZVBB-RV32-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re64.v v9, (a3)
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
-; ZVBB-RV32-NEXT:    add a2, a0, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v14, (a6)
-; ZVBB-RV32-NEXT:    vl1re32.v v15, (a1)
-; ZVBB-RV32-NEXT:    add a5, a0, a5
-; ZVBB-RV32-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT:    add a2, a6, a2
 ; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
-; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl8re32.v v16, (a2)
-; ZVBB-RV32-NEXT:    vl8re32.v v8, (a0)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a6)
+; ZVBB-RV32-NEXT:    vl8re64.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8re64.v v8, (a6)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
 ; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    addi sp, sp, 80
 ; ZVBB-RV32-NEXT:    ret
 ;
-; ZVBB-RV64-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; ZVBB-RV64-LABEL: vector_interleave_nxv12i64_nxv2i64:
 ; ZVBB-RV64:       # %bb.0:
 ; ZVBB-RV64-NEXT:    addi sp, sp, -80
 ; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; ZVBB-RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; ZVBB-RV64-NEXT:    addi s0, sp, 80
 ; ZVBB-RV64-NEXT:    csrr a0, vlenb
-; ZVBB-RV64-NEXT:    slli a0, a0, 5
+; ZVBB-RV64-NEXT:    li a1, 28
+; ZVBB-RV64-NEXT:    mul a0, a0, a1
 ; ZVBB-RV64-NEXT:    sub sp, sp, a0
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
-; ZVBB-RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; ZVBB-RV64-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV64-NEXT:    addi a0, sp, 64
-; ZVBB-RV64-NEXT:    vmv2r.v v24, v16
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV64-NEXT:    vmv2r.v v20, v14
+; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV64-NEXT:    vmv2r.v v24, v10
 ; ZVBB-RV64-NEXT:    csrr a1, vlenb
-; ZVBB-RV64-NEXT:    slli a2, a1, 3
-; ZVBB-RV64-NEXT:    sub a1, a2, a1
+; ZVBB-RV64-NEXT:    li a0, 6
+; ZVBB-RV64-NEXT:    mul a1, a1, a0
 ; ZVBB-RV64-NEXT:    add a1, sp, a1
 ; ZVBB-RV64-NEXT:    addi a1, a1, 64
-; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV64-NEXT:    vmv1r.v v10, v25
+; ZVBB-RV64-NEXT:    vmv1r.v v11, v23
+; ZVBB-RV64-NEXT:    vmv1r.v v12, v21
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
+; ZVBB-RV64-NEXT:    vmv1r.v v13, v17
 ; ZVBB-RV64-NEXT:    csrr a2, vlenb
-; ZVBB-RV64-NEXT:    vmv2r.v v20, v8
-; ZVBB-RV64-NEXT:    vmv1r.v v1, v20
-; ZVBB-RV64-NEXT:    vmv1r.v v3, v22
-; ZVBB-RV64-NEXT:    vmv1r.v v5, v24
-; ZVBB-RV64-NEXT:    vmv1r.v v7, v26
+; ZVBB-RV64-NEXT:    vmv1r.v v14, v19
+; ZVBB-RV64-NEXT:    vsseg6e64.v v9, (a1)
+; ZVBB-RV64-NEXT:    vmv1r.v v9, v24
+; ZVBB-RV64-NEXT:    add a5, a1, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v10, v22
 ; ZVBB-RV64-NEXT:    add a3, a0, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV64-NEXT:    add a4, a1, a2
-; ZVBB-RV64-NEXT:    slli a5, a2, 2
-; ZVBB-RV64-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV64-NEXT:    slli a6, a2, 4
-; ZVBB-RV64-NEXT:    add a7, a4, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV64-NEXT:    sub a5, a6, a5
-; ZVBB-RV64-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV64-NEXT:    add a6, a7, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV64-NEXT:    vsseg7e32.v v1, (a0)
-; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV64-NEXT:    vsseg7e32.v v21, (a1)
-; ZVBB-RV64-NEXT:    vl1re32.v v18, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v19, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v20, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v21, (a6)
-; ZVBB-RV64-NEXT:    add a6, a3, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v10, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v11, (a6)
-; ZVBB-RV64-NEXT:    vl1re32.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl1re32.v v16, (a4)
-; ZVBB-RV64-NEXT:    vl1re32.v v9, (a3)
-; ZVBB-RV64-NEXT:    vl1re32.v v17, (a7)
-; ZVBB-RV64-NEXT:    csrr a0, vlenb
-; ZVBB-RV64-NEXT:    li a3, 14
-; ZVBB-RV64-NEXT:    mul a0, a0, a3
-; ZVBB-RV64-NEXT:    add a0, sp, a0
-; ZVBB-RV64-NEXT:    addi a0, a0, 64
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v12, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v13, (a6)
+; ZVBB-RV64-NEXT:    vmv1r.v v11, v20
+; ZVBB-RV64-NEXT:    add a4, a3, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v12, v16
+; ZVBB-RV64-NEXT:    add a6, a5, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v13, v18
+; ZVBB-RV64-NEXT:    vsseg6e64.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re64.v v14, (a1)
+; ZVBB-RV64-NEXT:    add a1, a6, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v15, (a5)
+; ZVBB-RV64-NEXT:    add a5, a1, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v18, (a5)
+; ZVBB-RV64-NEXT:    add a5, a5, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v19, (a5)
+; ZVBB-RV64-NEXT:    add a5, a4, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v16, (a6)
+; ZVBB-RV64-NEXT:    add a6, a5, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v12, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v13, (a6)
+; ZVBB-RV64-NEXT:    csrr a6, vlenb
+; ZVBB-RV64-NEXT:    li a7, 12
+; ZVBB-RV64-NEXT:    mul a6, a6, a7
+; ZVBB-RV64-NEXT:    add a6, sp, a6
+; ZVBB-RV64-NEXT:    addi a6, a6, 64
+; ZVBB-RV64-NEXT:    vl1re64.v v17, (a1)
+; ZVBB-RV64-NEXT:    vl1re64.v v10, (a4)
+; ZVBB-RV64-NEXT:    vl1re64.v v11, (a5)
+; ZVBB-RV64-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re64.v v9, (a3)
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
-; ZVBB-RV64-NEXT:    add a2, a0, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v14, (a6)
-; ZVBB-RV64-NEXT:    vl1re32.v v15, (a1)
-; ZVBB-RV64-NEXT:    add a5, a0, a5
-; ZVBB-RV64-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT:    add a2, a6, a2
 ; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
-; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl8re32.v v16, (a2)
-; ZVBB-RV64-NEXT:    vl8re32.v v8, (a0)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a6)
+; ZVBB-RV64-NEXT:    vl8re64.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8re64.v v8, (a6)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
 ; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    addi sp, sp, 80
 ; ZVBB-RV64-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; ZIP-LABEL: vector_interleave_nxv12i64_nxv2i64:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -80
 ; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; ZIP-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; ZIP-NEXT:    addi s0, sp, 80
 ; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    slli a0, a0, 5
+; ZIP-NEXT:    li a1, 28
+; ZIP-NEXT:    mul a0, a0, a1
 ; ZIP-NEXT:    sub sp, sp, a0
 ; ZIP-NEXT:    andi sp, sp, -64
-; ZIP-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; ZIP-NEXT:    vmv2r.v v26, v20
-; ZIP-NEXT:    addi a0, sp, 64
-; ZIP-NEXT:    vmv2r.v v24, v16
+; ZIP-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZIP-NEXT:    vmv2r.v v20, v14
+; ZIP-NEXT:    vmv2r.v v22, v12
+; ZIP-NEXT:    vmv2r.v v24, v10
 ; ZIP-NEXT:    csrr a1, vlenb
-; ZIP-NEXT:    slli a2, a1, 3
-; ZIP-NEXT:    sub a1, a2, a1
+; ZIP-NEXT:    li a0, 6
+; ZIP-NEXT:    mul a1, a1, a0
 ; ZIP-NEXT:    add a1, sp, a1
 ; ZIP-NEXT:    addi a1, a1, 64
-; ZIP-NEXT:    vmv2r.v v22, v12
+; ZIP-NEXT:    vmv1r.v v10, v25
+; ZIP-NEXT:    vmv1r.v v11, v23
+; ZIP-NEXT:    vmv1r.v v12, v21
+; ZIP-NEXT:    addi a0, sp, 64
+; ZIP-NEXT:    vmv1r.v v13, v17
 ; ZIP-NEXT:    csrr a2, vlenb
-; ZIP-NEXT:    vmv2r.v v20, v8
-; ZIP-NEXT:    vmv1r.v v1, v20
-; ZIP-NEXT:    vmv1r.v v3, v22
-; ZIP-NEXT:    vmv1r.v v5, v24
-; ZIP-NEXT:    vmv1r.v v7, v26
+; ZIP-NEXT:    vmv1r.v v14, v19
+; ZIP-NEXT:    vsseg6e64.v v9, (a1)
+; ZIP-NEXT:    vmv1r.v v9, v24
+; ZIP-NEXT:    add a5, a1, a2
+; ZIP-NEXT:    vmv1r.v v10, v22
 ; ZIP-NEXT:    add a3, a0, a2
-; ZIP-NEXT:    vmv1r.v v2, v10
-; ZIP-NEXT:    add a4, a1, a2
-; ZIP-NEXT:    slli a5, a2, 2
-; ZIP-NEXT:    vmv1r.v v4, v14
-; ZIP-NEXT:    slli a6, a2, 4
-; ZIP-NEXT:    add a7, a4, a2
-; ZIP-NEXT:    vmv1r.v v6, v18
-; ZIP-NEXT:    sub a5, a6, a5
-; ZIP-NEXT:    vmv1r.v v22, v11
-; ZIP-NEXT:    add a6, a7, a2
-; ZIP-NEXT:    vmv1r.v v24, v15
-; ZIP-NEXT:    vsseg7e32.v v1, (a0)
-; ZIP-NEXT:    vmv1r.v v26, v19
-; ZIP-NEXT:    vsseg7e32.v v21, (a1)
-; ZIP-NEXT:    vl1re32.v v18, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re32.v v19, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re32.v v20, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re32.v v21, (a6)
-; ZIP-NEXT:    add a6, a3, a2
-; ZIP-NEXT:    vl1re32.v v10, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re32.v v11, (a6)
-; ZIP-NEXT:    vl1re32.v v8, (a0)
-; ZIP-NEXT:    vl1re32.v v16, (a4)
-; ZIP-NEXT:    vl1re32.v v9, (a3)
-; ZIP-NEXT:    vl1re32.v v17, (a7)
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    li a3, 14
-; ZIP-NEXT:    mul a0, a0, a3
-; ZIP-NEXT:    add a0, sp, a0
-; ZIP-NEXT:    addi a0, a0, 64
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re32.v v12, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re32.v v13, (a6)
+; ZIP-NEXT:    vmv1r.v v11, v20
+; ZIP-NEXT:    add a4, a3, a2
+; ZIP-NEXT:    vmv1r.v v12, v16
+; ZIP-NEXT:    add a6, a5, a2
+; ZIP-NEXT:    vmv1r.v v13, v18
+; ZIP-NEXT:    vsseg6e64.v v8, (a0)
+; ZIP-NEXT:    vl1re64.v v14, (a1)
+; ZIP-NEXT:    add a1, a6, a2
+; ZIP-NEXT:    vl1re64.v v15, (a5)
+; ZIP-NEXT:    add a5, a1, a2
+; ZIP-NEXT:    vl1re64.v v18, (a5)
+; ZIP-NEXT:    add a5, a5, a2
+; ZIP-NEXT:    vl1re64.v v19, (a5)
+; ZIP-NEXT:    add a5, a4, a2
+; ZIP-NEXT:    vl1re64.v v16, (a6)
+; ZIP-NEXT:    add a6, a5, a2
+; ZIP-NEXT:    vl1re64.v v12, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re64.v v13, (a6)
+; ZIP-NEXT:    csrr a6, vlenb
+; ZIP-NEXT:    li a7, 12
+; ZIP-NEXT:    mul a6, a6, a7
+; ZIP-NEXT:    add a6, sp, a6
+; ZIP-NEXT:    addi a6, a6, 64
+; ZIP-NEXT:    vl1re64.v v17, (a1)
+; ZIP-NEXT:    vl1re64.v v10, (a4)
+; ZIP-NEXT:    vl1re64.v v11, (a5)
+; ZIP-NEXT:    vl1re64.v v8, (a0)
+; ZIP-NEXT:    vl1re64.v v9, (a3)
 ; ZIP-NEXT:    slli a2, a2, 3
-; ZIP-NEXT:    add a2, a0, a2
-; ZIP-NEXT:    vl1re32.v v14, (a6)
-; ZIP-NEXT:    vl1re32.v v15, (a1)
-; ZIP-NEXT:    add a5, a0, a5
-; ZIP-NEXT:    vs2r.v v20, (a5)
+; ZIP-NEXT:    add a2, a6, a2
 ; ZIP-NEXT:    vs4r.v v16, (a2)
-; ZIP-NEXT:    vs8r.v v8, (a0)
-; ZIP-NEXT:    vl8re32.v v16, (a2)
-; ZIP-NEXT:    vl8re32.v v8, (a0)
+; ZIP-NEXT:    vs8r.v v8, (a6)
+; ZIP-NEXT:    vl8re64.v v16, (a2)
+; ZIP-NEXT:    vl8re64.v v8, (a6)
 ; ZIP-NEXT:    addi sp, s0, -80
 ; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    addi sp, sp, 80
 ; ZIP-NEXT:    ret
-  %res = call <vscale x 28 x i32> @llvm.vector.interleave7.nxv28i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f, <vscale x 4 x i32> %g)
-  ret <vscale x 28 x i32> %res
+  %res = call <vscale x 12 x i64> @llvm.vector.interleave6.nxv12i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f)
+  ret <vscale x 12 x i64> %res
 }
 
-define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g) nounwind {
+define <vscale x 112 x i1> @vector_interleave_nxv112i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f, <vscale x 16 x i1> %g) nounwind {
+; CHECK-LABEL: vector_interleave_nxv112i1_nxv16i1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 14
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmv.v.i v14, 0
+; CHECK-NEXT:    addi a4, sp, 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    add a0, sp, a0
+; CHECK-NEXT:    addi a0, a0, 16
+; CHECK-NEXT:    csrr a2, vlenb
+; CHECK-NEXT:    vmerge.vim v16, v14, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v22, v14, 1, v0
+; CHECK-NEXT:    add a3, a4, a2
+; CHECK-NEXT:    srli a1, a2, 2
+; CHECK-NEXT:    add a5, a0, a2
+; CHECK-NEXT:    vmv4r.v v24, v16
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v18, v14, 1, v0
+; CHECK-NEXT:    add a6, a3, a2
+; CHECK-NEXT:    vmv1r.v v25, v22
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v8, v14, 1, v0
+; CHECK-NEXT:    vmv1r.v v26, v18
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v20, v14, 1, v0
+; CHECK-NEXT:    vmv1r.v v27, v8
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmerge.vim v10, v14, 1, v0
+; CHECK-NEXT:    vmv1r.v v28, v20
+; CHECK-NEXT:    vmv1r.v v18, v23
+; CHECK-NEXT:    add a7, a6, a2
+; CHECK-NEXT:    vmv1r.v v29, v10
+; CHECK-NEXT:    vmv1r.v v20, v9
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vmerge.vim v30, v14, 1, v0
+; CHECK-NEXT:    vmv1r.v v22, v11
+; CHECK-NEXT:    vsetvli t0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vsseg7e8.v v24, (a4)
+; CHECK-NEXT:    vmv1r.v v23, v31
+; CHECK-NEXT:    vsseg7e8.v v17, (a0)
+; CHECK-NEXT:    vl1r.v v8, (a6)
+; CHECK-NEXT:    add a6, a7, a2
+; CHECK-NEXT:    vl1r.v v10, (a4)
+; CHECK-NEXT:    add a4, a6, a2
+; CHECK-NEXT:    vl1r.v v12, (a6)
+; CHECK-NEXT:    add a6, a4, a2
+; CHECK-NEXT:    vl1r.v v14, (a6)
+; CHECK-NEXT:    add a6, a5, a2
+; CHECK-NEXT:    vl1r.v v16, (a5)
+; CHECK-NEXT:    add a5, a6, a2
+; CHECK-NEXT:    vl1r.v v18, (a5)
+; CHECK-NEXT:    add a5, a5, a2
+; CHECK-NEXT:    vl1r.v v9, (a7)
+; CHECK-NEXT:    add a7, a5, a2
+; CHECK-NEXT:    vl1r.v v20, (a7)
+; CHECK-NEXT:    add a7, a7, a2
+; CHECK-NEXT:    srli a2, a2, 1
+; CHECK-NEXT:    vl1r.v v11, (a3)
+; CHECK-NEXT:    add a3, a1, a1
+; CHECK-NEXT:    vl1r.v v13, (a4)
+; CHECK-NEXT:    add a4, a2, a2
+; CHECK-NEXT:    vl1r.v v15, (a0)
+; CHECK-NEXT:    vl1r.v v19, (a5)
+; CHECK-NEXT:    vl1r.v v17, (a6)
+; CHECK-NEXT:    vl1r.v v21, (a7)
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmsne.vi v22, v8, 0
+; CHECK-NEXT:    vmsne.vi v0, v10, 0
+; CHECK-NEXT:    vmsne.vi v9, v12, 0
+; CHECK-NEXT:    vmsne.vi v10, v14, 0
+; CHECK-NEXT:    vmsne.vi v11, v18, 0
+; CHECK-NEXT:    vmsne.vi v8, v16, 0
+; CHECK-NEXT:    vmsne.vi v12, v20, 0
+; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v0, v22, a1
+; CHECK-NEXT:    vslideup.vx v9, v10, a1
+; CHECK-NEXT:    vslideup.vx v8, v11, a1
+; CHECK-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v0, v9, a2
+; CHECK-NEXT:    vslideup.vx v8, v12, a2
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 14
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; RV32-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZVBB-LABEL: vector_interleave_nxv112i1_nxv16i1:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    li a1, 14
+; ZVBB-NEXT:    mul a0, a0, a1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT:    vmv.v.i v14, 0
+; ZVBB-NEXT:    addi a4, sp, 16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 3
+; ZVBB-NEXT:    sub a0, a1, a0
+; ZVBB-NEXT:    add a0, sp, a0
+; ZVBB-NEXT:    addi a0, a0, 16
+; ZVBB-NEXT:    csrr a2, vlenb
+; ZVBB-NEXT:    vmerge.vim v16, v14, 1, v0
+; ZVBB-NEXT:    vmv1r.v v0, v8
+; ZVBB-NEXT:    vmerge.vim v22, v14, 1, v0
+; ZVBB-NEXT:    add a3, a4, a2
+; ZVBB-NEXT:    srli a1, a2, 2
+; ZVBB-NEXT:    add a5, a0, a2
+; ZVBB-NEXT:    vmv4r.v v24, v16
+; ZVBB-NEXT:    vmv1r.v v0, v9
+; ZVBB-NEXT:    vmerge.vim v18, v14, 1, v0
+; ZVBB-NEXT:    add a6, a3, a2
+; ZVBB-NEXT:    vmv1r.v v25, v22
+; ZVBB-NEXT:    vmv1r.v v0, v10
+; ZVBB-NEXT:    vmerge.vim v8, v14, 1, v0
+; ZVBB-NEXT:    vmv1r.v v26, v18
+; ZVBB-NEXT:    vmv1r.v v0, v11
+; ZVBB-NEXT:    vmerge.vim v20, v14, 1, v0
+; ZVBB-NEXT:    vmv1r.v v27, v8
+; ZVBB-NEXT:    vmv1r.v v0, v12
+; ZVBB-NEXT:    vmerge.vim v10, v14, 1, v0
+; ZVBB-NEXT:    vmv1r.v v28, v20
+; ZVBB-NEXT:    vmv1r.v v18, v23
+; ZVBB-NEXT:    add a7, a6, a2
+; ZVBB-NEXT:    vmv1r.v v29, v10
+; ZVBB-NEXT:    vmv1r.v v20, v9
+; ZVBB-NEXT:    vmv1r.v v0, v13
+; ZVBB-NEXT:    vmerge.vim v30, v14, 1, v0
+; ZVBB-NEXT:    vmv1r.v v22, v11
+; ZVBB-NEXT:    vsetvli t0, zero, e8, m1, ta, ma
+; ZVBB-NEXT:    vsseg7e8.v v24, (a4)
+; ZVBB-NEXT:    vmv1r.v v23, v31
+; ZVBB-NEXT:    vsseg7e8.v v17, (a0)
+; ZVBB-NEXT:    vl1r.v v8, (a6)
+; ZVBB-NEXT:    add a6, a7, a2
+; ZVBB-NEXT:    vl1r.v v10, (a4)
+; ZVBB-NEXT:    add a4, a6, a2
+; ZVBB-NEXT:    vl1r.v v12, (a6)
+; ZVBB-NEXT:    add a6, a4, a2
+; ZVBB-NEXT:    vl1r.v v14, (a6)
+; ZVBB-NEXT:    add a6, a5, a2
+; ZVBB-NEXT:    vl1r.v v16, (a5)
+; ZVBB-NEXT:    add a5, a6, a2
+; ZVBB-NEXT:    vl1r.v v18, (a5)
+; ZVBB-NEXT:    add a5, a5, a2
+; ZVBB-NEXT:    vl1r.v v9, (a7)
+; ZVBB-NEXT:    add a7, a5, a2
+; ZVBB-NEXT:    vl1r.v v20, (a7)
+; ZVBB-NEXT:    add a7, a7, a2
+; ZVBB-NEXT:    srli a2, a2, 1
+; ZVBB-NEXT:    vl1r.v v11, (a3)
+; ZVBB-NEXT:    add a3, a1, a1
+; ZVBB-NEXT:    vl1r.v v13, (a4)
+; ZVBB-NEXT:    add a4, a2, a2
+; ZVBB-NEXT:    vl1r.v v15, (a0)
+; ZVBB-NEXT:    vl1r.v v19, (a5)
+; ZVBB-NEXT:    vl1r.v v17, (a6)
+; ZVBB-NEXT:    vl1r.v v21, (a7)
+; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT:    vmsne.vi v22, v8, 0
+; ZVBB-NEXT:    vmsne.vi v0, v10, 0
+; ZVBB-NEXT:    vmsne.vi v9, v12, 0
+; ZVBB-NEXT:    vmsne.vi v10, v14, 0
+; ZVBB-NEXT:    vmsne.vi v11, v18, 0
+; ZVBB-NEXT:    vmsne.vi v8, v16, 0
+; ZVBB-NEXT:    vmsne.vi v12, v20, 0
+; ZVBB-NEXT:    vsetvli zero, a3, e8, mf2, ta, ma
+; ZVBB-NEXT:    vslideup.vx v0, v22, a1
+; ZVBB-NEXT:    vslideup.vx v9, v10, a1
+; ZVBB-NEXT:    vslideup.vx v8, v11, a1
+; ZVBB-NEXT:    vsetvli zero, a4, e8, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v0, v9, a2
+; ZVBB-NEXT:    vslideup.vx v8, v12, a2
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    li a1, 14
+; ZVBB-NEXT:    mul a0, a0, a1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 112 x i1> @llvm.vector.interleave7.nxv112i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f, <vscale x 16 x i1> %g)
+  ret <vscale x 112 x i1> %res
+}
+
+
+define <vscale x 112 x i8> @vector_interleave_nxv112i8_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f, <vscale x 16 x i8> %g) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv112i8_nxv16i8:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -80
 ; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
@@ -3461,7 +3871,7 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV32-NEXT:    vmv2r.v v26, v20
 ; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    vmv2r.v v24, v16
@@ -3489,51 +3899,51 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; RV32-NEXT:    vmv1r.v v22, v11
 ; RV32-NEXT:    add a6, a7, a2
 ; RV32-NEXT:    vmv1r.v v24, v15
-; RV32-NEXT:    vsseg7e64.v v1, (a0)
+; RV32-NEXT:    vsseg7e8.v v1, (a0)
 ; RV32-NEXT:    vmv1r.v v26, v19
-; RV32-NEXT:    vsseg7e64.v v21, (a1)
-; RV32-NEXT:    vl1re64.v v18, (a6)
+; RV32-NEXT:    vsseg7e8.v v21, (a1)
+; RV32-NEXT:    vl1r.v v18, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v19, (a6)
+; RV32-NEXT:    vl1r.v v19, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v20, (a6)
+; RV32-NEXT:    vl1r.v v20, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v21, (a6)
+; RV32-NEXT:    vl1r.v v21, (a6)
 ; RV32-NEXT:    add a6, a3, a2
-; RV32-NEXT:    vl1re64.v v10, (a6)
+; RV32-NEXT:    vl1r.v v10, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v11, (a6)
-; RV32-NEXT:    vl1re64.v v8, (a0)
-; RV32-NEXT:    vl1re64.v v16, (a4)
-; RV32-NEXT:    vl1re64.v v9, (a3)
-; RV32-NEXT:    vl1re64.v v17, (a7)
+; RV32-NEXT:    vl1r.v v11, (a6)
+; RV32-NEXT:    vl1r.v v8, (a0)
+; RV32-NEXT:    vl1r.v v16, (a4)
+; RV32-NEXT:    vl1r.v v9, (a3)
+; RV32-NEXT:    vl1r.v v17, (a7)
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a3, 14
 ; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 64
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v12, (a6)
+; RV32-NEXT:    vl1r.v v12, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v13, (a6)
+; RV32-NEXT:    vl1r.v v13, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, a0, a2
-; RV32-NEXT:    vl1re64.v v14, (a6)
-; RV32-NEXT:    vl1re64.v v15, (a1)
+; RV32-NEXT:    vl1r.v v14, (a6)
+; RV32-NEXT:    vl1r.v v15, (a1)
 ; RV32-NEXT:    add a5, a0, a5
 ; RV32-NEXT:    vs2r.v v20, (a5)
 ; RV32-NEXT:    vs4r.v v16, (a2)
 ; RV32-NEXT:    vs8r.v v8, (a0)
-; RV32-NEXT:    vl8re64.v v16, (a2)
-; RV32-NEXT:    vl8re64.v v8, (a0)
+; RV32-NEXT:    vl8r.v v16, (a2)
+; RV32-NEXT:    vl8r.v v8, (a0)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; RV64-LABEL: vector_interleave_nxv112i8_nxv16i8:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -80
 ; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -3543,7 +3953,7 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; RV64-NEXT:    slli a0, a0, 5
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; RV64-NEXT:    vmv2r.v v26, v20
 ; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    vmv2r.v v24, v16
@@ -3571,51 +3981,51 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; RV64-NEXT:    vmv1r.v v22, v11
 ; RV64-NEXT:    add a6, a7, a2
 ; RV64-NEXT:    vmv1r.v v24, v15
-; RV64-NEXT:    vsseg7e64.v v1, (a0)
+; RV64-NEXT:    vsseg7e8.v v1, (a0)
 ; RV64-NEXT:    vmv1r.v v26, v19
-; RV64-NEXT:    vsseg7e64.v v21, (a1)
-; RV64-NEXT:    vl1re64.v v18, (a6)
+; RV64-NEXT:    vsseg7e8.v v21, (a1)
+; RV64-NEXT:    vl1r.v v18, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v19, (a6)
+; RV64-NEXT:    vl1r.v v19, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v20, (a6)
+; RV64-NEXT:    vl1r.v v20, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v21, (a6)
+; RV64-NEXT:    vl1r.v v21, (a6)
 ; RV64-NEXT:    add a6, a3, a2
-; RV64-NEXT:    vl1re64.v v10, (a6)
+; RV64-NEXT:    vl1r.v v10, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v11, (a6)
-; RV64-NEXT:    vl1re64.v v8, (a0)
-; RV64-NEXT:    vl1re64.v v16, (a4)
-; RV64-NEXT:    vl1re64.v v9, (a3)
-; RV64-NEXT:    vl1re64.v v17, (a7)
+; RV64-NEXT:    vl1r.v v11, (a6)
+; RV64-NEXT:    vl1r.v v8, (a0)
+; RV64-NEXT:    vl1r.v v16, (a4)
+; RV64-NEXT:    vl1r.v v9, (a3)
+; RV64-NEXT:    vl1r.v v17, (a7)
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    li a3, 14
 ; RV64-NEXT:    mul a0, a0, a3
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 64
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v12, (a6)
+; RV64-NEXT:    vl1r.v v12, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v13, (a6)
+; RV64-NEXT:    vl1r.v v13, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, a0, a2
-; RV64-NEXT:    vl1re64.v v14, (a6)
-; RV64-NEXT:    vl1re64.v v15, (a1)
+; RV64-NEXT:    vl1r.v v14, (a6)
+; RV64-NEXT:    vl1r.v v15, (a1)
 ; RV64-NEXT:    add a5, a0, a5
 ; RV64-NEXT:    vs2r.v v20, (a5)
 ; RV64-NEXT:    vs4r.v v16, (a2)
 ; RV64-NEXT:    vs8r.v v8, (a0)
-; RV64-NEXT:    vl8re64.v v16, (a2)
-; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    vl8r.v v16, (a2)
+; RV64-NEXT:    vl8r.v v8, (a0)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 80
 ; RV64-NEXT:    ret
 ;
-; ZVBB-RV32-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZVBB-RV32-LABEL: vector_interleave_nxv112i8_nxv16i8:
 ; ZVBB-RV32:       # %bb.0:
 ; ZVBB-RV32-NEXT:    addi sp, sp, -80
 ; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
@@ -3625,7 +4035,7 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; ZVBB-RV32-NEXT:    slli a0, a0, 5
 ; ZVBB-RV32-NEXT:    sub sp, sp, a0
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
-; ZVBB-RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; ZVBB-RV32-NEXT:    vmv2r.v v26, v20
 ; ZVBB-RV32-NEXT:    addi a0, sp, 64
 ; ZVBB-RV32-NEXT:    vmv2r.v v24, v16
@@ -3653,51 +4063,51 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; ZVBB-RV32-NEXT:    vmv1r.v v22, v11
 ; ZVBB-RV32-NEXT:    add a6, a7, a2
 ; ZVBB-RV32-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV32-NEXT:    vsseg7e64.v v1, (a0)
+; ZVBB-RV32-NEXT:    vsseg7e8.v v1, (a0)
 ; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV32-NEXT:    vsseg7e64.v v21, (a1)
-; ZVBB-RV32-NEXT:    vl1re64.v v18, (a6)
+; ZVBB-RV32-NEXT:    vsseg7e8.v v21, (a1)
+; ZVBB-RV32-NEXT:    vl1r.v v18, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v19, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v19, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v20, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v20, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v21, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v21, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a3, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v10, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v10, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v11, (a6)
-; ZVBB-RV32-NEXT:    vl1re64.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl1re64.v v16, (a4)
-; ZVBB-RV32-NEXT:    vl1re64.v v9, (a3)
-; ZVBB-RV32-NEXT:    vl1re64.v v17, (a7)
+; ZVBB-RV32-NEXT:    vl1r.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1r.v v16, (a4)
+; ZVBB-RV32-NEXT:    vl1r.v v9, (a3)
+; ZVBB-RV32-NEXT:    vl1r.v v17, (a7)
 ; ZVBB-RV32-NEXT:    csrr a0, vlenb
 ; ZVBB-RV32-NEXT:    li a3, 14
 ; ZVBB-RV32-NEXT:    mul a0, a0, a3
 ; ZVBB-RV32-NEXT:    add a0, sp, a0
 ; ZVBB-RV32-NEXT:    addi a0, a0, 64
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v12, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v12, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v13, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v13, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
 ; ZVBB-RV32-NEXT:    add a2, a0, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v14, (a6)
-; ZVBB-RV32-NEXT:    vl1re64.v v15, (a1)
+; ZVBB-RV32-NEXT:    vl1r.v v14, (a6)
+; ZVBB-RV32-NEXT:    vl1r.v v15, (a1)
 ; ZVBB-RV32-NEXT:    add a5, a0, a5
 ; ZVBB-RV32-NEXT:    vs2r.v v20, (a5)
 ; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
 ; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl8re64.v v16, (a2)
-; ZVBB-RV32-NEXT:    vl8re64.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8r.v v8, (a0)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
 ; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    addi sp, sp, 80
 ; ZVBB-RV32-NEXT:    ret
 ;
-; ZVBB-RV64-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZVBB-RV64-LABEL: vector_interleave_nxv112i8_nxv16i8:
 ; ZVBB-RV64:       # %bb.0:
 ; ZVBB-RV64-NEXT:    addi sp, sp, -80
 ; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -3707,7 +4117,7 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; ZVBB-RV64-NEXT:    slli a0, a0, 5
 ; ZVBB-RV64-NEXT:    sub sp, sp, a0
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
-; ZVBB-RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; ZVBB-RV64-NEXT:    vmv2r.v v26, v20
 ; ZVBB-RV64-NEXT:    addi a0, sp, 64
 ; ZVBB-RV64-NEXT:    vmv2r.v v24, v16
@@ -3735,51 +4145,51 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; ZVBB-RV64-NEXT:    vmv1r.v v22, v11
 ; ZVBB-RV64-NEXT:    add a6, a7, a2
 ; ZVBB-RV64-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV64-NEXT:    vsseg7e64.v v1, (a0)
+; ZVBB-RV64-NEXT:    vsseg7e8.v v1, (a0)
 ; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV64-NEXT:    vsseg7e64.v v21, (a1)
-; ZVBB-RV64-NEXT:    vl1re64.v v18, (a6)
+; ZVBB-RV64-NEXT:    vsseg7e8.v v21, (a1)
+; ZVBB-RV64-NEXT:    vl1r.v v18, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v19, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v19, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v20, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v20, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v21, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v21, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a3, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v10, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v10, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v11, (a6)
-; ZVBB-RV64-NEXT:    vl1re64.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl1re64.v v16, (a4)
-; ZVBB-RV64-NEXT:    vl1re64.v v9, (a3)
-; ZVBB-RV64-NEXT:    vl1re64.v v17, (a7)
+; ZVBB-RV64-NEXT:    vl1r.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1r.v v16, (a4)
+; ZVBB-RV64-NEXT:    vl1r.v v9, (a3)
+; ZVBB-RV64-NEXT:    vl1r.v v17, (a7)
 ; ZVBB-RV64-NEXT:    csrr a0, vlenb
 ; ZVBB-RV64-NEXT:    li a3, 14
 ; ZVBB-RV64-NEXT:    mul a0, a0, a3
 ; ZVBB-RV64-NEXT:    add a0, sp, a0
 ; ZVBB-RV64-NEXT:    addi a0, a0, 64
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v12, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v12, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v13, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v13, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
 ; ZVBB-RV64-NEXT:    add a2, a0, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v14, (a6)
-; ZVBB-RV64-NEXT:    vl1re64.v v15, (a1)
+; ZVBB-RV64-NEXT:    vl1r.v v14, (a6)
+; ZVBB-RV64-NEXT:    vl1r.v v15, (a1)
 ; ZVBB-RV64-NEXT:    add a5, a0, a5
 ; ZVBB-RV64-NEXT:    vs2r.v v20, (a5)
 ; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
 ; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl8re64.v v16, (a2)
-; ZVBB-RV64-NEXT:    vl8re64.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8r.v v8, (a0)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
 ; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    addi sp, sp, 80
 ; ZVBB-RV64-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZIP-LABEL: vector_interleave_nxv112i8_nxv16i8:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -80
 ; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -3789,7 +4199,7 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; ZIP-NEXT:    slli a0, a0, 5
 ; ZIP-NEXT:    sub sp, sp, a0
 ; ZIP-NEXT:    andi sp, sp, -64
-; ZIP-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZIP-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
 ; ZIP-NEXT:    vmv2r.v v26, v20
 ; ZIP-NEXT:    addi a0, sp, 64
 ; ZIP-NEXT:    vmv2r.v v24, v16
@@ -3817,938 +4227,5556 @@ define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64
 ; ZIP-NEXT:    vmv1r.v v22, v11
 ; ZIP-NEXT:    add a6, a7, a2
 ; ZIP-NEXT:    vmv1r.v v24, v15
-; ZIP-NEXT:    vsseg7e64.v v1, (a0)
+; ZIP-NEXT:    vsseg7e8.v v1, (a0)
 ; ZIP-NEXT:    vmv1r.v v26, v19
-; ZIP-NEXT:    vsseg7e64.v v21, (a1)
-; ZIP-NEXT:    vl1re64.v v18, (a6)
+; ZIP-NEXT:    vsseg7e8.v v21, (a1)
+; ZIP-NEXT:    vl1r.v v18, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re64.v v19, (a6)
+; ZIP-NEXT:    vl1r.v v19, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re64.v v20, (a6)
+; ZIP-NEXT:    vl1r.v v20, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re64.v v21, (a6)
+; ZIP-NEXT:    vl1r.v v21, (a6)
 ; ZIP-NEXT:    add a6, a3, a2
-; ZIP-NEXT:    vl1re64.v v10, (a6)
+; ZIP-NEXT:    vl1r.v v10, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re64.v v11, (a6)
-; ZIP-NEXT:    vl1re64.v v8, (a0)
-; ZIP-NEXT:    vl1re64.v v16, (a4)
-; ZIP-NEXT:    vl1re64.v v9, (a3)
-; ZIP-NEXT:    vl1re64.v v17, (a7)
+; ZIP-NEXT:    vl1r.v v11, (a6)
+; ZIP-NEXT:    vl1r.v v8, (a0)
+; ZIP-NEXT:    vl1r.v v16, (a4)
+; ZIP-NEXT:    vl1r.v v9, (a3)
+; ZIP-NEXT:    vl1r.v v17, (a7)
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    li a3, 14
 ; ZIP-NEXT:    mul a0, a0, a3
 ; ZIP-NEXT:    add a0, sp, a0
 ; ZIP-NEXT:    addi a0, a0, 64
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re64.v v12, (a6)
+; ZIP-NEXT:    vl1r.v v12, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re64.v v13, (a6)
+; ZIP-NEXT:    vl1r.v v13, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
 ; ZIP-NEXT:    slli a2, a2, 3
 ; ZIP-NEXT:    add a2, a0, a2
-; ZIP-NEXT:    vl1re64.v v14, (a6)
-; ZIP-NEXT:    vl1re64.v v15, (a1)
+; ZIP-NEXT:    vl1r.v v14, (a6)
+; ZIP-NEXT:    vl1r.v v15, (a1)
 ; ZIP-NEXT:    add a5, a0, a5
 ; ZIP-NEXT:    vs2r.v v20, (a5)
 ; ZIP-NEXT:    vs4r.v v16, (a2)
 ; ZIP-NEXT:    vs8r.v v8, (a0)
-; ZIP-NEXT:    vl8re64.v v16, (a2)
-; ZIP-NEXT:    vl8re64.v v8, (a0)
+; ZIP-NEXT:    vl8r.v v16, (a2)
+; ZIP-NEXT:    vl8r.v v8, (a0)
 ; ZIP-NEXT:    addi sp, s0, -80
 ; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    addi sp, sp, 80
 ; ZIP-NEXT:    ret
-  %res = call <vscale x 14 x i64> @llvm.vector.interleave7.nxv14i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g)
-  ret <vscale x 14 x i64> %res
-}
-
-; Floats
-
-define <vscale x 4 x bfloat> @vector_interleave_nxv4bf16_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
-; V-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
-; V:       # %bb.0:
-; V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; V-NEXT:    vwaddu.vv v10, v8, v9
-; V-NEXT:    li a0, -1
-; V-NEXT:    csrr a1, vlenb
-; V-NEXT:    vwmaccu.vx v10, a0, v9
-; V-NEXT:    srli a1, a1, 2
-; V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; V-NEXT:    vslidedown.vx v8, v10, a1
-; V-NEXT:    add a0, a1, a1
-; V-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; V-NEXT:    vslideup.vx v10, v8, a1
-; V-NEXT:    vmv.v.v v8, v10
-; V-NEXT:    ret
-;
-; ZVBB-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZVBB-NEXT:    vwsll.vi v10, v9, 16
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
-; ZVBB-NEXT:    srli a0, a0, 2
-; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVBB-NEXT:    vslidedown.vx v8, v10, a0
-; ZVBB-NEXT:    add a1, a0, a0
-; ZVBB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v10, v8, a0
-; ZVBB-NEXT:    vmv.v.v v8, v10
-; ZVBB-NEXT:    ret
-;
-; ZIP-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
-; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZIP-NEXT:    ri.vzip2b.vv v11, v8, v9
-; ZIP-NEXT:    ri.vzip2a.vv v10, v8, v9
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    srli a0, a0, 2
-; ZIP-NEXT:    add a1, a0, a0
-; ZIP-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; ZIP-NEXT:    vslideup.vx v10, v11, a0
-; ZIP-NEXT:    vmv.v.v v8, v10
-; ZIP-NEXT:    ret
-  %res = call <vscale x 4 x bfloat> @llvm.vector.interleave2.nxv4bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
-  ret <vscale x 4 x bfloat> %res
+  %res = call <vscale x 112 x i8> @llvm.vector.interleave7.nxv112i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f, <vscale x 16 x i8> %g)
+  ret <vscale x 112 x i8> %res
 }
 
-define <vscale x 8 x bfloat> @vector_interleave_nxv8bf16_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
-; V-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
-; V:       # %bb.0:
-; V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; V-NEXT:    vmv1r.v v10, v9
-; V-NEXT:    vmv1r.v v11, v8
-; V-NEXT:    vwaddu.vv v8, v11, v10
-; V-NEXT:    li a0, -1
-; V-NEXT:    vwmaccu.vx v8, a0, v10
-; V-NEXT:    ret
-;
-; ZVBB-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-NEXT:    vmv1r.v v10, v9
-; ZVBB-NEXT:    vmv1r.v v11, v8
-; ZVBB-NEXT:    vwsll.vi v8, v10, 16
-; ZVBB-NEXT:    vwaddu.wv v8, v8, v11
-; ZVBB-NEXT:    ret
-;
-; ZIP-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
-; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZIP-NEXT:    vmv1r.v v10, v9
-; ZIP-NEXT:    vmv1r.v v11, v8
-; ZIP-NEXT:    ri.vzip2b.vv v9, v8, v10
-; ZIP-NEXT:    ri.vzip2a.vv v8, v11, v10
-; ZIP-NEXT:    ret
-  %res = call <vscale x 8 x bfloat> @llvm.vector.interleave2.nxv8bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
-  ret <vscale x 8 x bfloat> %res
-}
 
-define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
-; V-LABEL: vector_interleave_nxv4f16_nxv2f16:
-; V:       # %bb.0:
-; V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; V-NEXT:    vwaddu.vv v10, v8, v9
-; V-NEXT:    li a0, -1
-; V-NEXT:    csrr a1, vlenb
-; V-NEXT:    vwmaccu.vx v10, a0, v9
-; V-NEXT:    srli a1, a1, 2
-; V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; V-NEXT:    vslidedown.vx v8, v10, a1
-; V-NEXT:    add a0, a1, a1
-; V-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; V-NEXT:    vslideup.vx v10, v8, a1
-; V-NEXT:    vmv.v.v v8, v10
-; V-NEXT:    ret
+define <vscale x 56 x i16> @vector_interleave_nxv56i16_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i16> %d, <vscale x 8 x i16> %e, <vscale x 8 x i16> %f, <vscale x 8 x i16> %g) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -80
+; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi s0, sp, 80
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT:    vmv2r.v v26, v20
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vmv2r.v v24, v16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a2, a1, 3
+; RV32-NEXT:    sub a1, a2, a1
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    vmv2r.v v22, v12
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    vmv2r.v v20, v8
+; RV32-NEXT:    vmv1r.v v1, v20
+; RV32-NEXT:    vmv1r.v v3, v22
+; RV32-NEXT:    vmv1r.v v5, v24
+; RV32-NEXT:    vmv1r.v v7, v26
+; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    vmv1r.v v2, v10
+; RV32-NEXT:    add a4, a1, a2
+; RV32-NEXT:    slli a5, a2, 2
+; RV32-NEXT:    vmv1r.v v4, v14
+; RV32-NEXT:    slli a6, a2, 4
+; RV32-NEXT:    add a7, a4, a2
+; RV32-NEXT:    vmv1r.v v6, v18
+; RV32-NEXT:    sub a5, a6, a5
+; RV32-NEXT:    vmv1r.v v22, v11
+; RV32-NEXT:    add a6, a7, a2
+; RV32-NEXT:    vmv1r.v v24, v15
+; RV32-NEXT:    vsseg7e16.v v1, (a0)
+; RV32-NEXT:    vmv1r.v v26, v19
+; RV32-NEXT:    vsseg7e16.v v21, (a1)
+; RV32-NEXT:    vl1re16.v v18, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v19, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v20, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v21, (a6)
+; RV32-NEXT:    add a6, a3, a2
+; RV32-NEXT:    vl1re16.v v10, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v11, (a6)
+; RV32-NEXT:    vl1re16.v v8, (a0)
+; RV32-NEXT:    vl1re16.v v16, (a4)
+; RV32-NEXT:    vl1re16.v v9, (a3)
+; RV32-NEXT:    vl1re16.v v17, (a7)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a3, 14
+; RV32-NEXT:    mul a0, a0, a3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v12, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v13, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    vl1re16.v v14, (a6)
+; RV32-NEXT:    vl1re16.v v15, (a1)
+; RV32-NEXT:    add a5, a0, a5
+; RV32-NEXT:    vs2r.v v20, (a5)
+; RV32-NEXT:    vs4r.v v16, (a2)
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    vl8re16.v v16, (a2)
+; RV32-NEXT:    vl8re16.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -80
+; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 80
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -80
+; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi s0, sp, 80
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    sub sp, sp, a0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT:    vmv2r.v v26, v20
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vmv2r.v v24, v16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 3
+; RV64-NEXT:    sub a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 64
+; RV64-NEXT:    vmv2r.v v22, v12
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    vmv2r.v v20, v8
+; RV64-NEXT:    vmv1r.v v1, v20
+; RV64-NEXT:    vmv1r.v v3, v22
+; RV64-NEXT:    vmv1r.v v5, v24
+; RV64-NEXT:    vmv1r.v v7, v26
+; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    vmv1r.v v2, v10
+; RV64-NEXT:    add a4, a1, a2
+; RV64-NEXT:    slli a5, a2, 2
+; RV64-NEXT:    vmv1r.v v4, v14
+; RV64-NEXT:    slli a6, a2, 4
+; RV64-NEXT:    add a7, a4, a2
+; RV64-NEXT:    vmv1r.v v6, v18
+; RV64-NEXT:    sub a5, a6, a5
+; RV64-NEXT:    vmv1r.v v22, v11
+; RV64-NEXT:    add a6, a7, a2
+; RV64-NEXT:    vmv1r.v v24, v15
+; RV64-NEXT:    vsseg7e16.v v1, (a0)
+; RV64-NEXT:    vmv1r.v v26, v19
+; RV64-NEXT:    vsseg7e16.v v21, (a1)
+; RV64-NEXT:    vl1re16.v v18, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v19, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v20, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v21, (a6)
+; RV64-NEXT:    add a6, a3, a2
+; RV64-NEXT:    vl1re16.v v10, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v11, (a6)
+; RV64-NEXT:    vl1re16.v v8, (a0)
+; RV64-NEXT:    vl1re16.v v16, (a4)
+; RV64-NEXT:    vl1re16.v v9, (a3)
+; RV64-NEXT:    vl1re16.v v17, (a7)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a3, 14
+; RV64-NEXT:    mul a0, a0, a3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v12, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v13, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, a0, a2
+; RV64-NEXT:    vl1re16.v v14, (a6)
+; RV64-NEXT:    vl1re16.v v15, (a1)
+; RV64-NEXT:    add a5, a0, a5
+; RV64-NEXT:    vs2r.v v20, (a5)
+; RV64-NEXT:    vs4r.v v16, (a2)
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    vl8re16.v v16, (a2)
+; RV64-NEXT:    vl8re16.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -80
+; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 80
+; RV64-NEXT:    ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; ZVBB-RV32:       # %bb.0:
+; ZVBB-RV32-NEXT:    addi sp, sp, -80
+; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    addi s0, sp, 80
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    slli a0, a0, 5
+; ZVBB-RV32-NEXT:    sub sp, sp, a0
+; ZVBB-RV32-NEXT:    andi sp, sp, -64
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV32-NEXT:    vmv2r.v v26, v20
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
+; ZVBB-RV32-NEXT:    vmv2r.v v24, v16
+; ZVBB-RV32-NEXT:    csrr a1, vlenb
+; ZVBB-RV32-NEXT:    slli a2, a1, 3
+; ZVBB-RV32-NEXT:    sub a1, a2, a1
+; ZVBB-RV32-NEXT:    add a1, sp, a1
+; ZVBB-RV32-NEXT:    addi a1, a1, 64
+; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV32-NEXT:    csrr a2, vlenb
+; ZVBB-RV32-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV32-NEXT:    vmv1r.v v1, v20
+; ZVBB-RV32-NEXT:    vmv1r.v v3, v22
+; ZVBB-RV32-NEXT:    vmv1r.v v5, v24
+; ZVBB-RV32-NEXT:    vmv1r.v v7, v26
+; ZVBB-RV32-NEXT:    add a3, a0, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v2, v10
+; ZVBB-RV32-NEXT:    add a4, a1, a2
+; ZVBB-RV32-NEXT:    slli a5, a2, 2
+; ZVBB-RV32-NEXT:    vmv1r.v v4, v14
+; ZVBB-RV32-NEXT:    slli a6, a2, 4
+; ZVBB-RV32-NEXT:    add a7, a4, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v6, v18
+; ZVBB-RV32-NEXT:    sub a5, a6, a5
+; ZVBB-RV32-NEXT:    vmv1r.v v22, v11
+; ZVBB-RV32-NEXT:    add a6, a7, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v24, v15
+; ZVBB-RV32-NEXT:    vsseg7e16.v v1, (a0)
+; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
+; ZVBB-RV32-NEXT:    vsseg7e16.v v21, (a1)
+; ZVBB-RV32-NEXT:    vl1re16.v v18, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v19, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v20, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v21, (a6)
+; ZVBB-RV32-NEXT:    add a6, a3, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v10, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re16.v v16, (a4)
+; ZVBB-RV32-NEXT:    vl1re16.v v9, (a3)
+; ZVBB-RV32-NEXT:    vl1re16.v v17, (a7)
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    li a3, 14
+; ZVBB-RV32-NEXT:    mul a0, a0, a3
+; ZVBB-RV32-NEXT:    add a0, sp, a0
+; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v12, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    slli a2, a2, 3
+; ZVBB-RV32-NEXT:    add a2, a0, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v14, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v15, (a1)
+; ZVBB-RV32-NEXT:    add a5, a0, a5
+; ZVBB-RV32-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8re16.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT:    addi sp, s0, -80
+; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    addi sp, sp, 80
+; ZVBB-RV32-NEXT:    ret
+;
+; ZVBB-RV64-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; ZVBB-RV64:       # %bb.0:
+; ZVBB-RV64-NEXT:    addi sp, sp, -80
+; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    addi s0, sp, 80
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    slli a0, a0, 5
+; ZVBB-RV64-NEXT:    sub sp, sp, a0
+; ZVBB-RV64-NEXT:    andi sp, sp, -64
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV64-NEXT:    vmv2r.v v26, v20
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
+; ZVBB-RV64-NEXT:    vmv2r.v v24, v16
+; ZVBB-RV64-NEXT:    csrr a1, vlenb
+; ZVBB-RV64-NEXT:    slli a2, a1, 3
+; ZVBB-RV64-NEXT:    sub a1, a2, a1
+; ZVBB-RV64-NEXT:    add a1, sp, a1
+; ZVBB-RV64-NEXT:    addi a1, a1, 64
+; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV64-NEXT:    csrr a2, vlenb
+; ZVBB-RV64-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV64-NEXT:    vmv1r.v v1, v20
+; ZVBB-RV64-NEXT:    vmv1r.v v3, v22
+; ZVBB-RV64-NEXT:    vmv1r.v v5, v24
+; ZVBB-RV64-NEXT:    vmv1r.v v7, v26
+; ZVBB-RV64-NEXT:    add a3, a0, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v2, v10
+; ZVBB-RV64-NEXT:    add a4, a1, a2
+; ZVBB-RV64-NEXT:    slli a5, a2, 2
+; ZVBB-RV64-NEXT:    vmv1r.v v4, v14
+; ZVBB-RV64-NEXT:    slli a6, a2, 4
+; ZVBB-RV64-NEXT:    add a7, a4, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v6, v18
+; ZVBB-RV64-NEXT:    sub a5, a6, a5
+; ZVBB-RV64-NEXT:    vmv1r.v v22, v11
+; ZVBB-RV64-NEXT:    add a6, a7, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v24, v15
+; ZVBB-RV64-NEXT:    vsseg7e16.v v1, (a0)
+; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
+; ZVBB-RV64-NEXT:    vsseg7e16.v v21, (a1)
+; ZVBB-RV64-NEXT:    vl1re16.v v18, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v19, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v20, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v21, (a6)
+; ZVBB-RV64-NEXT:    add a6, a3, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v10, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re16.v v16, (a4)
+; ZVBB-RV64-NEXT:    vl1re16.v v9, (a3)
+; ZVBB-RV64-NEXT:    vl1re16.v v17, (a7)
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    li a3, 14
+; ZVBB-RV64-NEXT:    mul a0, a0, a3
+; ZVBB-RV64-NEXT:    add a0, sp, a0
+; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v12, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    slli a2, a2, 3
+; ZVBB-RV64-NEXT:    add a2, a0, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v14, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v15, (a1)
+; ZVBB-RV64-NEXT:    add a5, a0, a5
+; ZVBB-RV64-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8re16.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT:    addi sp, s0, -80
+; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    addi sp, sp, 80
+; ZVBB-RV64-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv56i16_nxv8i16:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    addi sp, sp, -80
+; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    addi s0, sp, 80
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a0, a0, 5
+; ZIP-NEXT:    sub sp, sp, a0
+; ZIP-NEXT:    andi sp, sp, -64
+; ZIP-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT:    vmv2r.v v26, v20
+; ZIP-NEXT:    addi a0, sp, 64
+; ZIP-NEXT:    vmv2r.v v24, v16
+; ZIP-NEXT:    csrr a1, vlenb
+; ZIP-NEXT:    slli a2, a1, 3
+; ZIP-NEXT:    sub a1, a2, a1
+; ZIP-NEXT:    add a1, sp, a1
+; ZIP-NEXT:    addi a1, a1, 64
+; ZIP-NEXT:    vmv2r.v v22, v12
+; ZIP-NEXT:    csrr a2, vlenb
+; ZIP-NEXT:    vmv2r.v v20, v8
+; ZIP-NEXT:    vmv1r.v v1, v20
+; ZIP-NEXT:    vmv1r.v v3, v22
+; ZIP-NEXT:    vmv1r.v v5, v24
+; ZIP-NEXT:    vmv1r.v v7, v26
+; ZIP-NEXT:    add a3, a0, a2
+; ZIP-NEXT:    vmv1r.v v2, v10
+; ZIP-NEXT:    add a4, a1, a2
+; ZIP-NEXT:    slli a5, a2, 2
+; ZIP-NEXT:    vmv1r.v v4, v14
+; ZIP-NEXT:    slli a6, a2, 4
+; ZIP-NEXT:    add a7, a4, a2
+; ZIP-NEXT:    vmv1r.v v6, v18
+; ZIP-NEXT:    sub a5, a6, a5
+; ZIP-NEXT:    vmv1r.v v22, v11
+; ZIP-NEXT:    add a6, a7, a2
+; ZIP-NEXT:    vmv1r.v v24, v15
+; ZIP-NEXT:    vsseg7e16.v v1, (a0)
+; ZIP-NEXT:    vmv1r.v v26, v19
+; ZIP-NEXT:    vsseg7e16.v v21, (a1)
+; ZIP-NEXT:    vl1re16.v v18, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v19, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v20, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v21, (a6)
+; ZIP-NEXT:    add a6, a3, a2
+; ZIP-NEXT:    vl1re16.v v10, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v11, (a6)
+; ZIP-NEXT:    vl1re16.v v8, (a0)
+; ZIP-NEXT:    vl1re16.v v16, (a4)
+; ZIP-NEXT:    vl1re16.v v9, (a3)
+; ZIP-NEXT:    vl1re16.v v17, (a7)
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    li a3, 14
+; ZIP-NEXT:    mul a0, a0, a3
+; ZIP-NEXT:    add a0, sp, a0
+; ZIP-NEXT:    addi a0, a0, 64
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v12, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v13, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    slli a2, a2, 3
+; ZIP-NEXT:    add a2, a0, a2
+; ZIP-NEXT:    vl1re16.v v14, (a6)
+; ZIP-NEXT:    vl1re16.v v15, (a1)
+; ZIP-NEXT:    add a5, a0, a5
+; ZIP-NEXT:    vs2r.v v20, (a5)
+; ZIP-NEXT:    vs4r.v v16, (a2)
+; ZIP-NEXT:    vs8r.v v8, (a0)
+; ZIP-NEXT:    vl8re16.v v16, (a2)
+; ZIP-NEXT:    vl8re16.v v8, (a0)
+; ZIP-NEXT:    addi sp, s0, -80
+; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    addi sp, sp, 80
+; ZIP-NEXT:    ret
+  %res = call <vscale x 56 x i16> @llvm.vector.interleave7.nxv56i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i16> %d, <vscale x 8 x i16> %e, <vscale x 8 x i16> %f, <vscale x 8 x i16> %g)
+  ret <vscale x 56 x i16> %res
+}
+
+
+define <vscale x 28 x i32> @vector_interleave_nxv28i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f, <vscale x 4 x i32> %g) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -80
+; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi s0, sp, 80
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv2r.v v26, v20
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vmv2r.v v24, v16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a2, a1, 3
+; RV32-NEXT:    sub a1, a2, a1
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    vmv2r.v v22, v12
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    vmv2r.v v20, v8
+; RV32-NEXT:    vmv1r.v v1, v20
+; RV32-NEXT:    vmv1r.v v3, v22
+; RV32-NEXT:    vmv1r.v v5, v24
+; RV32-NEXT:    vmv1r.v v7, v26
+; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    vmv1r.v v2, v10
+; RV32-NEXT:    add a4, a1, a2
+; RV32-NEXT:    slli a5, a2, 2
+; RV32-NEXT:    vmv1r.v v4, v14
+; RV32-NEXT:    slli a6, a2, 4
+; RV32-NEXT:    add a7, a4, a2
+; RV32-NEXT:    vmv1r.v v6, v18
+; RV32-NEXT:    sub a5, a6, a5
+; RV32-NEXT:    vmv1r.v v22, v11
+; RV32-NEXT:    add a6, a7, a2
+; RV32-NEXT:    vmv1r.v v24, v15
+; RV32-NEXT:    vsseg7e32.v v1, (a0)
+; RV32-NEXT:    vmv1r.v v26, v19
+; RV32-NEXT:    vsseg7e32.v v21, (a1)
+; RV32-NEXT:    vl1re32.v v18, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re32.v v19, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re32.v v20, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re32.v v21, (a6)
+; RV32-NEXT:    add a6, a3, a2
+; RV32-NEXT:    vl1re32.v v10, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re32.v v11, (a6)
+; RV32-NEXT:    vl1re32.v v8, (a0)
+; RV32-NEXT:    vl1re32.v v16, (a4)
+; RV32-NEXT:    vl1re32.v v9, (a3)
+; RV32-NEXT:    vl1re32.v v17, (a7)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a3, 14
+; RV32-NEXT:    mul a0, a0, a3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re32.v v12, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re32.v v13, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    vl1re32.v v14, (a6)
+; RV32-NEXT:    vl1re32.v v15, (a1)
+; RV32-NEXT:    add a5, a0, a5
+; RV32-NEXT:    vs2r.v v20, (a5)
+; RV32-NEXT:    vs4r.v v16, (a2)
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    vl8re32.v v16, (a2)
+; RV32-NEXT:    vl8re32.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -80
+; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 80
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -80
+; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi s0, sp, 80
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    sub sp, sp, a0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV64-NEXT:    vmv2r.v v26, v20
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vmv2r.v v24, v16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 3
+; RV64-NEXT:    sub a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 64
+; RV64-NEXT:    vmv2r.v v22, v12
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    vmv2r.v v20, v8
+; RV64-NEXT:    vmv1r.v v1, v20
+; RV64-NEXT:    vmv1r.v v3, v22
+; RV64-NEXT:    vmv1r.v v5, v24
+; RV64-NEXT:    vmv1r.v v7, v26
+; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    vmv1r.v v2, v10
+; RV64-NEXT:    add a4, a1, a2
+; RV64-NEXT:    slli a5, a2, 2
+; RV64-NEXT:    vmv1r.v v4, v14
+; RV64-NEXT:    slli a6, a2, 4
+; RV64-NEXT:    add a7, a4, a2
+; RV64-NEXT:    vmv1r.v v6, v18
+; RV64-NEXT:    sub a5, a6, a5
+; RV64-NEXT:    vmv1r.v v22, v11
+; RV64-NEXT:    add a6, a7, a2
+; RV64-NEXT:    vmv1r.v v24, v15
+; RV64-NEXT:    vsseg7e32.v v1, (a0)
+; RV64-NEXT:    vmv1r.v v26, v19
+; RV64-NEXT:    vsseg7e32.v v21, (a1)
+; RV64-NEXT:    vl1re32.v v18, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re32.v v19, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re32.v v20, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re32.v v21, (a6)
+; RV64-NEXT:    add a6, a3, a2
+; RV64-NEXT:    vl1re32.v v10, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re32.v v11, (a6)
+; RV64-NEXT:    vl1re32.v v8, (a0)
+; RV64-NEXT:    vl1re32.v v16, (a4)
+; RV64-NEXT:    vl1re32.v v9, (a3)
+; RV64-NEXT:    vl1re32.v v17, (a7)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a3, 14
+; RV64-NEXT:    mul a0, a0, a3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re32.v v12, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re32.v v13, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, a0, a2
+; RV64-NEXT:    vl1re32.v v14, (a6)
+; RV64-NEXT:    vl1re32.v v15, (a1)
+; RV64-NEXT:    add a5, a0, a5
+; RV64-NEXT:    vs2r.v v20, (a5)
+; RV64-NEXT:    vs4r.v v16, (a2)
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    vl8re32.v v16, (a2)
+; RV64-NEXT:    vl8re32.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -80
+; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 80
+; RV64-NEXT:    ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; ZVBB-RV32:       # %bb.0:
+; ZVBB-RV32-NEXT:    addi sp, sp, -80
+; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    addi s0, sp, 80
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    slli a0, a0, 5
+; ZVBB-RV32-NEXT:    sub sp, sp, a0
+; ZVBB-RV32-NEXT:    andi sp, sp, -64
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV32-NEXT:    vmv2r.v v26, v20
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
+; ZVBB-RV32-NEXT:    vmv2r.v v24, v16
+; ZVBB-RV32-NEXT:    csrr a1, vlenb
+; ZVBB-RV32-NEXT:    slli a2, a1, 3
+; ZVBB-RV32-NEXT:    sub a1, a2, a1
+; ZVBB-RV32-NEXT:    add a1, sp, a1
+; ZVBB-RV32-NEXT:    addi a1, a1, 64
+; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV32-NEXT:    csrr a2, vlenb
+; ZVBB-RV32-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV32-NEXT:    vmv1r.v v1, v20
+; ZVBB-RV32-NEXT:    vmv1r.v v3, v22
+; ZVBB-RV32-NEXT:    vmv1r.v v5, v24
+; ZVBB-RV32-NEXT:    vmv1r.v v7, v26
+; ZVBB-RV32-NEXT:    add a3, a0, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v2, v10
+; ZVBB-RV32-NEXT:    add a4, a1, a2
+; ZVBB-RV32-NEXT:    slli a5, a2, 2
+; ZVBB-RV32-NEXT:    vmv1r.v v4, v14
+; ZVBB-RV32-NEXT:    slli a6, a2, 4
+; ZVBB-RV32-NEXT:    add a7, a4, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v6, v18
+; ZVBB-RV32-NEXT:    sub a5, a6, a5
+; ZVBB-RV32-NEXT:    vmv1r.v v22, v11
+; ZVBB-RV32-NEXT:    add a6, a7, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v24, v15
+; ZVBB-RV32-NEXT:    vsseg7e32.v v1, (a0)
+; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
+; ZVBB-RV32-NEXT:    vsseg7e32.v v21, (a1)
+; ZVBB-RV32-NEXT:    vl1re32.v v18, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v19, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v20, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v21, (a6)
+; ZVBB-RV32-NEXT:    add a6, a3, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v10, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re32.v v16, (a4)
+; ZVBB-RV32-NEXT:    vl1re32.v v9, (a3)
+; ZVBB-RV32-NEXT:    vl1re32.v v17, (a7)
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    li a3, 14
+; ZVBB-RV32-NEXT:    mul a0, a0, a3
+; ZVBB-RV32-NEXT:    add a0, sp, a0
+; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v12, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v13, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    slli a2, a2, 3
+; ZVBB-RV32-NEXT:    add a2, a0, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v14, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v15, (a1)
+; ZVBB-RV32-NEXT:    add a5, a0, a5
+; ZVBB-RV32-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8re32.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8re32.v v8, (a0)
+; ZVBB-RV32-NEXT:    addi sp, s0, -80
+; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    addi sp, sp, 80
+; ZVBB-RV32-NEXT:    ret
+;
+; ZVBB-RV64-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; ZVBB-RV64:       # %bb.0:
+; ZVBB-RV64-NEXT:    addi sp, sp, -80
+; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    addi s0, sp, 80
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    slli a0, a0, 5
+; ZVBB-RV64-NEXT:    sub sp, sp, a0
+; ZVBB-RV64-NEXT:    andi sp, sp, -64
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV64-NEXT:    vmv2r.v v26, v20
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
+; ZVBB-RV64-NEXT:    vmv2r.v v24, v16
+; ZVBB-RV64-NEXT:    csrr a1, vlenb
+; ZVBB-RV64-NEXT:    slli a2, a1, 3
+; ZVBB-RV64-NEXT:    sub a1, a2, a1
+; ZVBB-RV64-NEXT:    add a1, sp, a1
+; ZVBB-RV64-NEXT:    addi a1, a1, 64
+; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV64-NEXT:    csrr a2, vlenb
+; ZVBB-RV64-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV64-NEXT:    vmv1r.v v1, v20
+; ZVBB-RV64-NEXT:    vmv1r.v v3, v22
+; ZVBB-RV64-NEXT:    vmv1r.v v5, v24
+; ZVBB-RV64-NEXT:    vmv1r.v v7, v26
+; ZVBB-RV64-NEXT:    add a3, a0, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v2, v10
+; ZVBB-RV64-NEXT:    add a4, a1, a2
+; ZVBB-RV64-NEXT:    slli a5, a2, 2
+; ZVBB-RV64-NEXT:    vmv1r.v v4, v14
+; ZVBB-RV64-NEXT:    slli a6, a2, 4
+; ZVBB-RV64-NEXT:    add a7, a4, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v6, v18
+; ZVBB-RV64-NEXT:    sub a5, a6, a5
+; ZVBB-RV64-NEXT:    vmv1r.v v22, v11
+; ZVBB-RV64-NEXT:    add a6, a7, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v24, v15
+; ZVBB-RV64-NEXT:    vsseg7e32.v v1, (a0)
+; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
+; ZVBB-RV64-NEXT:    vsseg7e32.v v21, (a1)
+; ZVBB-RV64-NEXT:    vl1re32.v v18, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v19, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v20, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v21, (a6)
+; ZVBB-RV64-NEXT:    add a6, a3, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v10, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re32.v v16, (a4)
+; ZVBB-RV64-NEXT:    vl1re32.v v9, (a3)
+; ZVBB-RV64-NEXT:    vl1re32.v v17, (a7)
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    li a3, 14
+; ZVBB-RV64-NEXT:    mul a0, a0, a3
+; ZVBB-RV64-NEXT:    add a0, sp, a0
+; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v12, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v13, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    slli a2, a2, 3
+; ZVBB-RV64-NEXT:    add a2, a0, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v14, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v15, (a1)
+; ZVBB-RV64-NEXT:    add a5, a0, a5
+; ZVBB-RV64-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8re32.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8re32.v v8, (a0)
+; ZVBB-RV64-NEXT:    addi sp, s0, -80
+; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    addi sp, sp, 80
+; ZVBB-RV64-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv28i32_nxv4i32:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    addi sp, sp, -80
+; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    addi s0, sp, 80
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a0, a0, 5
+; ZIP-NEXT:    sub sp, sp, a0
+; ZIP-NEXT:    andi sp, sp, -64
+; ZIP-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZIP-NEXT:    vmv2r.v v26, v20
+; ZIP-NEXT:    addi a0, sp, 64
+; ZIP-NEXT:    vmv2r.v v24, v16
+; ZIP-NEXT:    csrr a1, vlenb
+; ZIP-NEXT:    slli a2, a1, 3
+; ZIP-NEXT:    sub a1, a2, a1
+; ZIP-NEXT:    add a1, sp, a1
+; ZIP-NEXT:    addi a1, a1, 64
+; ZIP-NEXT:    vmv2r.v v22, v12
+; ZIP-NEXT:    csrr a2, vlenb
+; ZIP-NEXT:    vmv2r.v v20, v8
+; ZIP-NEXT:    vmv1r.v v1, v20
+; ZIP-NEXT:    vmv1r.v v3, v22
+; ZIP-NEXT:    vmv1r.v v5, v24
+; ZIP-NEXT:    vmv1r.v v7, v26
+; ZIP-NEXT:    add a3, a0, a2
+; ZIP-NEXT:    vmv1r.v v2, v10
+; ZIP-NEXT:    add a4, a1, a2
+; ZIP-NEXT:    slli a5, a2, 2
+; ZIP-NEXT:    vmv1r.v v4, v14
+; ZIP-NEXT:    slli a6, a2, 4
+; ZIP-NEXT:    add a7, a4, a2
+; ZIP-NEXT:    vmv1r.v v6, v18
+; ZIP-NEXT:    sub a5, a6, a5
+; ZIP-NEXT:    vmv1r.v v22, v11
+; ZIP-NEXT:    add a6, a7, a2
+; ZIP-NEXT:    vmv1r.v v24, v15
+; ZIP-NEXT:    vsseg7e32.v v1, (a0)
+; ZIP-NEXT:    vmv1r.v v26, v19
+; ZIP-NEXT:    vsseg7e32.v v21, (a1)
+; ZIP-NEXT:    vl1re32.v v18, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re32.v v19, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re32.v v20, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re32.v v21, (a6)
+; ZIP-NEXT:    add a6, a3, a2
+; ZIP-NEXT:    vl1re32.v v10, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re32.v v11, (a6)
+; ZIP-NEXT:    vl1re32.v v8, (a0)
+; ZIP-NEXT:    vl1re32.v v16, (a4)
+; ZIP-NEXT:    vl1re32.v v9, (a3)
+; ZIP-NEXT:    vl1re32.v v17, (a7)
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    li a3, 14
+; ZIP-NEXT:    mul a0, a0, a3
+; ZIP-NEXT:    add a0, sp, a0
+; ZIP-NEXT:    addi a0, a0, 64
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re32.v v12, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re32.v v13, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    slli a2, a2, 3
+; ZIP-NEXT:    add a2, a0, a2
+; ZIP-NEXT:    vl1re32.v v14, (a6)
+; ZIP-NEXT:    vl1re32.v v15, (a1)
+; ZIP-NEXT:    add a5, a0, a5
+; ZIP-NEXT:    vs2r.v v20, (a5)
+; ZIP-NEXT:    vs4r.v v16, (a2)
+; ZIP-NEXT:    vs8r.v v8, (a0)
+; ZIP-NEXT:    vl8re32.v v16, (a2)
+; ZIP-NEXT:    vl8re32.v v8, (a0)
+; ZIP-NEXT:    addi sp, s0, -80
+; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    addi sp, sp, 80
+; ZIP-NEXT:    ret
+  %res = call <vscale x 28 x i32> @llvm.vector.interleave7.nxv28i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f, <vscale x 4 x i32> %g)
+  ret <vscale x 28 x i32> %res
+}
+
+define <vscale x 14 x i64> @vector_interleave_nxv14i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g) nounwind {
+;
+; RV32-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -80
+; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi s0, sp, 80
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    slli a0, a0, 5
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT:    vmv2r.v v26, v20
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vmv2r.v v24, v16
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a2, a1, 3
+; RV32-NEXT:    sub a1, a2, a1
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    vmv2r.v v22, v12
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    vmv2r.v v20, v8
+; RV32-NEXT:    vmv1r.v v1, v20
+; RV32-NEXT:    vmv1r.v v3, v22
+; RV32-NEXT:    vmv1r.v v5, v24
+; RV32-NEXT:    vmv1r.v v7, v26
+; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    vmv1r.v v2, v10
+; RV32-NEXT:    add a4, a1, a2
+; RV32-NEXT:    slli a5, a2, 2
+; RV32-NEXT:    vmv1r.v v4, v14
+; RV32-NEXT:    slli a6, a2, 4
+; RV32-NEXT:    add a7, a4, a2
+; RV32-NEXT:    vmv1r.v v6, v18
+; RV32-NEXT:    sub a5, a6, a5
+; RV32-NEXT:    vmv1r.v v22, v11
+; RV32-NEXT:    add a6, a7, a2
+; RV32-NEXT:    vmv1r.v v24, v15
+; RV32-NEXT:    vsseg7e64.v v1, (a0)
+; RV32-NEXT:    vmv1r.v v26, v19
+; RV32-NEXT:    vsseg7e64.v v21, (a1)
+; RV32-NEXT:    vl1re64.v v18, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re64.v v19, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re64.v v20, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re64.v v21, (a6)
+; RV32-NEXT:    add a6, a3, a2
+; RV32-NEXT:    vl1re64.v v10, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re64.v v11, (a6)
+; RV32-NEXT:    vl1re64.v v8, (a0)
+; RV32-NEXT:    vl1re64.v v16, (a4)
+; RV32-NEXT:    vl1re64.v v9, (a3)
+; RV32-NEXT:    vl1re64.v v17, (a7)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a3, 14
+; RV32-NEXT:    mul a0, a0, a3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re64.v v12, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re64.v v13, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    vl1re64.v v14, (a6)
+; RV32-NEXT:    vl1re64.v v15, (a1)
+; RV32-NEXT:    add a5, a0, a5
+; RV32-NEXT:    vs2r.v v20, (a5)
+; RV32-NEXT:    vs4r.v v16, (a2)
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    vl8re64.v v16, (a2)
+; RV32-NEXT:    vl8re64.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -80
+; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 80
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -80
+; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi s0, sp, 80
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    slli a0, a0, 5
+; RV64-NEXT:    sub sp, sp, a0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv2r.v v26, v20
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vmv2r.v v24, v16
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 3
+; RV64-NEXT:    sub a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 64
+; RV64-NEXT:    vmv2r.v v22, v12
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    vmv2r.v v20, v8
+; RV64-NEXT:    vmv1r.v v1, v20
+; RV64-NEXT:    vmv1r.v v3, v22
+; RV64-NEXT:    vmv1r.v v5, v24
+; RV64-NEXT:    vmv1r.v v7, v26
+; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    vmv1r.v v2, v10
+; RV64-NEXT:    add a4, a1, a2
+; RV64-NEXT:    slli a5, a2, 2
+; RV64-NEXT:    vmv1r.v v4, v14
+; RV64-NEXT:    slli a6, a2, 4
+; RV64-NEXT:    add a7, a4, a2
+; RV64-NEXT:    vmv1r.v v6, v18
+; RV64-NEXT:    sub a5, a6, a5
+; RV64-NEXT:    vmv1r.v v22, v11
+; RV64-NEXT:    add a6, a7, a2
+; RV64-NEXT:    vmv1r.v v24, v15
+; RV64-NEXT:    vsseg7e64.v v1, (a0)
+; RV64-NEXT:    vmv1r.v v26, v19
+; RV64-NEXT:    vsseg7e64.v v21, (a1)
+; RV64-NEXT:    vl1re64.v v18, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re64.v v19, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re64.v v20, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re64.v v21, (a6)
+; RV64-NEXT:    add a6, a3, a2
+; RV64-NEXT:    vl1re64.v v10, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re64.v v11, (a6)
+; RV64-NEXT:    vl1re64.v v8, (a0)
+; RV64-NEXT:    vl1re64.v v16, (a4)
+; RV64-NEXT:    vl1re64.v v9, (a3)
+; RV64-NEXT:    vl1re64.v v17, (a7)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a3, 14
+; RV64-NEXT:    mul a0, a0, a3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re64.v v12, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re64.v v13, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, a0, a2
+; RV64-NEXT:    vl1re64.v v14, (a6)
+; RV64-NEXT:    vl1re64.v v15, (a1)
+; RV64-NEXT:    add a5, a0, a5
+; RV64-NEXT:    vs2r.v v20, (a5)
+; RV64-NEXT:    vs4r.v v16, (a2)
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    vl8re64.v v16, (a2)
+; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -80
+; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 80
+; RV64-NEXT:    ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZVBB-RV32:       # %bb.0:
+; ZVBB-RV32-NEXT:    addi sp, sp, -80
+; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    addi s0, sp, 80
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    slli a0, a0, 5
+; ZVBB-RV32-NEXT:    sub sp, sp, a0
+; ZVBB-RV32-NEXT:    andi sp, sp, -64
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV32-NEXT:    vmv2r.v v26, v20
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
+; ZVBB-RV32-NEXT:    vmv2r.v v24, v16
+; ZVBB-RV32-NEXT:    csrr a1, vlenb
+; ZVBB-RV32-NEXT:    slli a2, a1, 3
+; ZVBB-RV32-NEXT:    sub a1, a2, a1
+; ZVBB-RV32-NEXT:    add a1, sp, a1
+; ZVBB-RV32-NEXT:    addi a1, a1, 64
+; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV32-NEXT:    csrr a2, vlenb
+; ZVBB-RV32-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV32-NEXT:    vmv1r.v v1, v20
+; ZVBB-RV32-NEXT:    vmv1r.v v3, v22
+; ZVBB-RV32-NEXT:    vmv1r.v v5, v24
+; ZVBB-RV32-NEXT:    vmv1r.v v7, v26
+; ZVBB-RV32-NEXT:    add a3, a0, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v2, v10
+; ZVBB-RV32-NEXT:    add a4, a1, a2
+; ZVBB-RV32-NEXT:    slli a5, a2, 2
+; ZVBB-RV32-NEXT:    vmv1r.v v4, v14
+; ZVBB-RV32-NEXT:    slli a6, a2, 4
+; ZVBB-RV32-NEXT:    add a7, a4, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v6, v18
+; ZVBB-RV32-NEXT:    sub a5, a6, a5
+; ZVBB-RV32-NEXT:    vmv1r.v v22, v11
+; ZVBB-RV32-NEXT:    add a6, a7, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v24, v15
+; ZVBB-RV32-NEXT:    vsseg7e64.v v1, (a0)
+; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
+; ZVBB-RV32-NEXT:    vsseg7e64.v v21, (a1)
+; ZVBB-RV32-NEXT:    vl1re64.v v18, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v19, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v20, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v21, (a6)
+; ZVBB-RV32-NEXT:    add a6, a3, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v10, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re64.v v16, (a4)
+; ZVBB-RV32-NEXT:    vl1re64.v v9, (a3)
+; ZVBB-RV32-NEXT:    vl1re64.v v17, (a7)
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    li a3, 14
+; ZVBB-RV32-NEXT:    mul a0, a0, a3
+; ZVBB-RV32-NEXT:    add a0, sp, a0
+; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v12, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v13, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    slli a2, a2, 3
+; ZVBB-RV32-NEXT:    add a2, a0, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v14, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v15, (a1)
+; ZVBB-RV32-NEXT:    add a5, a0, a5
+; ZVBB-RV32-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8re64.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8re64.v v8, (a0)
+; ZVBB-RV32-NEXT:    addi sp, s0, -80
+; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    addi sp, sp, 80
+; ZVBB-RV32-NEXT:    ret
+;
+; ZVBB-RV64-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZVBB-RV64:       # %bb.0:
+; ZVBB-RV64-NEXT:    addi sp, sp, -80
+; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    addi s0, sp, 80
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    slli a0, a0, 5
+; ZVBB-RV64-NEXT:    sub sp, sp, a0
+; ZVBB-RV64-NEXT:    andi sp, sp, -64
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV64-NEXT:    vmv2r.v v26, v20
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
+; ZVBB-RV64-NEXT:    vmv2r.v v24, v16
+; ZVBB-RV64-NEXT:    csrr a1, vlenb
+; ZVBB-RV64-NEXT:    slli a2, a1, 3
+; ZVBB-RV64-NEXT:    sub a1, a2, a1
+; ZVBB-RV64-NEXT:    add a1, sp, a1
+; ZVBB-RV64-NEXT:    addi a1, a1, 64
+; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV64-NEXT:    csrr a2, vlenb
+; ZVBB-RV64-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV64-NEXT:    vmv1r.v v1, v20
+; ZVBB-RV64-NEXT:    vmv1r.v v3, v22
+; ZVBB-RV64-NEXT:    vmv1r.v v5, v24
+; ZVBB-RV64-NEXT:    vmv1r.v v7, v26
+; ZVBB-RV64-NEXT:    add a3, a0, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v2, v10
+; ZVBB-RV64-NEXT:    add a4, a1, a2
+; ZVBB-RV64-NEXT:    slli a5, a2, 2
+; ZVBB-RV64-NEXT:    vmv1r.v v4, v14
+; ZVBB-RV64-NEXT:    slli a6, a2, 4
+; ZVBB-RV64-NEXT:    add a7, a4, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v6, v18
+; ZVBB-RV64-NEXT:    sub a5, a6, a5
+; ZVBB-RV64-NEXT:    vmv1r.v v22, v11
+; ZVBB-RV64-NEXT:    add a6, a7, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v24, v15
+; ZVBB-RV64-NEXT:    vsseg7e64.v v1, (a0)
+; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
+; ZVBB-RV64-NEXT:    vsseg7e64.v v21, (a1)
+; ZVBB-RV64-NEXT:    vl1re64.v v18, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v19, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v20, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v21, (a6)
+; ZVBB-RV64-NEXT:    add a6, a3, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v10, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re64.v v16, (a4)
+; ZVBB-RV64-NEXT:    vl1re64.v v9, (a3)
+; ZVBB-RV64-NEXT:    vl1re64.v v17, (a7)
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    li a3, 14
+; ZVBB-RV64-NEXT:    mul a0, a0, a3
+; ZVBB-RV64-NEXT:    add a0, sp, a0
+; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v12, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v13, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    slli a2, a2, 3
+; ZVBB-RV64-NEXT:    add a2, a0, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v14, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v15, (a1)
+; ZVBB-RV64-NEXT:    add a5, a0, a5
+; ZVBB-RV64-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8re64.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8re64.v v8, (a0)
+; ZVBB-RV64-NEXT:    addi sp, s0, -80
+; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    addi sp, sp, 80
+; ZVBB-RV64-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv14i64_nxv2i64:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    addi sp, sp, -80
+; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    addi s0, sp, 80
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    slli a0, a0, 5
+; ZIP-NEXT:    sub sp, sp, a0
+; ZIP-NEXT:    andi sp, sp, -64
+; ZIP-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZIP-NEXT:    vmv2r.v v26, v20
+; ZIP-NEXT:    addi a0, sp, 64
+; ZIP-NEXT:    vmv2r.v v24, v16
+; ZIP-NEXT:    csrr a1, vlenb
+; ZIP-NEXT:    slli a2, a1, 3
+; ZIP-NEXT:    sub a1, a2, a1
+; ZIP-NEXT:    add a1, sp, a1
+; ZIP-NEXT:    addi a1, a1, 64
+; ZIP-NEXT:    vmv2r.v v22, v12
+; ZIP-NEXT:    csrr a2, vlenb
+; ZIP-NEXT:    vmv2r.v v20, v8
+; ZIP-NEXT:    vmv1r.v v1, v20
+; ZIP-NEXT:    vmv1r.v v3, v22
+; ZIP-NEXT:    vmv1r.v v5, v24
+; ZIP-NEXT:    vmv1r.v v7, v26
+; ZIP-NEXT:    add a3, a0, a2
+; ZIP-NEXT:    vmv1r.v v2, v10
+; ZIP-NEXT:    add a4, a1, a2
+; ZIP-NEXT:    slli a5, a2, 2
+; ZIP-NEXT:    vmv1r.v v4, v14
+; ZIP-NEXT:    slli a6, a2, 4
+; ZIP-NEXT:    add a7, a4, a2
+; ZIP-NEXT:    vmv1r.v v6, v18
+; ZIP-NEXT:    sub a5, a6, a5
+; ZIP-NEXT:    vmv1r.v v22, v11
+; ZIP-NEXT:    add a6, a7, a2
+; ZIP-NEXT:    vmv1r.v v24, v15
+; ZIP-NEXT:    vsseg7e64.v v1, (a0)
+; ZIP-NEXT:    vmv1r.v v26, v19
+; ZIP-NEXT:    vsseg7e64.v v21, (a1)
+; ZIP-NEXT:    vl1re64.v v18, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re64.v v19, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re64.v v20, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re64.v v21, (a6)
+; ZIP-NEXT:    add a6, a3, a2
+; ZIP-NEXT:    vl1re64.v v10, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re64.v v11, (a6)
+; ZIP-NEXT:    vl1re64.v v8, (a0)
+; ZIP-NEXT:    vl1re64.v v16, (a4)
+; ZIP-NEXT:    vl1re64.v v9, (a3)
+; ZIP-NEXT:    vl1re64.v v17, (a7)
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    li a3, 14
+; ZIP-NEXT:    mul a0, a0, a3
+; ZIP-NEXT:    add a0, sp, a0
+; ZIP-NEXT:    addi a0, a0, 64
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re64.v v12, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re64.v v13, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    slli a2, a2, 3
+; ZIP-NEXT:    add a2, a0, a2
+; ZIP-NEXT:    vl1re64.v v14, (a6)
+; ZIP-NEXT:    vl1re64.v v15, (a1)
+; ZIP-NEXT:    add a5, a0, a5
+; ZIP-NEXT:    vs2r.v v20, (a5)
+; ZIP-NEXT:    vs4r.v v16, (a2)
+; ZIP-NEXT:    vs8r.v v8, (a0)
+; ZIP-NEXT:    vl8re64.v v16, (a2)
+; ZIP-NEXT:    vl8re64.v v8, (a0)
+; ZIP-NEXT:    addi sp, s0, -80
+; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    addi sp, sp, 80
+; ZIP-NEXT:    ret
+  %res = call <vscale x 14 x i64> @llvm.vector.interleave7.nxv14i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g)
+  ret <vscale x 14 x i64> %res
+}
+
+define <vscale x 128 x i1> @vector_interleave_nxv128i1_nxv16i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f, <vscale x 16 x i1> %g, <vscale x 16 x i1> %h) nounwind {
+; CHECK-LABEL: vector_interleave_nxv128i1_nxv16i1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmv.v.i v22, 0
+; CHECK-NEXT:    vmerge.vim v24, v22, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v8
+; CHECK-NEXT:    vmerge.vim v16, v22, 1, v0
+; CHECK-NEXT:    vmv1r.v v1, v24
+; CHECK-NEXT:    vmv1r.v v2, v16
+; CHECK-NEXT:    vmv1r.v v0, v9
+; CHECK-NEXT:    vmerge.vim v26, v22, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v10
+; CHECK-NEXT:    vmerge.vim v18, v22, 1, v0
+; CHECK-NEXT:    vmv1r.v v3, v26
+; CHECK-NEXT:    vmv1r.v v4, v18
+; CHECK-NEXT:    vmv1r.v v0, v11
+; CHECK-NEXT:    vmerge.vim v8, v22, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v12
+; CHECK-NEXT:    vmerge.vim v20, v22, 1, v0
+; CHECK-NEXT:    vmv1r.v v5, v8
+; CHECK-NEXT:    vmv1r.v v6, v20
+; CHECK-NEXT:    vmv1r.v v0, v13
+; CHECK-NEXT:    vmerge.vim v10, v22, 1, v0
+; CHECK-NEXT:    vmv1r.v v0, v14
+; CHECK-NEXT:    vmerge.vim v22, v22, 1, v0
+; CHECK-NEXT:    vmv1r.v v7, v10
+; CHECK-NEXT:    vmv1r.v v8, v22
+; CHECK-NEXT:    vmv1r.v v16, v25
+; CHECK-NEXT:    addi a2, sp, 16
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vsseg8e8.v v1, (a2)
+; CHECK-NEXT:    vmv1r.v v18, v27
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    add a3, a2, a0
+; CHECK-NEXT:    add a4, a1, a0
+; CHECK-NEXT:    add a5, a3, a0
+; CHECK-NEXT:    add a6, a4, a0
+; CHECK-NEXT:    add a7, a5, a0
+; CHECK-NEXT:    add t0, a6, a0
+; CHECK-NEXT:    add t1, a7, a0
+; CHECK-NEXT:    add t2, t0, a0
+; CHECK-NEXT:    vmv1r.v v20, v9
+; CHECK-NEXT:    add t3, t1, a0
+; CHECK-NEXT:    vmv1r.v v22, v11
+; CHECK-NEXT:    vsseg8e8.v v16, (a1)
+; CHECK-NEXT:    vl1r.v v10, (t1)
+; CHECK-NEXT:    add t1, t2, a0
+; CHECK-NEXT:    vl1r.v v12, (a5)
+; CHECK-NEXT:    add a5, t3, a0
+; CHECK-NEXT:    vl1r.v v14, (a2)
+; CHECK-NEXT:    add a2, t1, a0
+; CHECK-NEXT:    vl1r.v v16, (a5)
+; CHECK-NEXT:    add a5, a5, a0
+; CHECK-NEXT:    vl1r.v v8, (a2)
+; CHECK-NEXT:    add a2, a2, a0
+; CHECK-NEXT:    vl1r.v v18, (t2)
+; CHECK-NEXT:    vl1r.v v17, (a5)
+; CHECK-NEXT:    vl1r.v v11, (t3)
+; CHECK-NEXT:    vl1r.v v13, (a7)
+; CHECK-NEXT:    vl1r.v v15, (a3)
+; CHECK-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
+; CHECK-NEXT:    vmsne.vi v20, v16, 0
+; CHECK-NEXT:    vmsne.vi v16, v10, 0
+; CHECK-NEXT:    vl1r.v v10, (a6)
+; CHECK-NEXT:    vmsne.vi v17, v12, 0
+; CHECK-NEXT:    vmsne.vi v0, v14, 0
+; CHECK-NEXT:    vl1r.v v12, (a1)
+; CHECK-NEXT:    vl1r.v v9, (a2)
+; CHECK-NEXT:    vl1r.v v19, (t1)
+; CHECK-NEXT:    vl1r.v v11, (t0)
+; CHECK-NEXT:    vl1r.v v13, (a4)
+; CHECK-NEXT:    vmsne.vi v14, v8, 0
+; CHECK-NEXT:    vmsne.vi v9, v18, 0
+; CHECK-NEXT:    vmsne.vi v15, v10, 0
+; CHECK-NEXT:    vmsne.vi v8, v12, 0
+; CHECK-NEXT:    srli a1, a0, 2
+; CHECK-NEXT:    add a2, a1, a1
+; CHECK-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v16, v20, a1
+; CHECK-NEXT:    vslideup.vx v0, v17, a1
+; CHECK-NEXT:    vslideup.vx v9, v14, a1
+; CHECK-NEXT:    vslideup.vx v8, v15, a1
+; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v0, v16, a0
+; CHECK-NEXT:    vslideup.vx v8, v9, a0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv128i1_nxv16i1:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    vsetvli a0, zero, e8, m2, ta, ma
+; ZVBB-NEXT:    vmv.v.i v22, 0
+; ZVBB-NEXT:    vmerge.vim v24, v22, 1, v0
+; ZVBB-NEXT:    vmv1r.v v0, v8
+; ZVBB-NEXT:    vmerge.vim v16, v22, 1, v0
+; ZVBB-NEXT:    vmv1r.v v1, v24
+; ZVBB-NEXT:    vmv1r.v v2, v16
+; ZVBB-NEXT:    vmv1r.v v0, v9
+; ZVBB-NEXT:    vmerge.vim v26, v22, 1, v0
+; ZVBB-NEXT:    vmv1r.v v0, v10
+; ZVBB-NEXT:    vmerge.vim v18, v22, 1, v0
+; ZVBB-NEXT:    vmv1r.v v3, v26
+; ZVBB-NEXT:    vmv1r.v v4, v18
+; ZVBB-NEXT:    vmv1r.v v0, v11
+; ZVBB-NEXT:    vmerge.vim v8, v22, 1, v0
+; ZVBB-NEXT:    vmv1r.v v0, v12
+; ZVBB-NEXT:    vmerge.vim v20, v22, 1, v0
+; ZVBB-NEXT:    vmv1r.v v5, v8
+; ZVBB-NEXT:    vmv1r.v v6, v20
+; ZVBB-NEXT:    vmv1r.v v0, v13
+; ZVBB-NEXT:    vmerge.vim v10, v22, 1, v0
+; ZVBB-NEXT:    vmv1r.v v0, v14
+; ZVBB-NEXT:    vmerge.vim v22, v22, 1, v0
+; ZVBB-NEXT:    vmv1r.v v7, v10
+; ZVBB-NEXT:    vmv1r.v v8, v22
+; ZVBB-NEXT:    vmv1r.v v16, v25
+; ZVBB-NEXT:    addi a2, sp, 16
+; ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; ZVBB-NEXT:    vsseg8e8.v v1, (a2)
+; ZVBB-NEXT:    vmv1r.v v18, v27
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 3
+; ZVBB-NEXT:    add a1, sp, a1
+; ZVBB-NEXT:    addi a1, a1, 16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    add a3, a2, a0
+; ZVBB-NEXT:    add a4, a1, a0
+; ZVBB-NEXT:    add a5, a3, a0
+; ZVBB-NEXT:    add a6, a4, a0
+; ZVBB-NEXT:    add a7, a5, a0
+; ZVBB-NEXT:    add t0, a6, a0
+; ZVBB-NEXT:    add t1, a7, a0
+; ZVBB-NEXT:    add t2, t0, a0
+; ZVBB-NEXT:    vmv1r.v v20, v9
+; ZVBB-NEXT:    add t3, t1, a0
+; ZVBB-NEXT:    vmv1r.v v22, v11
+; ZVBB-NEXT:    vsseg8e8.v v16, (a1)
+; ZVBB-NEXT:    vl1r.v v10, (t1)
+; ZVBB-NEXT:    add t1, t2, a0
+; ZVBB-NEXT:    vl1r.v v12, (a5)
+; ZVBB-NEXT:    add a5, t3, a0
+; ZVBB-NEXT:    vl1r.v v14, (a2)
+; ZVBB-NEXT:    add a2, t1, a0
+; ZVBB-NEXT:    vl1r.v v16, (a5)
+; ZVBB-NEXT:    add a5, a5, a0
+; ZVBB-NEXT:    vl1r.v v8, (a2)
+; ZVBB-NEXT:    add a2, a2, a0
+; ZVBB-NEXT:    vl1r.v v18, (t2)
+; ZVBB-NEXT:    vl1r.v v17, (a5)
+; ZVBB-NEXT:    vl1r.v v11, (t3)
+; ZVBB-NEXT:    vl1r.v v13, (a7)
+; ZVBB-NEXT:    vl1r.v v15, (a3)
+; ZVBB-NEXT:    vsetvli a3, zero, e8, m2, ta, ma
+; ZVBB-NEXT:    vmsne.vi v20, v16, 0
+; ZVBB-NEXT:    vmsne.vi v16, v10, 0
+; ZVBB-NEXT:    vl1r.v v10, (a6)
+; ZVBB-NEXT:    vmsne.vi v17, v12, 0
+; ZVBB-NEXT:    vmsne.vi v0, v14, 0
+; ZVBB-NEXT:    vl1r.v v12, (a1)
+; ZVBB-NEXT:    vl1r.v v9, (a2)
+; ZVBB-NEXT:    vl1r.v v19, (t1)
+; ZVBB-NEXT:    vl1r.v v11, (t0)
+; ZVBB-NEXT:    vl1r.v v13, (a4)
+; ZVBB-NEXT:    vmsne.vi v14, v8, 0
+; ZVBB-NEXT:    vmsne.vi v9, v18, 0
+; ZVBB-NEXT:    vmsne.vi v15, v10, 0
+; ZVBB-NEXT:    vmsne.vi v8, v12, 0
+; ZVBB-NEXT:    srli a1, a0, 2
+; ZVBB-NEXT:    add a2, a1, a1
+; ZVBB-NEXT:    vsetvli zero, a2, e8, mf2, ta, ma
+; ZVBB-NEXT:    vslideup.vx v16, v20, a1
+; ZVBB-NEXT:    vslideup.vx v0, v17, a1
+; ZVBB-NEXT:    vslideup.vx v9, v14, a1
+; ZVBB-NEXT:    vslideup.vx v8, v15, a1
+; ZVBB-NEXT:    srli a0, a0, 1
+; ZVBB-NEXT:    add a1, a0, a0
+; ZVBB-NEXT:    vsetvli zero, a1, e8, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v0, v16, a0
+; ZVBB-NEXT:    vslideup.vx v8, v9, a0
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 128 x i1> @llvm.vector.interleave8.nxv128i1(<vscale x 16 x i1> %a, <vscale x 16 x i1> %b, <vscale x 16 x i1> %c, <vscale x 16 x i1> %d, <vscale x 16 x i1> %e, <vscale x 16 x i1> %f, <vscale x 16 x i1> %g, <vscale x 16 x i1> %h)
+  ret <vscale x 128 x i1> %res
+}
+
+
+define <vscale x 128 x i8> @vector_interleave_nxv128i8_nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f, <vscale x 16 x i8> %g, <vscale x 16 x i8> %h) nounwind {
+;
+; CHECK-LABEL: vector_interleave_nxv128i8_nxv16i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v22
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vmv2r.v v26, v18
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    add a2, a0, a3
+; CHECK-NEXT:    add a4, a1, a3
+; CHECK-NEXT:    add a5, a2, a3
+; CHECK-NEXT:    vmv1r.v v1, v8
+; CHECK-NEXT:    vmv2r.v v24, v14
+; CHECK-NEXT:    add a6, a4, a3
+; CHECK-NEXT:    vmv2r.v v22, v10
+; CHECK-NEXT:    vmv1r.v v2, v22
+; CHECK-NEXT:    add a7, a5, a3
+; CHECK-NEXT:    vmv1r.v v3, v12
+; CHECK-NEXT:    add t0, a6, a3
+; CHECK-NEXT:    vmv1r.v v4, v24
+; CHECK-NEXT:    add t1, a7, a3
+; CHECK-NEXT:    vmv1r.v v5, v16
+; CHECK-NEXT:    add t2, t0, a3
+; CHECK-NEXT:    vmv1r.v v6, v26
+; CHECK-NEXT:    add t3, t1, a3
+; CHECK-NEXT:    vmv1r.v v7, v20
+; CHECK-NEXT:    add t4, t2, a3
+; CHECK-NEXT:    vmv1r.v v8, v28
+; CHECK-NEXT:    vmv1r.v v22, v9
+; CHECK-NEXT:    add t5, t3, a3
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    add t6, t4, a3
+; CHECK-NEXT:    vmv1r.v v26, v17
+; CHECK-NEXT:    vsseg8e8.v v1, (a0)
+; CHECK-NEXT:    vmv1r.v v28, v21
+; CHECK-NEXT:    vsseg8e8.v v22, (a1)
+; CHECK-NEXT:    vl1r.v v14, (t5)
+; CHECK-NEXT:    add t5, t5, a3
+; CHECK-NEXT:    add a3, t6, a3
+; CHECK-NEXT:    vl1r.v v22, (t6)
+; CHECK-NEXT:    vl1r.v v15, (t5)
+; CHECK-NEXT:    vl1r.v v23, (a3)
+; CHECK-NEXT:    vl1r.v v12, (t1)
+; CHECK-NEXT:    vl1r.v v20, (t2)
+; CHECK-NEXT:    vl1r.v v13, (t3)
+; CHECK-NEXT:    vl1r.v v21, (t4)
+; CHECK-NEXT:    vl1r.v v10, (a5)
+; CHECK-NEXT:    vl1r.v v18, (a6)
+; CHECK-NEXT:    vl1r.v v11, (a7)
+; CHECK-NEXT:    vl1r.v v19, (t0)
+; CHECK-NEXT:    vl1r.v v8, (a0)
+; CHECK-NEXT:    vl1r.v v16, (a1)
+; CHECK-NEXT:    vl1r.v v9, (a2)
+; CHECK-NEXT:    vl1r.v v17, (a4)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv128i8_nxv16i8:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    vsetvli a0, zero, e8, m1, ta, ma
+; ZVBB-NEXT:    vmv2r.v v28, v22
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    vmv2r.v v26, v18
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 3
+; ZVBB-NEXT:    add a1, sp, a1
+; ZVBB-NEXT:    addi a1, a1, 16
+; ZVBB-NEXT:    csrr a3, vlenb
+; ZVBB-NEXT:    add a2, a0, a3
+; ZVBB-NEXT:    add a4, a1, a3
+; ZVBB-NEXT:    add a5, a2, a3
+; ZVBB-NEXT:    vmv1r.v v1, v8
+; ZVBB-NEXT:    vmv2r.v v24, v14
+; ZVBB-NEXT:    add a6, a4, a3
+; ZVBB-NEXT:    vmv2r.v v22, v10
+; ZVBB-NEXT:    vmv1r.v v2, v22
+; ZVBB-NEXT:    add a7, a5, a3
+; ZVBB-NEXT:    vmv1r.v v3, v12
+; ZVBB-NEXT:    add t0, a6, a3
+; ZVBB-NEXT:    vmv1r.v v4, v24
+; ZVBB-NEXT:    add t1, a7, a3
+; ZVBB-NEXT:    vmv1r.v v5, v16
+; ZVBB-NEXT:    add t2, t0, a3
+; ZVBB-NEXT:    vmv1r.v v6, v26
+; ZVBB-NEXT:    add t3, t1, a3
+; ZVBB-NEXT:    vmv1r.v v7, v20
+; ZVBB-NEXT:    add t4, t2, a3
+; ZVBB-NEXT:    vmv1r.v v8, v28
+; ZVBB-NEXT:    vmv1r.v v22, v9
+; ZVBB-NEXT:    add t5, t3, a3
+; ZVBB-NEXT:    vmv1r.v v24, v13
+; ZVBB-NEXT:    add t6, t4, a3
+; ZVBB-NEXT:    vmv1r.v v26, v17
+; ZVBB-NEXT:    vsseg8e8.v v1, (a0)
+; ZVBB-NEXT:    vmv1r.v v28, v21
+; ZVBB-NEXT:    vsseg8e8.v v22, (a1)
+; ZVBB-NEXT:    vl1r.v v14, (t5)
+; ZVBB-NEXT:    add t5, t5, a3
+; ZVBB-NEXT:    add a3, t6, a3
+; ZVBB-NEXT:    vl1r.v v22, (t6)
+; ZVBB-NEXT:    vl1r.v v15, (t5)
+; ZVBB-NEXT:    vl1r.v v23, (a3)
+; ZVBB-NEXT:    vl1r.v v12, (t1)
+; ZVBB-NEXT:    vl1r.v v20, (t2)
+; ZVBB-NEXT:    vl1r.v v13, (t3)
+; ZVBB-NEXT:    vl1r.v v21, (t4)
+; ZVBB-NEXT:    vl1r.v v10, (a5)
+; ZVBB-NEXT:    vl1r.v v18, (a6)
+; ZVBB-NEXT:    vl1r.v v11, (a7)
+; ZVBB-NEXT:    vl1r.v v19, (t0)
+; ZVBB-NEXT:    vl1r.v v8, (a0)
+; ZVBB-NEXT:    vl1r.v v16, (a1)
+; ZVBB-NEXT:    vl1r.v v9, (a2)
+; ZVBB-NEXT:    vl1r.v v17, (a4)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 128 x i8> @llvm.vector.interleave8.nxv128i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c, <vscale x 16 x i8> %d, <vscale x 16 x i8> %e, <vscale x 16 x i8> %f, <vscale x 16 x i8> %g, <vscale x 16 x i8> %h)
+  ret <vscale x 128 x i8> %res
+}
+
+
+define <vscale x 64 x i16> @vector_interleave_nxv64i16_nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i16> %d, <vscale x 8 x i16> %e, <vscale x 8 x i16> %f, <vscale x 8 x i16> %g, <vscale x 8 x i16> %h) nounwind {
+;
+; CHECK-LABEL: vector_interleave_nxv64i16_nxv8i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v22
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vmv2r.v v26, v18
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    add a2, a0, a3
+; CHECK-NEXT:    add a4, a1, a3
+; CHECK-NEXT:    add a5, a2, a3
+; CHECK-NEXT:    vmv1r.v v1, v8
+; CHECK-NEXT:    vmv2r.v v24, v14
+; CHECK-NEXT:    add a6, a4, a3
+; CHECK-NEXT:    vmv2r.v v22, v10
+; CHECK-NEXT:    vmv1r.v v2, v22
+; CHECK-NEXT:    add a7, a5, a3
+; CHECK-NEXT:    vmv1r.v v3, v12
+; CHECK-NEXT:    add t0, a6, a3
+; CHECK-NEXT:    vmv1r.v v4, v24
+; CHECK-NEXT:    add t1, a7, a3
+; CHECK-NEXT:    vmv1r.v v5, v16
+; CHECK-NEXT:    add t2, t0, a3
+; CHECK-NEXT:    vmv1r.v v6, v26
+; CHECK-NEXT:    add t3, t1, a3
+; CHECK-NEXT:    vmv1r.v v7, v20
+; CHECK-NEXT:    add t4, t2, a3
+; CHECK-NEXT:    vmv1r.v v8, v28
+; CHECK-NEXT:    vmv1r.v v22, v9
+; CHECK-NEXT:    add t5, t3, a3
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    add t6, t4, a3
+; CHECK-NEXT:    vmv1r.v v26, v17
+; CHECK-NEXT:    vsseg8e16.v v1, (a0)
+; CHECK-NEXT:    vmv1r.v v28, v21
+; CHECK-NEXT:    vsseg8e16.v v22, (a1)
+; CHECK-NEXT:    vl1re16.v v14, (t5)
+; CHECK-NEXT:    add t5, t5, a3
+; CHECK-NEXT:    add a3, t6, a3
+; CHECK-NEXT:    vl1re16.v v22, (t6)
+; CHECK-NEXT:    vl1re16.v v15, (t5)
+; CHECK-NEXT:    vl1re16.v v23, (a3)
+; CHECK-NEXT:    vl1re16.v v12, (t1)
+; CHECK-NEXT:    vl1re16.v v20, (t2)
+; CHECK-NEXT:    vl1re16.v v13, (t3)
+; CHECK-NEXT:    vl1re16.v v21, (t4)
+; CHECK-NEXT:    vl1re16.v v10, (a5)
+; CHECK-NEXT:    vl1re16.v v18, (a6)
+; CHECK-NEXT:    vl1re16.v v11, (a7)
+; CHECK-NEXT:    vl1re16.v v19, (t0)
+; CHECK-NEXT:    vl1re16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v16, (a1)
+; CHECK-NEXT:    vl1re16.v v9, (a2)
+; CHECK-NEXT:    vl1re16.v v17, (a4)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64i16_nxv8i16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vmv2r.v v28, v22
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    vmv2r.v v26, v18
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 3
+; ZVBB-NEXT:    add a1, sp, a1
+; ZVBB-NEXT:    addi a1, a1, 16
+; ZVBB-NEXT:    csrr a3, vlenb
+; ZVBB-NEXT:    add a2, a0, a3
+; ZVBB-NEXT:    add a4, a1, a3
+; ZVBB-NEXT:    add a5, a2, a3
+; ZVBB-NEXT:    vmv1r.v v1, v8
+; ZVBB-NEXT:    vmv2r.v v24, v14
+; ZVBB-NEXT:    add a6, a4, a3
+; ZVBB-NEXT:    vmv2r.v v22, v10
+; ZVBB-NEXT:    vmv1r.v v2, v22
+; ZVBB-NEXT:    add a7, a5, a3
+; ZVBB-NEXT:    vmv1r.v v3, v12
+; ZVBB-NEXT:    add t0, a6, a3
+; ZVBB-NEXT:    vmv1r.v v4, v24
+; ZVBB-NEXT:    add t1, a7, a3
+; ZVBB-NEXT:    vmv1r.v v5, v16
+; ZVBB-NEXT:    add t2, t0, a3
+; ZVBB-NEXT:    vmv1r.v v6, v26
+; ZVBB-NEXT:    add t3, t1, a3
+; ZVBB-NEXT:    vmv1r.v v7, v20
+; ZVBB-NEXT:    add t4, t2, a3
+; ZVBB-NEXT:    vmv1r.v v8, v28
+; ZVBB-NEXT:    vmv1r.v v22, v9
+; ZVBB-NEXT:    add t5, t3, a3
+; ZVBB-NEXT:    vmv1r.v v24, v13
+; ZVBB-NEXT:    add t6, t4, a3
+; ZVBB-NEXT:    vmv1r.v v26, v17
+; ZVBB-NEXT:    vsseg8e16.v v1, (a0)
+; ZVBB-NEXT:    vmv1r.v v28, v21
+; ZVBB-NEXT:    vsseg8e16.v v22, (a1)
+; ZVBB-NEXT:    vl1re16.v v14, (t5)
+; ZVBB-NEXT:    add t5, t5, a3
+; ZVBB-NEXT:    add a3, t6, a3
+; ZVBB-NEXT:    vl1re16.v v22, (t6)
+; ZVBB-NEXT:    vl1re16.v v15, (t5)
+; ZVBB-NEXT:    vl1re16.v v23, (a3)
+; ZVBB-NEXT:    vl1re16.v v12, (t1)
+; ZVBB-NEXT:    vl1re16.v v20, (t2)
+; ZVBB-NEXT:    vl1re16.v v13, (t3)
+; ZVBB-NEXT:    vl1re16.v v21, (t4)
+; ZVBB-NEXT:    vl1re16.v v10, (a5)
+; ZVBB-NEXT:    vl1re16.v v18, (a6)
+; ZVBB-NEXT:    vl1re16.v v11, (a7)
+; ZVBB-NEXT:    vl1re16.v v19, (t0)
+; ZVBB-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v16, (a1)
+; ZVBB-NEXT:    vl1re16.v v9, (a2)
+; ZVBB-NEXT:    vl1re16.v v17, (a4)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 64 x i16> @llvm.vector.interleave8.nxv64i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c, <vscale x 8 x i16> %d, <vscale x 8 x i16> %e, <vscale x 8 x i16> %f, <vscale x 8 x i16> %g, <vscale x 8 x i16> %h)
+  ret <vscale x 64 x i16> %res
+}
+
+
+define <vscale x 32 x i32> @vector_interleave_nxv32i32_nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f, <vscale x 4 x i32> %g, <vscale x 4 x i32> %h) nounwind {
+;
+; CHECK-LABEL: vector_interleave_nxv32i32_nxv4i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v22
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vmv2r.v v26, v18
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    add a2, a0, a3
+; CHECK-NEXT:    add a4, a1, a3
+; CHECK-NEXT:    add a5, a2, a3
+; CHECK-NEXT:    vmv1r.v v1, v8
+; CHECK-NEXT:    vmv2r.v v24, v14
+; CHECK-NEXT:    add a6, a4, a3
+; CHECK-NEXT:    vmv2r.v v22, v10
+; CHECK-NEXT:    vmv1r.v v2, v22
+; CHECK-NEXT:    add a7, a5, a3
+; CHECK-NEXT:    vmv1r.v v3, v12
+; CHECK-NEXT:    add t0, a6, a3
+; CHECK-NEXT:    vmv1r.v v4, v24
+; CHECK-NEXT:    add t1, a7, a3
+; CHECK-NEXT:    vmv1r.v v5, v16
+; CHECK-NEXT:    add t2, t0, a3
+; CHECK-NEXT:    vmv1r.v v6, v26
+; CHECK-NEXT:    add t3, t1, a3
+; CHECK-NEXT:    vmv1r.v v7, v20
+; CHECK-NEXT:    add t4, t2, a3
+; CHECK-NEXT:    vmv1r.v v8, v28
+; CHECK-NEXT:    vmv1r.v v22, v9
+; CHECK-NEXT:    add t5, t3, a3
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    add t6, t4, a3
+; CHECK-NEXT:    vmv1r.v v26, v17
+; CHECK-NEXT:    vsseg8e32.v v1, (a0)
+; CHECK-NEXT:    vmv1r.v v28, v21
+; CHECK-NEXT:    vsseg8e32.v v22, (a1)
+; CHECK-NEXT:    vl1re32.v v14, (t5)
+; CHECK-NEXT:    add t5, t5, a3
+; CHECK-NEXT:    add a3, t6, a3
+; CHECK-NEXT:    vl1re32.v v22, (t6)
+; CHECK-NEXT:    vl1re32.v v15, (t5)
+; CHECK-NEXT:    vl1re32.v v23, (a3)
+; CHECK-NEXT:    vl1re32.v v12, (t1)
+; CHECK-NEXT:    vl1re32.v v20, (t2)
+; CHECK-NEXT:    vl1re32.v v13, (t3)
+; CHECK-NEXT:    vl1re32.v v21, (t4)
+; CHECK-NEXT:    vl1re32.v v10, (a5)
+; CHECK-NEXT:    vl1re32.v v18, (a6)
+; CHECK-NEXT:    vl1re32.v v11, (a7)
+; CHECK-NEXT:    vl1re32.v v19, (t0)
+; CHECK-NEXT:    vl1re32.v v8, (a0)
+; CHECK-NEXT:    vl1re32.v v16, (a1)
+; CHECK-NEXT:    vl1re32.v v9, (a2)
+; CHECK-NEXT:    vl1re32.v v17, (a4)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv32i32_nxv4i32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-NEXT:    vmv2r.v v28, v22
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    vmv2r.v v26, v18
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 3
+; ZVBB-NEXT:    add a1, sp, a1
+; ZVBB-NEXT:    addi a1, a1, 16
+; ZVBB-NEXT:    csrr a3, vlenb
+; ZVBB-NEXT:    add a2, a0, a3
+; ZVBB-NEXT:    add a4, a1, a3
+; ZVBB-NEXT:    add a5, a2, a3
+; ZVBB-NEXT:    vmv1r.v v1, v8
+; ZVBB-NEXT:    vmv2r.v v24, v14
+; ZVBB-NEXT:    add a6, a4, a3
+; ZVBB-NEXT:    vmv2r.v v22, v10
+; ZVBB-NEXT:    vmv1r.v v2, v22
+; ZVBB-NEXT:    add a7, a5, a3
+; ZVBB-NEXT:    vmv1r.v v3, v12
+; ZVBB-NEXT:    add t0, a6, a3
+; ZVBB-NEXT:    vmv1r.v v4, v24
+; ZVBB-NEXT:    add t1, a7, a3
+; ZVBB-NEXT:    vmv1r.v v5, v16
+; ZVBB-NEXT:    add t2, t0, a3
+; ZVBB-NEXT:    vmv1r.v v6, v26
+; ZVBB-NEXT:    add t3, t1, a3
+; ZVBB-NEXT:    vmv1r.v v7, v20
+; ZVBB-NEXT:    add t4, t2, a3
+; ZVBB-NEXT:    vmv1r.v v8, v28
+; ZVBB-NEXT:    vmv1r.v v22, v9
+; ZVBB-NEXT:    add t5, t3, a3
+; ZVBB-NEXT:    vmv1r.v v24, v13
+; ZVBB-NEXT:    add t6, t4, a3
+; ZVBB-NEXT:    vmv1r.v v26, v17
+; ZVBB-NEXT:    vsseg8e32.v v1, (a0)
+; ZVBB-NEXT:    vmv1r.v v28, v21
+; ZVBB-NEXT:    vsseg8e32.v v22, (a1)
+; ZVBB-NEXT:    vl1re32.v v14, (t5)
+; ZVBB-NEXT:    add t5, t5, a3
+; ZVBB-NEXT:    add a3, t6, a3
+; ZVBB-NEXT:    vl1re32.v v22, (t6)
+; ZVBB-NEXT:    vl1re32.v v15, (t5)
+; ZVBB-NEXT:    vl1re32.v v23, (a3)
+; ZVBB-NEXT:    vl1re32.v v12, (t1)
+; ZVBB-NEXT:    vl1re32.v v20, (t2)
+; ZVBB-NEXT:    vl1re32.v v13, (t3)
+; ZVBB-NEXT:    vl1re32.v v21, (t4)
+; ZVBB-NEXT:    vl1re32.v v10, (a5)
+; ZVBB-NEXT:    vl1re32.v v18, (a6)
+; ZVBB-NEXT:    vl1re32.v v11, (a7)
+; ZVBB-NEXT:    vl1re32.v v19, (t0)
+; ZVBB-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-NEXT:    vl1re32.v v16, (a1)
+; ZVBB-NEXT:    vl1re32.v v9, (a2)
+; ZVBB-NEXT:    vl1re32.v v17, (a4)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 32 x i32> @llvm.vector.interleave8.nxv32i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, <vscale x 4 x i32> %d, <vscale x 4 x i32> %e, <vscale x 4 x i32> %f, <vscale x 4 x i32> %g, <vscale x 4 x i32> %h)
+  ret <vscale x 32 x i32> %res
+}
+
+define <vscale x 16 x i64> @vector_interleave_nxv16i64_nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g, <vscale x 2 x i64> %h) nounwind {
+;
+; CHECK-LABEL: vector_interleave_nxv16i64_nxv2i64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v22
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vmv2r.v v26, v18
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    add a2, a0, a3
+; CHECK-NEXT:    add a4, a1, a3
+; CHECK-NEXT:    add a5, a2, a3
+; CHECK-NEXT:    vmv1r.v v1, v8
+; CHECK-NEXT:    vmv2r.v v24, v14
+; CHECK-NEXT:    add a6, a4, a3
+; CHECK-NEXT:    vmv2r.v v22, v10
+; CHECK-NEXT:    vmv1r.v v2, v22
+; CHECK-NEXT:    add a7, a5, a3
+; CHECK-NEXT:    vmv1r.v v3, v12
+; CHECK-NEXT:    add t0, a6, a3
+; CHECK-NEXT:    vmv1r.v v4, v24
+; CHECK-NEXT:    add t1, a7, a3
+; CHECK-NEXT:    vmv1r.v v5, v16
+; CHECK-NEXT:    add t2, t0, a3
+; CHECK-NEXT:    vmv1r.v v6, v26
+; CHECK-NEXT:    add t3, t1, a3
+; CHECK-NEXT:    vmv1r.v v7, v20
+; CHECK-NEXT:    add t4, t2, a3
+; CHECK-NEXT:    vmv1r.v v8, v28
+; CHECK-NEXT:    vmv1r.v v22, v9
+; CHECK-NEXT:    add t5, t3, a3
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    add t6, t4, a3
+; CHECK-NEXT:    vmv1r.v v26, v17
+; CHECK-NEXT:    vsseg8e64.v v1, (a0)
+; CHECK-NEXT:    vmv1r.v v28, v21
+; CHECK-NEXT:    vsseg8e64.v v22, (a1)
+; CHECK-NEXT:    vl1re64.v v14, (t5)
+; CHECK-NEXT:    add t5, t5, a3
+; CHECK-NEXT:    add a3, t6, a3
+; CHECK-NEXT:    vl1re64.v v22, (t6)
+; CHECK-NEXT:    vl1re64.v v15, (t5)
+; CHECK-NEXT:    vl1re64.v v23, (a3)
+; CHECK-NEXT:    vl1re64.v v12, (t1)
+; CHECK-NEXT:    vl1re64.v v20, (t2)
+; CHECK-NEXT:    vl1re64.v v13, (t3)
+; CHECK-NEXT:    vl1re64.v v21, (t4)
+; CHECK-NEXT:    vl1re64.v v10, (a5)
+; CHECK-NEXT:    vl1re64.v v18, (a6)
+; CHECK-NEXT:    vl1re64.v v11, (a7)
+; CHECK-NEXT:    vl1re64.v v19, (t0)
+; CHECK-NEXT:    vl1re64.v v8, (a0)
+; CHECK-NEXT:    vl1re64.v v16, (a1)
+; CHECK-NEXT:    vl1re64.v v9, (a2)
+; CHECK-NEXT:    vl1re64.v v17, (a4)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16i64_nxv2i64:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-NEXT:    vmv2r.v v28, v22
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    vmv2r.v v26, v18
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 3
+; ZVBB-NEXT:    add a1, sp, a1
+; ZVBB-NEXT:    addi a1, a1, 16
+; ZVBB-NEXT:    csrr a3, vlenb
+; ZVBB-NEXT:    add a2, a0, a3
+; ZVBB-NEXT:    add a4, a1, a3
+; ZVBB-NEXT:    add a5, a2, a3
+; ZVBB-NEXT:    vmv1r.v v1, v8
+; ZVBB-NEXT:    vmv2r.v v24, v14
+; ZVBB-NEXT:    add a6, a4, a3
+; ZVBB-NEXT:    vmv2r.v v22, v10
+; ZVBB-NEXT:    vmv1r.v v2, v22
+; ZVBB-NEXT:    add a7, a5, a3
+; ZVBB-NEXT:    vmv1r.v v3, v12
+; ZVBB-NEXT:    add t0, a6, a3
+; ZVBB-NEXT:    vmv1r.v v4, v24
+; ZVBB-NEXT:    add t1, a7, a3
+; ZVBB-NEXT:    vmv1r.v v5, v16
+; ZVBB-NEXT:    add t2, t0, a3
+; ZVBB-NEXT:    vmv1r.v v6, v26
+; ZVBB-NEXT:    add t3, t1, a3
+; ZVBB-NEXT:    vmv1r.v v7, v20
+; ZVBB-NEXT:    add t4, t2, a3
+; ZVBB-NEXT:    vmv1r.v v8, v28
+; ZVBB-NEXT:    vmv1r.v v22, v9
+; ZVBB-NEXT:    add t5, t3, a3
+; ZVBB-NEXT:    vmv1r.v v24, v13
+; ZVBB-NEXT:    add t6, t4, a3
+; ZVBB-NEXT:    vmv1r.v v26, v17
+; ZVBB-NEXT:    vsseg8e64.v v1, (a0)
+; ZVBB-NEXT:    vmv1r.v v28, v21
+; ZVBB-NEXT:    vsseg8e64.v v22, (a1)
+; ZVBB-NEXT:    vl1re64.v v14, (t5)
+; ZVBB-NEXT:    add t5, t5, a3
+; ZVBB-NEXT:    add a3, t6, a3
+; ZVBB-NEXT:    vl1re64.v v22, (t6)
+; ZVBB-NEXT:    vl1re64.v v15, (t5)
+; ZVBB-NEXT:    vl1re64.v v23, (a3)
+; ZVBB-NEXT:    vl1re64.v v12, (t1)
+; ZVBB-NEXT:    vl1re64.v v20, (t2)
+; ZVBB-NEXT:    vl1re64.v v13, (t3)
+; ZVBB-NEXT:    vl1re64.v v21, (t4)
+; ZVBB-NEXT:    vl1re64.v v10, (a5)
+; ZVBB-NEXT:    vl1re64.v v18, (a6)
+; ZVBB-NEXT:    vl1re64.v v11, (a7)
+; ZVBB-NEXT:    vl1re64.v v19, (t0)
+; ZVBB-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-NEXT:    vl1re64.v v16, (a1)
+; ZVBB-NEXT:    vl1re64.v v9, (a2)
+; ZVBB-NEXT:    vl1re64.v v17, (a4)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 16 x i64> @llvm.vector.interleave8.nxv16i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c, <vscale x 2 x i64> %d, <vscale x 2 x i64> %e, <vscale x 2 x i64> %f, <vscale x 2 x i64> %g, <vscale x 2 x i64> %h)
+  ret <vscale x 16 x i64> %res
+}
+
+; Floats
+
+define <vscale x 4 x bfloat> @vector_interleave_nxv4bf16_nxv2bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b) {
+; V-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
+; V:       # %bb.0:
+; V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; V-NEXT:    vwaddu.vv v10, v8, v9
+; V-NEXT:    li a0, -1
+; V-NEXT:    csrr a1, vlenb
+; V-NEXT:    vwmaccu.vx v10, a0, v9
+; V-NEXT:    srli a1, a1, 2
+; V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; V-NEXT:    vslidedown.vx v8, v10, a1
+; V-NEXT:    add a0, a1, a1
+; V-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; V-NEXT:    vslideup.vx v10, v8, a1
+; V-NEXT:    vmv.v.v v8, v10
+; V-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vwsll.vi v10, v9, 16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
+; ZVBB-NEXT:    srli a0, a0, 2
+; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vslidedown.vx v8, v10, a0
+; ZVBB-NEXT:    add a1, a0, a0
+; ZVBB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v10, v8, a0
+; ZVBB-NEXT:    vmv.v.v v8, v10
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv4bf16_nxv2bf16:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; ZIP-NEXT:    ri.vzip2b.vv v11, v8, v9
+; ZIP-NEXT:    ri.vzip2a.vv v10, v8, v9
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    srli a0, a0, 2
+; ZIP-NEXT:    add a1, a0, a0
+; ZIP-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; ZIP-NEXT:    vslideup.vx v10, v11, a0
+; ZIP-NEXT:    vmv.v.v v8, v10
+; ZIP-NEXT:    ret
+  %res = call <vscale x 4 x bfloat> @llvm.vector.interleave2.nxv4bf16(<vscale x 2 x bfloat> %a, <vscale x 2 x bfloat> %b)
+  ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @vector_interleave_nxv8bf16_nxv4bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b) {
+; V-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
+; V:       # %bb.0:
+; V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; V-NEXT:    vmv1r.v v10, v9
+; V-NEXT:    vmv1r.v v11, v8
+; V-NEXT:    vwaddu.vv v8, v11, v10
+; V-NEXT:    li a0, -1
+; V-NEXT:    vwmaccu.vx v8, a0, v10
+; V-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vmv1r.v v10, v9
+; ZVBB-NEXT:    vmv1r.v v11, v8
+; ZVBB-NEXT:    vwsll.vi v8, v10, 16
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v11
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv8bf16_nxv4bf16:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT:    vmv1r.v v10, v9
+; ZIP-NEXT:    vmv1r.v v11, v8
+; ZIP-NEXT:    ri.vzip2b.vv v9, v8, v10
+; ZIP-NEXT:    ri.vzip2a.vv v8, v11, v10
+; ZIP-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.vector.interleave2.nxv8bf16(<vscale x 4 x bfloat> %a, <vscale x 4 x bfloat> %b)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 4 x half> @vector_interleave_nxv4f16_nxv2f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b) {
+; V-LABEL: vector_interleave_nxv4f16_nxv2f16:
+; V:       # %bb.0:
+; V-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; V-NEXT:    vwaddu.vv v10, v8, v9
+; V-NEXT:    li a0, -1
+; V-NEXT:    csrr a1, vlenb
+; V-NEXT:    vwmaccu.vx v10, a0, v9
+; V-NEXT:    srli a1, a1, 2
+; V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; V-NEXT:    vslidedown.vx v8, v10, a1
+; V-NEXT:    add a0, a1, a1
+; V-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; V-NEXT:    vslideup.vx v10, v8, a1
+; V-NEXT:    vmv.v.v v8, v10
+; V-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave_nxv4f16_nxv2f16:
 ; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZVBB-NEXT:    vwsll.vi v10, v9, 16
+; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vwsll.vi v10, v9, 16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
+; ZVBB-NEXT:    srli a0, a0, 2
+; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vslidedown.vx v8, v10, a0
+; ZVBB-NEXT:    add a1, a0, a0
+; ZVBB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v10, v8, a0
+; ZVBB-NEXT:    vmv.v.v v8, v10
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv4f16_nxv2f16:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; ZIP-NEXT:    ri.vzip2b.vv v11, v8, v9
+; ZIP-NEXT:    ri.vzip2a.vv v10, v8, v9
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    srli a0, a0, 2
+; ZIP-NEXT:    add a1, a0, a0
+; ZIP-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
+; ZIP-NEXT:    vslideup.vx v10, v11, a0
+; ZIP-NEXT:    vmv.v.v v8, v10
+; ZIP-NEXT:    ret
+  %res = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
+  ret <vscale x 4 x half> %res
+}
+
+define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
+; V-LABEL: vector_interleave_nxv8f16_nxv4f16:
+; V:       # %bb.0:
+; V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; V-NEXT:    vmv1r.v v10, v9
+; V-NEXT:    vmv1r.v v11, v8
+; V-NEXT:    vwaddu.vv v8, v11, v10
+; V-NEXT:    li a0, -1
+; V-NEXT:    vwmaccu.vx v8, a0, v10
+; V-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8f16_nxv4f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vmv1r.v v10, v9
+; ZVBB-NEXT:    vmv1r.v v11, v8
+; ZVBB-NEXT:    vwsll.vi v8, v10, 16
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v11
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv8f16_nxv4f16:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT:    vmv1r.v v10, v9
+; ZIP-NEXT:    vmv1r.v v11, v8
+; ZIP-NEXT:    ri.vzip2b.vv v9, v8, v10
+; ZIP-NEXT:    ri.vzip2a.vv v8, v11, v10
+; ZIP-NEXT:    ret
+  %res = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
+  ret <vscale x 8 x half> %res
+}
+
+define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
+; V-LABEL: vector_interleave_nxv4f32_nxv2f32:
+; V:       # %bb.0:
+; V-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; V-NEXT:    vmv1r.v v10, v9
+; V-NEXT:    vmv1r.v v11, v8
+; V-NEXT:    vwaddu.vv v8, v11, v10
+; V-NEXT:    li a0, -1
+; V-NEXT:    vwmaccu.vx v8, a0, v10
+; V-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv4f32_nxv2f32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-NEXT:    vmv1r.v v10, v9
+; ZVBB-NEXT:    vmv1r.v v11, v8
+; ZVBB-NEXT:    li a0, 32
+; ZVBB-NEXT:    vwsll.vx v8, v10, a0
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v11
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv4f32_nxv2f32:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZIP-NEXT:    vmv1r.v v10, v9
+; ZIP-NEXT:    vmv1r.v v11, v8
+; ZIP-NEXT:    ri.vzip2b.vv v9, v8, v10
+; ZIP-NEXT:    ri.vzip2a.vv v8, v11, v10
+; ZIP-NEXT:    ret
+  %res = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
+  ret <vscale x 4 x float> %res
+}
+
+define <vscale x 16 x bfloat> @vector_interleave_nxv16bf16_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
+; V-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
+; V:       # %bb.0:
+; V-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; V-NEXT:    vmv2r.v v12, v10
+; V-NEXT:    vmv2r.v v14, v8
+; V-NEXT:    vwaddu.vv v8, v14, v12
+; V-NEXT:    li a0, -1
+; V-NEXT:    vwmaccu.vx v8, a0, v12
+; V-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; ZVBB-NEXT:    vmv2r.v v12, v10
+; ZVBB-NEXT:    vmv2r.v v14, v8
+; ZVBB-NEXT:    vwsll.vi v8, v12, 16
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v14
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; ZIP-NEXT:    vmv2r.v v12, v10
+; ZIP-NEXT:    vmv2r.v v14, v8
+; ZIP-NEXT:    ri.vzip2b.vv v10, v8, v12
+; ZIP-NEXT:    ri.vzip2a.vv v8, v14, v12
+; ZIP-NEXT:    ret
+  %res = call <vscale x 16 x bfloat> @llvm.vector.interleave2.nxv16bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
+  ret <vscale x 16 x bfloat> %res
+}
+
+define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; V-LABEL: vector_interleave_nxv16f16_nxv8f16:
+; V:       # %bb.0:
+; V-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; V-NEXT:    vmv2r.v v12, v10
+; V-NEXT:    vmv2r.v v14, v8
+; V-NEXT:    vwaddu.vv v8, v14, v12
+; V-NEXT:    li a0, -1
+; V-NEXT:    vwmaccu.vx v8, a0, v12
+; V-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16f16_nxv8f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; ZVBB-NEXT:    vmv2r.v v12, v10
+; ZVBB-NEXT:    vmv2r.v v14, v8
+; ZVBB-NEXT:    vwsll.vi v8, v12, 16
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v14
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv16f16_nxv8f16:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
+; ZIP-NEXT:    vmv2r.v v12, v10
+; ZIP-NEXT:    vmv2r.v v14, v8
+; ZIP-NEXT:    ri.vzip2b.vv v10, v8, v12
+; ZIP-NEXT:    ri.vzip2a.vv v8, v14, v12
+; ZIP-NEXT:    ret
+  %res = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
+  ret <vscale x 16 x half> %res
+}
+
+define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; V-LABEL: vector_interleave_nxv8f32_nxv4f32:
+; V:       # %bb.0:
+; V-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; V-NEXT:    vmv2r.v v12, v10
+; V-NEXT:    vmv2r.v v14, v8
+; V-NEXT:    vwaddu.vv v8, v14, v12
+; V-NEXT:    li a0, -1
+; V-NEXT:    vwmaccu.vx v8, a0, v12
+; V-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8f32_nxv4f32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; ZVBB-NEXT:    vmv2r.v v12, v10
+; ZVBB-NEXT:    vmv2r.v v14, v8
+; ZVBB-NEXT:    li a0, 32
+; ZVBB-NEXT:    vwsll.vx v8, v12, a0
+; ZVBB-NEXT:    vwaddu.wv v8, v8, v14
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv8f32_nxv4f32:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
+; ZIP-NEXT:    vmv2r.v v12, v10
+; ZIP-NEXT:    vmv2r.v v14, v8
+; ZIP-NEXT:    ri.vzip2b.vv v10, v8, v12
+; ZIP-NEXT:    ri.vzip2a.vv v8, v14, v12
+; ZIP-NEXT:    ret
+  %res = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
+  ret <vscale x 8 x float> %res
+}
+
+define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; V-LABEL: vector_interleave_nxv4f64_nxv2f64:
+; V:       # %bb.0:
+; V-NEXT:    csrr a0, vlenb
+; V-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
+; V-NEXT:    vid.v v12
+; V-NEXT:    srli a0, a0, 2
+; V-NEXT:    vand.vi v13, v12, 1
+; V-NEXT:    vmsne.vi v0, v13, 0
+; V-NEXT:    vsrl.vi v16, v12, 1
+; V-NEXT:    vadd.vx v16, v16, a0, v0.t
+; V-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; V-NEXT:    vrgatherei16.vv v12, v8, v16
+; V-NEXT:    vmv.v.v v8, v12
+; V-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv4f64_nxv2f64:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
+; ZVBB-NEXT:    vid.v v12
+; ZVBB-NEXT:    srli a0, a0, 2
+; ZVBB-NEXT:    vand.vi v13, v12, 1
+; ZVBB-NEXT:    vmsne.vi v0, v13, 0
+; ZVBB-NEXT:    vsrl.vi v16, v12, 1
+; ZVBB-NEXT:    vadd.vx v16, v16, a0, v0.t
+; ZVBB-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
+; ZVBB-NEXT:    vrgatherei16.vv v12, v8, v16
+; ZVBB-NEXT:    vmv.v.v v8, v12
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv4f64_nxv2f64:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
+; ZIP-NEXT:    vmv2r.v v12, v10
+; ZIP-NEXT:    vmv2r.v v14, v8
+; ZIP-NEXT:    ri.vzip2b.vv v10, v8, v12
+; ZIP-NEXT:    ri.vzip2a.vv v8, v14, v12
+; ZIP-NEXT:    ret
+  %res = call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
+  ret <vscale x 4 x double> %res
+}
+
+
+
+define <vscale x 64 x bfloat> @vector_interleave_nxv64bf16_nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) {
+; V-LABEL: vector_interleave_nxv64bf16_nxv32bf16:
+; V:       # %bb.0:
+; V-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; V-NEXT:    vmv8r.v v24, v8
+; V-NEXT:    vwaddu.vv v8, v24, v16
+; V-NEXT:    li a0, -1
+; V-NEXT:    vwaddu.vv v0, v28, v20
+; V-NEXT:    vwmaccu.vx v8, a0, v16
+; V-NEXT:    vwmaccu.vx v0, a0, v20
+; V-NEXT:    vmv8r.v v16, v0
+; V-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64bf16_nxv32bf16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVBB-NEXT:    vwsll.vi v24, v16, 16
+; ZVBB-NEXT:    vwsll.vi v0, v20, 16
+; ZVBB-NEXT:    vwaddu.wv v24, v24, v8
+; ZVBB-NEXT:    vwaddu.wv v0, v0, v12
+; ZVBB-NEXT:    vmv8r.v v8, v24
+; ZVBB-NEXT:    vmv8r.v v16, v0
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv64bf16_nxv32bf16:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZIP-NEXT:    ri.vzip2b.vv v28, v8, v16
+; ZIP-NEXT:    ri.vzip2b.vv v4, v12, v20
+; ZIP-NEXT:    ri.vzip2a.vv v24, v8, v16
+; ZIP-NEXT:    ri.vzip2a.vv v0, v12, v20
+; ZIP-NEXT:    vmv8r.v v8, v24
+; ZIP-NEXT:    vmv8r.v v16, v0
+; ZIP-NEXT:    ret
+  %res = call <vscale x 64 x bfloat> @llvm.vector.interleave2.nxv64bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
+  ret <vscale x 64 x bfloat> %res
+}
+
+define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b) {
+; V-LABEL: vector_interleave_nxv64f16_nxv32f16:
+; V:       # %bb.0:
+; V-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; V-NEXT:    vmv8r.v v24, v8
+; V-NEXT:    vwaddu.vv v8, v24, v16
+; V-NEXT:    li a0, -1
+; V-NEXT:    vwaddu.vv v0, v28, v20
+; V-NEXT:    vwmaccu.vx v8, a0, v16
+; V-NEXT:    vwmaccu.vx v0, a0, v20
+; V-NEXT:    vmv8r.v v16, v0
+; V-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64f16_nxv32f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZVBB-NEXT:    vwsll.vi v24, v16, 16
+; ZVBB-NEXT:    vwsll.vi v0, v20, 16
+; ZVBB-NEXT:    vwaddu.wv v24, v24, v8
+; ZVBB-NEXT:    vwaddu.wv v0, v0, v12
+; ZVBB-NEXT:    vmv8r.v v8, v24
+; ZVBB-NEXT:    vmv8r.v v16, v0
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv64f16_nxv32f16:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
+; ZIP-NEXT:    ri.vzip2b.vv v28, v8, v16
+; ZIP-NEXT:    ri.vzip2b.vv v4, v12, v20
+; ZIP-NEXT:    ri.vzip2a.vv v24, v8, v16
+; ZIP-NEXT:    ri.vzip2a.vv v0, v12, v20
+; ZIP-NEXT:    vmv8r.v v8, v24
+; ZIP-NEXT:    vmv8r.v v16, v0
+; ZIP-NEXT:    ret
+  %res = call <vscale x 64 x half> @llvm.vector.interleave2.nxv64f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
+  ret <vscale x 64 x half> %res
+}
+
+define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b) {
+; V-LABEL: vector_interleave_nxv32f32_nxv16f32:
+; V:       # %bb.0:
+; V-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; V-NEXT:    vmv8r.v v24, v8
+; V-NEXT:    vwaddu.vv v8, v24, v16
+; V-NEXT:    li a0, -1
+; V-NEXT:    vwaddu.vv v0, v28, v20
+; V-NEXT:    vwmaccu.vx v8, a0, v16
+; V-NEXT:    vwmaccu.vx v0, a0, v20
+; V-NEXT:    vmv8r.v v16, v0
+; V-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv32f32_nxv16f32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    li a0, 32
+; ZVBB-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
+; ZVBB-NEXT:    vwsll.vx v24, v16, a0
+; ZVBB-NEXT:    vwsll.vx v0, v20, a0
+; ZVBB-NEXT:    vwaddu.wv v24, v24, v8
+; ZVBB-NEXT:    vwaddu.wv v0, v0, v12
+; ZVBB-NEXT:    vmv8r.v v8, v24
+; ZVBB-NEXT:    vmv8r.v v16, v0
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv32f32_nxv16f32:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
+; ZIP-NEXT:    ri.vzip2b.vv v28, v8, v16
+; ZIP-NEXT:    ri.vzip2b.vv v4, v12, v20
+; ZIP-NEXT:    ri.vzip2a.vv v24, v8, v16
+; ZIP-NEXT:    ri.vzip2a.vv v0, v12, v20
+; ZIP-NEXT:    vmv8r.v v8, v24
+; ZIP-NEXT:    vmv8r.v v16, v0
+; ZIP-NEXT:    ret
+  %res = call <vscale x 32 x float> @llvm.vector.interleave2.nxv32f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b)
+  ret <vscale x 32 x float> %res
+}
+
+define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b) {
+; V-LABEL: vector_interleave_nxv16f64_nxv8f64:
+; V:       # %bb.0:
+; V-NEXT:    csrr a0, vlenb
+; V-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
+; V-NEXT:    vid.v v6
+; V-NEXT:    vmv8r.v v24, v8
+; V-NEXT:    srli a0, a0, 1
+; V-NEXT:    vmv4r.v v28, v16
+; V-NEXT:    vmv4r.v v16, v12
+; V-NEXT:    vand.vi v8, v6, 1
+; V-NEXT:    vmsne.vi v0, v8, 0
+; V-NEXT:    vsrl.vi v6, v6, 1
+; V-NEXT:    vadd.vx v6, v6, a0, v0.t
+; V-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; V-NEXT:    vrgatherei16.vv v8, v24, v6
+; V-NEXT:    vrgatherei16.vv v24, v16, v6
+; V-NEXT:    vmv.v.v v16, v24
+; V-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16f64_nxv8f64:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
+; ZVBB-NEXT:    vid.v v6
+; ZVBB-NEXT:    vmv8r.v v24, v8
+; ZVBB-NEXT:    srli a0, a0, 1
+; ZVBB-NEXT:    vmv4r.v v28, v16
+; ZVBB-NEXT:    vmv4r.v v16, v12
+; ZVBB-NEXT:    vand.vi v8, v6, 1
+; ZVBB-NEXT:    vmsne.vi v0, v8, 0
+; ZVBB-NEXT:    vsrl.vi v6, v6, 1
+; ZVBB-NEXT:    vadd.vx v6, v6, a0, v0.t
+; ZVBB-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
+; ZVBB-NEXT:    vrgatherei16.vv v8, v24, v6
+; ZVBB-NEXT:    vrgatherei16.vv v24, v16, v6
+; ZVBB-NEXT:    vmv.v.v v16, v24
+; ZVBB-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv16f64_nxv8f64:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
+; ZIP-NEXT:    ri.vzip2b.vv v28, v8, v16
+; ZIP-NEXT:    ri.vzip2b.vv v4, v12, v20
+; ZIP-NEXT:    ri.vzip2a.vv v24, v8, v16
+; ZIP-NEXT:    ri.vzip2a.vv v0, v12, v20
+; ZIP-NEXT:    vmv8r.v v8, v24
+; ZIP-NEXT:    vmv8r.v v16, v0
+; ZIP-NEXT:    ret
+  %res = call <vscale x 16 x double> @llvm.vector.interleave2.nxv16f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b)
+  ret <vscale x 16 x double> %res
+}
+
+define <vscale x 6 x half> @vector_interleave_nxv6f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv6f16_nxv2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsseg3e16.v v8, (a0)
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    vle16.v v9, (a3)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a0, a1, a1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v9, a1
+; CHECK-NEXT:    add a2, a3, a2
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v9, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv6f16_nxv2f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a2, a1, 1
+; ZVBB-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsseg3e16.v v8, (a0)
+; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    vle16.v v9, (a3)
+; ZVBB-NEXT:    vle16.v v8, (a0)
+; ZVBB-NEXT:    srli a1, a1, 2
+; ZVBB-NEXT:    add a0, a1, a1
+; ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v9, a1
+; ZVBB-NEXT:    add a2, a3, a2
+; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v9, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 6 x half> @llvm.vector.interleave3.nxv6f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2)
+  ret <vscale x 6 x half> %res
+}
+
+define <vscale x 12 x half> @vector_interleave_nxv12f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv12f16_nxv4f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsseg3e16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v8, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl1re16.v v9, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl1re16.v v10, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv12f16_nxv4f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vsseg3e16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl1re16.v v9, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl1re16.v v10, (a0)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 12 x half> @llvm.vector.interleave3.nxv12f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2)
+  ret <vscale x 12 x half> %res
+}
+
+define <vscale x 24 x half> @vector_interleave_nxv24f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv24f16_nxv8f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsseg3e16.v v8, (a0)
+; CHECK-NEXT:    vl2re16.v v8, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl2re16.v v10, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl2re16.v v12, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv24f16_nxv8f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 1
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVBB-NEXT:    vsseg3e16.v v8, (a0)
+; ZVBB-NEXT:    vl2re16.v v8, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl2re16.v v10, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl2re16.v v12, (a0)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 24 x half> @llvm.vector.interleave3.nxv24f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2)
+  ret <vscale x 24 x half> %res
+}
+
+define <vscale x 6 x bfloat> @vector_interleave_nxv6bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv6bf16_nxv2bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsseg3e16.v v8, (a0)
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    vle16.v v9, (a3)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a0, a1, a1
+; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v9, a1
+; CHECK-NEXT:    add a2, a3, a2
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v9, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv6bf16_nxv2bf16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a2, a1, 1
+; ZVBB-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsseg3e16.v v8, (a0)
+; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    vle16.v v9, (a3)
+; ZVBB-NEXT:    vle16.v v8, (a0)
+; ZVBB-NEXT:    srli a1, a1, 2
+; ZVBB-NEXT:    add a0, a1, a1
+; ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v9, a1
+; ZVBB-NEXT:    add a2, a3, a2
+; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v9, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 6 x bfloat> @llvm.vector.interleave3.nxv6bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2)
+  ret <vscale x 6 x bfloat> %res
+}
+
+define <vscale x 12 x bfloat> @vector_interleave_nxv12bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv12bf16_nxv4bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsseg3e16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v8, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl1re16.v v9, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl1re16.v v10, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv12bf16_nxv4bf16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vsseg3e16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl1re16.v v9, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl1re16.v v10, (a0)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 12 x bfloat> @llvm.vector.interleave3.nxv12bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2)
+  ret <vscale x 12 x bfloat> %res
+}
+
+define <vscale x 24 x bfloat> @vector_interleave_nxv24bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv24bf16_nxv8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsseg3e16.v v8, (a0)
+; CHECK-NEXT:    vl2re16.v v8, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl2re16.v v10, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl2re16.v v12, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv24bf16_nxv8bf16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 1
+; ZVBB-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
+; ZVBB-NEXT:    vsseg3e16.v v8, (a0)
+; ZVBB-NEXT:    vl2re16.v v8, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl2re16.v v10, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl2re16.v v12, (a0)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 24 x bfloat> @llvm.vector.interleave3.nxv24bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2)
+  ret <vscale x 24 x bfloat> %res
+}
+
+define <vscale x 3 x float> @vector_interleave_nxv3f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv3f32_nxv1f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsseg3e32.v v8, (a0)
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    vle32.v v9, (a3)
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    srli a1, a1, 3
+; CHECK-NEXT:    add a0, a1, a1
+; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v9, a1
+; CHECK-NEXT:    add a2, a3, a2
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v9, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv3f32_nxv1f32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a2, a1, 1
+; ZVBB-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vsseg3e32.v v8, (a0)
+; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    vle32.v v9, (a3)
+; ZVBB-NEXT:    vle32.v v8, (a0)
+; ZVBB-NEXT:    srli a1, a1, 3
+; ZVBB-NEXT:    add a0, a1, a1
+; ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v9, a1
+; ZVBB-NEXT:    add a2, a3, a2
+; ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vle32.v v9, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 3 x float> @llvm.vector.interleave3.nxv3f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2)
+  ret <vscale x 3 x float> %res
+}
+
+define <vscale x 6 x float> @vector_interleave_nxv6f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv6f32_nxv2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsseg3e32.v v8, (a0)
+; CHECK-NEXT:    vl1re32.v v8, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl1re32.v v9, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl1re32.v v10, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv6f32_nxv2f32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
+; ZVBB-NEXT:    vsseg3e32.v v8, (a0)
+; ZVBB-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl1re32.v v9, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl1re32.v v10, (a0)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 6 x float> @llvm.vector.interleave3.nxv6f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2)
+  ret <vscale x 6 x float> %res
+}
+
+define <vscale x 12 x float> @vector_interleave_nxv12f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv12f32_nxv4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vsseg3e32.v v8, (a0)
+; CHECK-NEXT:    vl2re32.v v8, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl2re32.v v10, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl2re32.v v12, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv12f32_nxv4f32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 1
+; ZVBB-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
+; ZVBB-NEXT:    vsseg3e32.v v8, (a0)
+; ZVBB-NEXT:    vl2re32.v v8, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl2re32.v v10, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl2re32.v v12, (a0)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 12 x float> @llvm.vector.interleave3.nxv12f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2)
+  ret <vscale x 12 x float> %res
+}
+
+define <vscale x 3 x double> @vector_interleave_nxv3f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv3f64_nxv1f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsseg3e64.v v8, (a0)
+; CHECK-NEXT:    vl1re64.v v8, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl1re64.v v9, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl1re64.v v10, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv3f64_nxv1f64:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
+; ZVBB-NEXT:    vsseg3e64.v v8, (a0)
+; ZVBB-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl1re64.v v9, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl1re64.v v10, (a0)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 3 x double> @llvm.vector.interleave3.nxv3f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2)
+  ret <vscale x 3 x double> %res
+}
+
+define <vscale x 6 x double> @vector_interleave_nxv6f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2) nounwind {
+; CHECK-LABEL: vector_interleave_nxv6f64_nxv2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vsseg3e64.v v8, (a0)
+; CHECK-NEXT:    vl2re64.v v8, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl2re64.v v10, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl2re64.v v12, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv6f64_nxv2f64:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 1
+; ZVBB-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
+; ZVBB-NEXT:    vsseg3e64.v v8, (a0)
+; ZVBB-NEXT:    vl2re64.v v8, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl2re64.v v10, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl2re64.v v12, (a0)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 6 x double> @llvm.vector.interleave3.nxv6f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2)
+  ret <vscale x 6 x double> %res
+}
+
+define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv8f16_nxv2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsseg4e16.v v8, (a0)
+; CHECK-NEXT:    add a4, a3, a2
+; CHECK-NEXT:    add a2, a4, a2
+; CHECK-NEXT:    vle16.v v9, (a4)
+; CHECK-NEXT:    vle16.v v8, (a2)
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a2, a1, a1
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v8, a1
+; CHECK-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v10, (a3)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v10, a1
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8f16_nxv2f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a2, a1, 1
+; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsseg4e16.v v8, (a0)
+; ZVBB-NEXT:    add a4, a3, a2
+; ZVBB-NEXT:    add a2, a4, a2
+; ZVBB-NEXT:    vle16.v v9, (a4)
+; ZVBB-NEXT:    vle16.v v8, (a2)
+; ZVBB-NEXT:    srli a1, a1, 2
+; ZVBB-NEXT:    add a2, a1, a1
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v9, v8, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v10, (a3)
+; ZVBB-NEXT:    vle16.v v8, (a0)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v10, a1
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 8 x half> @llvm.vector.interleave4.nxv8f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3)
+  ret <vscale x 8 x half> %res
+}
+
+define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv16f16_nxv4f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsseg4e16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v10, (a3)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl1re16.v v11, (a1)
+; CHECK-NEXT:    vl1re16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v9, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16f16_nxv4f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vsseg4e16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v10, (a3)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl1re16.v v11, (a1)
+; ZVBB-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v9, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 16 x half> @llvm.vector.interleave4.nxv16f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3)
+  ret <vscale x 16 x half> %res
+}
+
+define <vscale x 32 x half> @vector_interleave_nxv32f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv32f16_nxv8f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsseg4e16.v v8, (a0)
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vl2re16.v v12, (a3)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl2re16.v v14, (a1)
+; CHECK-NEXT:    vl2re16.v v8, (a0)
+; CHECK-NEXT:    vl2re16.v v10, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv32f16_nxv8f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 1
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
+; ZVBB-NEXT:    vsseg4e16.v v8, (a0)
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vl2re16.v v12, (a3)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl2re16.v v14, (a1)
+; ZVBB-NEXT:    vl2re16.v v8, (a0)
+; ZVBB-NEXT:    vl2re16.v v10, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 32 x half> @llvm.vector.interleave4.nxv32f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3)
+  ret <vscale x 32 x half> %res
+}
+
+define <vscale x 8 x bfloat> @vector_interleave_nxv8bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv8bf16_nxv2bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsseg4e16.v v8, (a0)
+; CHECK-NEXT:    add a4, a3, a2
+; CHECK-NEXT:    add a2, a4, a2
+; CHECK-NEXT:    vle16.v v9, (a4)
+; CHECK-NEXT:    vle16.v v8, (a2)
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a2, a1, a1
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v8, a1
+; CHECK-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v10, (a3)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v10, a1
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv8bf16_nxv2bf16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a2, a1, 1
+; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsseg4e16.v v8, (a0)
+; ZVBB-NEXT:    add a4, a3, a2
+; ZVBB-NEXT:    add a2, a4, a2
+; ZVBB-NEXT:    vle16.v v9, (a4)
+; ZVBB-NEXT:    vle16.v v8, (a2)
+; ZVBB-NEXT:    srli a1, a1, 2
+; ZVBB-NEXT:    add a2, a1, a1
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v9, v8, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v10, (a3)
+; ZVBB-NEXT:    vle16.v v8, (a0)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v10, a1
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.vector.interleave4.nxv8bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 16 x bfloat> @vector_interleave_nxv16bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv16bf16_nxv4bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsseg4e16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v10, (a3)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl1re16.v v11, (a1)
+; CHECK-NEXT:    vl1re16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v9, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16bf16_nxv4bf16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    vwaddu.wv v10, v10, v8
-; ZVBB-NEXT:    srli a0, a0, 2
-; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, ma
-; ZVBB-NEXT:    vslidedown.vx v8, v10, a0
-; ZVBB-NEXT:    add a1, a0, a0
-; ZVBB-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v10, v8, a0
-; ZVBB-NEXT:    vmv.v.v v8, v10
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vsseg4e16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v10, (a3)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl1re16.v v11, (a1)
+; ZVBB-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v9, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
+  %res = call <vscale x 16 x bfloat> @llvm.vector.interleave4.nxv16bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3)
+  ret <vscale x 16 x bfloat> %res
+}
+
+define <vscale x 32 x bfloat> @vector_interleave_nxv32bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv32bf16_nxv8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
+; CHECK-NEXT:    vsseg4e16.v v8, (a0)
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vl2re16.v v12, (a3)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl2re16.v v14, (a1)
+; CHECK-NEXT:    vl2re16.v v8, (a0)
+; CHECK-NEXT:    vl2re16.v v10, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv4f16_nxv2f16:
-; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZIP-NEXT:    ri.vzip2b.vv v11, v8, v9
-; ZIP-NEXT:    ri.vzip2a.vv v10, v8, v9
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    srli a0, a0, 2
-; ZIP-NEXT:    add a1, a0, a0
-; ZIP-NEXT:    vsetvli zero, a1, e16, m1, ta, ma
-; ZIP-NEXT:    vslideup.vx v10, v11, a0
-; ZIP-NEXT:    vmv.v.v v8, v10
-; ZIP-NEXT:    ret
-  %res = call <vscale x 4 x half> @llvm.vector.interleave2.nxv4f16(<vscale x 2 x half> %a, <vscale x 2 x half> %b)
-  ret <vscale x 4 x half> %res
+; ZVBB-LABEL: vector_interleave_nxv32bf16_nxv8bf16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 1
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    vsetvli a3, zero, e16, m2, ta, ma
+; ZVBB-NEXT:    vsseg4e16.v v8, (a0)
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vl2re16.v v12, (a3)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl2re16.v v14, (a1)
+; ZVBB-NEXT:    vl2re16.v v8, (a0)
+; ZVBB-NEXT:    vl2re16.v v10, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 32 x bfloat> @llvm.vector.interleave4.nxv32bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3)
+  ret <vscale x 32 x bfloat> %res
 }
 
-define <vscale x 8 x half> @vector_interleave_nxv8f16_nxv4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
-; V-LABEL: vector_interleave_nxv8f16_nxv4f16:
-; V:       # %bb.0:
-; V-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; V-NEXT:    vmv1r.v v10, v9
-; V-NEXT:    vmv1r.v v11, v8
-; V-NEXT:    vwaddu.vv v8, v11, v10
-; V-NEXT:    li a0, -1
-; V-NEXT:    vwmaccu.vx v8, a0, v10
-; V-NEXT:    ret
+define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv4f32_nxv1f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    vsetvli a4, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsseg4e32.v v8, (a0)
+; CHECK-NEXT:    add a4, a3, a2
+; CHECK-NEXT:    add a2, a4, a2
+; CHECK-NEXT:    vle32.v v9, (a4)
+; CHECK-NEXT:    vle32.v v8, (a2)
+; CHECK-NEXT:    srli a1, a1, 3
+; CHECK-NEXT:    add a2, a1, a1
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v8, a1
+; CHECK-NEXT:    vsetvli a4, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v10, (a3)
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v10, a1
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv8f16_nxv4f16:
+; ZVBB-LABEL: vector_interleave_nxv4f32_nxv1f32:
 ; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-NEXT:    vmv1r.v v10, v9
-; ZVBB-NEXT:    vmv1r.v v11, v8
-; ZVBB-NEXT:    vwsll.vi v8, v10, 16
-; ZVBB-NEXT:    vwaddu.wv v8, v8, v11
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a2, a1, 1
+; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    vsetvli a4, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vsseg4e32.v v8, (a0)
+; ZVBB-NEXT:    add a4, a3, a2
+; ZVBB-NEXT:    add a2, a4, a2
+; ZVBB-NEXT:    vle32.v v9, (a4)
+; ZVBB-NEXT:    vle32.v v8, (a2)
+; ZVBB-NEXT:    srli a1, a1, 3
+; ZVBB-NEXT:    add a2, a1, a1
+; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v9, v8, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vle32.v v10, (a3)
+; ZVBB-NEXT:    vle32.v v8, (a0)
+; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v10, a1
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
+  %res = call <vscale x 4 x float> @llvm.vector.interleave4.nxv4f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3)
+  ret <vscale x 4 x float> %res
+}
+
+define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv8f32_nxv2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsseg4e32.v v8, (a0)
+; CHECK-NEXT:    vl1re32.v v10, (a3)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl1re32.v v11, (a1)
+; CHECK-NEXT:    vl1re32.v v8, (a0)
+; CHECK-NEXT:    vl1re32.v v9, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv8f16_nxv4f16:
-; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZIP-NEXT:    vmv1r.v v10, v9
-; ZIP-NEXT:    vmv1r.v v11, v8
-; ZIP-NEXT:    ri.vzip2b.vv v9, v8, v10
-; ZIP-NEXT:    ri.vzip2a.vv v8, v11, v10
-; ZIP-NEXT:    ret
-  %res = call <vscale x 8 x half> @llvm.vector.interleave2.nxv8f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b)
-  ret <vscale x 8 x half> %res
+; ZVBB-LABEL: vector_interleave_nxv8f32_nxv2f32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
+; ZVBB-NEXT:    vsseg4e32.v v8, (a0)
+; ZVBB-NEXT:    vl1re32.v v10, (a3)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl1re32.v v11, (a1)
+; ZVBB-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-NEXT:    vl1re32.v v9, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 8 x float> @llvm.vector.interleave4.nxv8f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3)
+  ret <vscale x 8 x float> %res
 }
 
-define <vscale x 4 x float> @vector_interleave_nxv4f32_nxv2f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b) {
-; V-LABEL: vector_interleave_nxv4f32_nxv2f32:
-; V:       # %bb.0:
-; V-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; V-NEXT:    vmv1r.v v10, v9
-; V-NEXT:    vmv1r.v v11, v8
-; V-NEXT:    vwaddu.vv v8, v11, v10
-; V-NEXT:    li a0, -1
-; V-NEXT:    vwmaccu.vx v8, a0, v10
-; V-NEXT:    ret
+define <vscale x 16 x float> @vector_interleave_nxv16f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv16f32_nxv4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    vsetvli a3, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vsseg4e32.v v8, (a0)
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vl2re32.v v12, (a3)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl2re32.v v14, (a1)
+; CHECK-NEXT:    vl2re32.v v8, (a0)
+; CHECK-NEXT:    vl2re32.v v10, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv4f32_nxv2f32:
+; ZVBB-LABEL: vector_interleave_nxv16f32_nxv4f32:
 ; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; ZVBB-NEXT:    vmv1r.v v10, v9
-; ZVBB-NEXT:    vmv1r.v v11, v8
-; ZVBB-NEXT:    li a0, 32
-; ZVBB-NEXT:    vwsll.vx v8, v10, a0
-; ZVBB-NEXT:    vwaddu.wv v8, v8, v11
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 1
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    vsetvli a3, zero, e32, m2, ta, ma
+; ZVBB-NEXT:    vsseg4e32.v v8, (a0)
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vl2re32.v v12, (a3)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl2re32.v v14, (a1)
+; ZVBB-NEXT:    vl2re32.v v8, (a0)
+; ZVBB-NEXT:    vl2re32.v v10, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-;
-; ZIP-LABEL: vector_interleave_nxv4f32_nxv2f32:
-; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; ZIP-NEXT:    vmv1r.v v10, v9
-; ZIP-NEXT:    vmv1r.v v11, v8
-; ZIP-NEXT:    ri.vzip2b.vv v9, v8, v10
-; ZIP-NEXT:    ri.vzip2a.vv v8, v11, v10
-; ZIP-NEXT:    ret
-  %res = call <vscale x 4 x float> @llvm.vector.interleave2.nxv4f32(<vscale x 2 x float> %a, <vscale x 2 x float> %b)
-  ret <vscale x 4 x float> %res
+  %res = call <vscale x 16 x float> @llvm.vector.interleave4.nxv16f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3)
+  ret <vscale x 16 x float> %res
 }
 
-define <vscale x 16 x bfloat> @vector_interleave_nxv16bf16_nxv8bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b) {
-; V-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
-; V:       # %bb.0:
-; V-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; V-NEXT:    vmv2r.v v12, v10
-; V-NEXT:    vmv2r.v v14, v8
-; V-NEXT:    vwaddu.vv v8, v14, v12
-; V-NEXT:    li a0, -1
-; V-NEXT:    vwmaccu.vx v8, a0, v12
-; V-NEXT:    ret
+define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv4f64_nxv1f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsseg4e64.v v8, (a0)
+; CHECK-NEXT:    vl1re64.v v10, (a3)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl1re64.v v11, (a1)
+; CHECK-NEXT:    vl1re64.v v8, (a0)
+; CHECK-NEXT:    vl1re64.v v9, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
+; ZVBB-LABEL: vector_interleave_nxv4f64_nxv1f64:
 ; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVBB-NEXT:    vmv2r.v v12, v10
-; ZVBB-NEXT:    vmv2r.v v14, v8
-; ZVBB-NEXT:    vwsll.vi v8, v12, 16
-; ZVBB-NEXT:    vwaddu.wv v8, v8, v14
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; ZVBB-NEXT:    vsseg4e64.v v8, (a0)
+; ZVBB-NEXT:    vl1re64.v v10, (a3)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl1re64.v v11, (a1)
+; ZVBB-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-NEXT:    vl1re64.v v9, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-;
-; ZIP-LABEL: vector_interleave_nxv16bf16_nxv8bf16:
-; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZIP-NEXT:    vmv2r.v v12, v10
-; ZIP-NEXT:    vmv2r.v v14, v8
-; ZIP-NEXT:    ri.vzip2b.vv v10, v8, v12
-; ZIP-NEXT:    ri.vzip2a.vv v8, v14, v12
-; ZIP-NEXT:    ret
-  %res = call <vscale x 16 x bfloat> @llvm.vector.interleave2.nxv16bf16(<vscale x 8 x bfloat> %a, <vscale x 8 x bfloat> %b)
-  ret <vscale x 16 x bfloat> %res
+  %res = call <vscale x 4 x double> @llvm.vector.interleave4.nxv4f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3)
+  ret <vscale x 4 x double> %res
 }
 
-define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
-; V-LABEL: vector_interleave_nxv16f16_nxv8f16:
-; V:       # %bb.0:
-; V-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; V-NEXT:    vmv2r.v v12, v10
-; V-NEXT:    vmv2r.v v14, v8
-; V-NEXT:    vwaddu.vv v8, v14, v12
-; V-NEXT:    li a0, -1
-; V-NEXT:    vwmaccu.vx v8, a0, v12
-; V-NEXT:    ret
+define <vscale x 8 x double> @vector_interleave_nxv8f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3) nounwind {
+; CHECK-LABEL: vector_interleave_nxv8f64_nxv2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 1
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
+; CHECK-NEXT:    vsseg4e64.v v8, (a0)
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vl2re64.v v12, (a3)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl2re64.v v14, (a1)
+; CHECK-NEXT:    vl2re64.v v8, (a0)
+; CHECK-NEXT:    vl2re64.v v10, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv16f16_nxv8f16:
+; ZVBB-LABEL: vector_interleave_nxv8f64_nxv2f64:
 ; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZVBB-NEXT:    vmv2r.v v12, v10
-; ZVBB-NEXT:    vmv2r.v v14, v8
-; ZVBB-NEXT:    vwsll.vi v8, v12, 16
-; ZVBB-NEXT:    vwaddu.wv v8, v8, v14
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 1
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    vsetvli a3, zero, e64, m2, ta, ma
+; ZVBB-NEXT:    vsseg4e64.v v8, (a0)
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vl2re64.v v12, (a3)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl2re64.v v14, (a1)
+; ZVBB-NEXT:    vl2re64.v v8, (a0)
+; ZVBB-NEXT:    vl2re64.v v10, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-;
-; ZIP-LABEL: vector_interleave_nxv16f16_nxv8f16:
-; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetvli a0, zero, e16, m2, ta, ma
-; ZIP-NEXT:    vmv2r.v v12, v10
-; ZIP-NEXT:    vmv2r.v v14, v8
-; ZIP-NEXT:    ri.vzip2b.vv v10, v8, v12
-; ZIP-NEXT:    ri.vzip2a.vv v8, v14, v12
-; ZIP-NEXT:    ret
-  %res = call <vscale x 16 x half> @llvm.vector.interleave2.nxv16f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b)
-  ret <vscale x 16 x half> %res
+  %res = call <vscale x 8 x double> @llvm.vector.interleave4.nxv8f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3)
+  ret <vscale x 8 x double> %res
 }
 
-define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
-; V-LABEL: vector_interleave_nxv8f32_nxv4f32:
-; V:       # %bb.0:
-; V-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; V-NEXT:    vmv2r.v v12, v10
-; V-NEXT:    vmv2r.v v14, v8
-; V-NEXT:    vwaddu.vv v8, v14, v12
-; V-NEXT:    li a0, -1
-; V-NEXT:    vwmaccu.vx v8, a0, v12
-; V-NEXT:    ret
+define <vscale x 10 x half> @vector_interleave_nxv10f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4) nounwind {
+; CHECK-LABEL: vector_interleave_nxv10f16_nxv2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    add a4, a3, a2
+; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsseg5e16.v v8, (a0)
+; CHECK-NEXT:    add a5, a4, a2
+; CHECK-NEXT:    vle16.v v8, (a5)
+; CHECK-NEXT:    vle16.v v9, (a4)
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a4, a1, a1
+; CHECK-NEXT:    vle16.v v10, (a3)
+; CHECK-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v8, a1
+; CHECK-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v10, a1
+; CHECK-NEXT:    add a2, a5, a2
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv8f32_nxv4f32:
+; ZVBB-LABEL: vector_interleave_nxv10f16_nxv2f16:
 ; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; ZVBB-NEXT:    vmv2r.v v12, v10
-; ZVBB-NEXT:    vmv2r.v v14, v8
-; ZVBB-NEXT:    li a0, 32
-; ZVBB-NEXT:    vwsll.vx v8, v12, a0
-; ZVBB-NEXT:    vwaddu.wv v8, v8, v14
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a2, a1, 1
+; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    add a4, a3, a2
+; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsseg5e16.v v8, (a0)
+; ZVBB-NEXT:    add a5, a4, a2
+; ZVBB-NEXT:    vle16.v v8, (a5)
+; ZVBB-NEXT:    vle16.v v9, (a4)
+; ZVBB-NEXT:    srli a1, a1, 2
+; ZVBB-NEXT:    add a4, a1, a1
+; ZVBB-NEXT:    vle16.v v10, (a3)
+; ZVBB-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v9, v8, a1
+; ZVBB-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v8, (a0)
+; ZVBB-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v10, a1
+; ZVBB-NEXT:    add a2, a5, a2
+; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v10, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
+  %res = call <vscale x 10 x half> @llvm.vector.interleave5.nxv10f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4)
+  ret <vscale x 10 x half> %res
+}
+
+define <vscale x 20 x half> @vector_interleave_nxv20f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4) nounwind {
+; CHECK-LABEL: vector_interleave_nxv20f16_nxv4f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 2
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsseg5e16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v10, (a3)
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    vl1re16.v v11, (a3)
+; CHECK-NEXT:    vl1re16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v9, (a2)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl1re16.v v12, (a1)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 2
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv8f32_nxv4f32:
-; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetvli a0, zero, e32, m2, ta, ma
-; ZIP-NEXT:    vmv2r.v v12, v10
-; ZIP-NEXT:    vmv2r.v v14, v8
-; ZIP-NEXT:    ri.vzip2b.vv v10, v8, v12
-; ZIP-NEXT:    ri.vzip2a.vv v8, v14, v12
-; ZIP-NEXT:    ret
-  %res = call <vscale x 8 x float> @llvm.vector.interleave2.nxv8f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b)
-  ret <vscale x 8 x float> %res
-}
-
-define <vscale x 4 x double> @vector_interleave_nxv4f64_nxv2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
-; V-LABEL: vector_interleave_nxv4f64_nxv2f64:
-; V:       # %bb.0:
-; V-NEXT:    csrr a0, vlenb
-; V-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
-; V-NEXT:    vid.v v12
-; V-NEXT:    srli a0, a0, 2
-; V-NEXT:    vand.vi v13, v12, 1
-; V-NEXT:    vmsne.vi v0, v13, 0
-; V-NEXT:    vsrl.vi v16, v12, 1
-; V-NEXT:    vadd.vx v16, v16, a0, v0.t
-; V-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; V-NEXT:    vrgatherei16.vv v12, v8, v16
-; V-NEXT:    vmv.v.v v8, v12
-; V-NEXT:    ret
-;
-; ZVBB-LABEL: vector_interleave_nxv4f64_nxv2f64:
+; ZVBB-LABEL: vector_interleave_nxv20f16_nxv4f16:
 ; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    vsetvli a1, zero, e16, m1, ta, mu
-; ZVBB-NEXT:    vid.v v12
-; ZVBB-NEXT:    srli a0, a0, 2
-; ZVBB-NEXT:    vand.vi v13, v12, 1
-; ZVBB-NEXT:    vmsne.vi v0, v13, 0
-; ZVBB-NEXT:    vsrl.vi v16, v12, 1
-; ZVBB-NEXT:    vadd.vx v16, v16, a0, v0.t
-; ZVBB-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
-; ZVBB-NEXT:    vrgatherei16.vv v12, v8, v16
-; ZVBB-NEXT:    vmv.v.v v8, v12
+; ZVBB-NEXT:    slli a1, a0, 2
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vsseg5e16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v10, (a3)
+; ZVBB-NEXT:    add a3, a3, a1
+; ZVBB-NEXT:    vl1re16.v v11, (a3)
+; ZVBB-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v9, (a2)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl1re16.v v12, (a1)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 2
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-;
-; ZIP-LABEL: vector_interleave_nxv4f64_nxv2f64:
-; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetvli a0, zero, e64, m2, ta, ma
-; ZIP-NEXT:    vmv2r.v v12, v10
-; ZIP-NEXT:    vmv2r.v v14, v8
-; ZIP-NEXT:    ri.vzip2b.vv v10, v8, v12
-; ZIP-NEXT:    ri.vzip2a.vv v8, v14, v12
-; ZIP-NEXT:    ret
-  %res = call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b)
-  ret <vscale x 4 x double> %res
+  %res = call <vscale x 20 x half> @llvm.vector.interleave5.nxv20f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4)
+  ret <vscale x 20 x half> %res
 }
 
-
-
-define <vscale x 64 x bfloat> @vector_interleave_nxv64bf16_nxv32bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b) {
-; V-LABEL: vector_interleave_nxv64bf16_nxv32bf16:
-; V:       # %bb.0:
-; V-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; V-NEXT:    vmv8r.v v24, v8
-; V-NEXT:    vwaddu.vv v8, v24, v16
-; V-NEXT:    li a0, -1
-; V-NEXT:    vwaddu.vv v0, v28, v20
-; V-NEXT:    vwmaccu.vx v8, a0, v16
-; V-NEXT:    vwmaccu.vx v0, a0, v20
-; V-NEXT:    vmv8r.v v16, v0
-; V-NEXT:    ret
+define <vscale x 40 x half> @vector_interleave_nxv40f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4) nounwind {
+; RV32-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -80
+; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi s0, sp, 80
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 28
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT:    vmv2r.v v20, v16
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vmv2r.v v18, v12
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a2, a1, 2
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    vmv2r.v v16, v8
+; RV32-NEXT:    vmv2r.v v22, v16
+; RV32-NEXT:    vmv2r.v v24, v18
+; RV32-NEXT:    vmv1r.v v26, v20
+; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    vmv1r.v v23, v10
+; RV32-NEXT:    add a4, a1, a2
+; RV32-NEXT:    add a5, a4, a2
+; RV32-NEXT:    vmv1r.v v25, v14
+; RV32-NEXT:    add a6, a5, a2
+; RV32-NEXT:    vmv1r.v v18, v11
+; RV32-NEXT:    vsseg5e16.v v22, (a0)
+; RV32-NEXT:    vmv1r.v v20, v15
+; RV32-NEXT:    vsseg5e16.v v17, (a1)
+; RV32-NEXT:    vl1re16.v v16, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v17, (a6)
+; RV32-NEXT:    add a6, a3, a2
+; RV32-NEXT:    vl1re16.v v10, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v11, (a6)
+; RV32-NEXT:    vl1re16.v v8, (a0)
+; RV32-NEXT:    vl1re16.v v9, (a3)
+; RV32-NEXT:    vl1re16.v v14, (a4)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a3, 10
+; RV32-NEXT:    mul a0, a0, a3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v15, (a5)
+; RV32-NEXT:    vl1re16.v v12, (a6)
+; RV32-NEXT:    vl1re16.v v13, (a1)
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    vs2r.v v16, (a2)
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    vl8re16.v v16, (a2)
+; RV32-NEXT:    vl8re16.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -80
+; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 80
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -80
+; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi s0, sp, 80
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 28
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    sub sp, sp, a0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT:    vmv2r.v v20, v16
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vmv2r.v v18, v12
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 2
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 64
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    vmv2r.v v16, v8
+; RV64-NEXT:    vmv2r.v v22, v16
+; RV64-NEXT:    vmv2r.v v24, v18
+; RV64-NEXT:    vmv1r.v v26, v20
+; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    vmv1r.v v23, v10
+; RV64-NEXT:    add a4, a1, a2
+; RV64-NEXT:    add a5, a4, a2
+; RV64-NEXT:    vmv1r.v v25, v14
+; RV64-NEXT:    add a6, a5, a2
+; RV64-NEXT:    vmv1r.v v18, v11
+; RV64-NEXT:    vsseg5e16.v v22, (a0)
+; RV64-NEXT:    vmv1r.v v20, v15
+; RV64-NEXT:    vsseg5e16.v v17, (a1)
+; RV64-NEXT:    vl1re16.v v16, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v17, (a6)
+; RV64-NEXT:    add a6, a3, a2
+; RV64-NEXT:    vl1re16.v v10, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v11, (a6)
+; RV64-NEXT:    vl1re16.v v8, (a0)
+; RV64-NEXT:    vl1re16.v v9, (a3)
+; RV64-NEXT:    vl1re16.v v14, (a4)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a3, 10
+; RV64-NEXT:    mul a0, a0, a3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v15, (a5)
+; RV64-NEXT:    vl1re16.v v12, (a6)
+; RV64-NEXT:    vl1re16.v v13, (a1)
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, a0, a2
+; RV64-NEXT:    vs2r.v v16, (a2)
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    vl8re16.v v16, (a2)
+; RV64-NEXT:    vl8re16.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -80
+; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 80
+; RV64-NEXT:    ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; ZVBB-RV32:       # %bb.0:
+; ZVBB-RV32-NEXT:    addi sp, sp, -80
+; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    addi s0, sp, 80
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    li a1, 28
+; ZVBB-RV32-NEXT:    mul a0, a0, a1
+; ZVBB-RV32-NEXT:    sub sp, sp, a0
+; ZVBB-RV32-NEXT:    andi sp, sp, -64
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV32-NEXT:    vmv2r.v v20, v16
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
+; ZVBB-RV32-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV32-NEXT:    csrr a1, vlenb
+; ZVBB-RV32-NEXT:    slli a2, a1, 2
+; ZVBB-RV32-NEXT:    add a1, a2, a1
+; ZVBB-RV32-NEXT:    add a1, sp, a1
+; ZVBB-RV32-NEXT:    addi a1, a1, 64
+; ZVBB-RV32-NEXT:    csrr a2, vlenb
+; ZVBB-RV32-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV32-NEXT:    vmv2r.v v22, v16
+; ZVBB-RV32-NEXT:    vmv2r.v v24, v18
+; ZVBB-RV32-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV32-NEXT:    add a3, a0, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v23, v10
+; ZVBB-RV32-NEXT:    add a4, a1, a2
+; ZVBB-RV32-NEXT:    add a5, a4, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV32-NEXT:    add a6, a5, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v18, v11
+; ZVBB-RV32-NEXT:    vsseg5e16.v v22, (a0)
+; ZVBB-RV32-NEXT:    vmv1r.v v20, v15
+; ZVBB-RV32-NEXT:    vsseg5e16.v v17, (a1)
+; ZVBB-RV32-NEXT:    vl1re16.v v16, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v17, (a6)
+; ZVBB-RV32-NEXT:    add a6, a3, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v10, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re16.v v9, (a3)
+; ZVBB-RV32-NEXT:    vl1re16.v v14, (a4)
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    li a3, 10
+; ZVBB-RV32-NEXT:    mul a0, a0, a3
+; ZVBB-RV32-NEXT:    add a0, sp, a0
+; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v15, (a5)
+; ZVBB-RV32-NEXT:    vl1re16.v v12, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v13, (a1)
+; ZVBB-RV32-NEXT:    slli a2, a2, 3
+; ZVBB-RV32-NEXT:    add a2, a0, a2
+; ZVBB-RV32-NEXT:    vs2r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8re16.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT:    addi sp, s0, -80
+; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    addi sp, sp, 80
+; ZVBB-RV32-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv64bf16_nxv32bf16:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVBB-NEXT:    vwsll.vi v24, v16, 16
-; ZVBB-NEXT:    vwsll.vi v0, v20, 16
-; ZVBB-NEXT:    vwaddu.wv v24, v24, v8
-; ZVBB-NEXT:    vwaddu.wv v0, v0, v12
-; ZVBB-NEXT:    vmv8r.v v8, v24
-; ZVBB-NEXT:    vmv8r.v v16, v0
-; ZVBB-NEXT:    ret
+; ZVBB-RV64-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; ZVBB-RV64:       # %bb.0:
+; ZVBB-RV64-NEXT:    addi sp, sp, -80
+; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    addi s0, sp, 80
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    li a1, 28
+; ZVBB-RV64-NEXT:    mul a0, a0, a1
+; ZVBB-RV64-NEXT:    sub sp, sp, a0
+; ZVBB-RV64-NEXT:    andi sp, sp, -64
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV64-NEXT:    vmv2r.v v20, v16
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
+; ZVBB-RV64-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV64-NEXT:    csrr a1, vlenb
+; ZVBB-RV64-NEXT:    slli a2, a1, 2
+; ZVBB-RV64-NEXT:    add a1, a2, a1
+; ZVBB-RV64-NEXT:    add a1, sp, a1
+; ZVBB-RV64-NEXT:    addi a1, a1, 64
+; ZVBB-RV64-NEXT:    csrr a2, vlenb
+; ZVBB-RV64-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV64-NEXT:    vmv2r.v v22, v16
+; ZVBB-RV64-NEXT:    vmv2r.v v24, v18
+; ZVBB-RV64-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV64-NEXT:    add a3, a0, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v23, v10
+; ZVBB-RV64-NEXT:    add a4, a1, a2
+; ZVBB-RV64-NEXT:    add a5, a4, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV64-NEXT:    add a6, a5, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v18, v11
+; ZVBB-RV64-NEXT:    vsseg5e16.v v22, (a0)
+; ZVBB-RV64-NEXT:    vmv1r.v v20, v15
+; ZVBB-RV64-NEXT:    vsseg5e16.v v17, (a1)
+; ZVBB-RV64-NEXT:    vl1re16.v v16, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v17, (a6)
+; ZVBB-RV64-NEXT:    add a6, a3, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v10, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re16.v v9, (a3)
+; ZVBB-RV64-NEXT:    vl1re16.v v14, (a4)
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    li a3, 10
+; ZVBB-RV64-NEXT:    mul a0, a0, a3
+; ZVBB-RV64-NEXT:    add a0, sp, a0
+; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v15, (a5)
+; ZVBB-RV64-NEXT:    vl1re16.v v12, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v13, (a1)
+; ZVBB-RV64-NEXT:    slli a2, a2, 3
+; ZVBB-RV64-NEXT:    add a2, a0, a2
+; ZVBB-RV64-NEXT:    vs2r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8re16.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT:    addi sp, s0, -80
+; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    addi sp, sp, 80
+; ZVBB-RV64-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv64bf16_nxv32bf16:
+; ZIP-LABEL: vector_interleave_nxv40f16_nxv8f16:
 ; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZIP-NEXT:    ri.vzip2b.vv v28, v8, v16
-; ZIP-NEXT:    ri.vzip2b.vv v4, v12, v20
-; ZIP-NEXT:    ri.vzip2a.vv v24, v8, v16
-; ZIP-NEXT:    ri.vzip2a.vv v0, v12, v20
-; ZIP-NEXT:    vmv8r.v v8, v24
-; ZIP-NEXT:    vmv8r.v v16, v0
+; ZIP-NEXT:    addi sp, sp, -80
+; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    addi s0, sp, 80
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    li a1, 28
+; ZIP-NEXT:    mul a0, a0, a1
+; ZIP-NEXT:    sub sp, sp, a0
+; ZIP-NEXT:    andi sp, sp, -64
+; ZIP-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT:    vmv2r.v v20, v16
+; ZIP-NEXT:    addi a0, sp, 64
+; ZIP-NEXT:    vmv2r.v v18, v12
+; ZIP-NEXT:    csrr a1, vlenb
+; ZIP-NEXT:    slli a2, a1, 2
+; ZIP-NEXT:    add a1, a2, a1
+; ZIP-NEXT:    add a1, sp, a1
+; ZIP-NEXT:    addi a1, a1, 64
+; ZIP-NEXT:    csrr a2, vlenb
+; ZIP-NEXT:    vmv2r.v v16, v8
+; ZIP-NEXT:    vmv2r.v v22, v16
+; ZIP-NEXT:    vmv2r.v v24, v18
+; ZIP-NEXT:    vmv1r.v v26, v20
+; ZIP-NEXT:    add a3, a0, a2
+; ZIP-NEXT:    vmv1r.v v23, v10
+; ZIP-NEXT:    add a4, a1, a2
+; ZIP-NEXT:    add a5, a4, a2
+; ZIP-NEXT:    vmv1r.v v25, v14
+; ZIP-NEXT:    add a6, a5, a2
+; ZIP-NEXT:    vmv1r.v v18, v11
+; ZIP-NEXT:    vsseg5e16.v v22, (a0)
+; ZIP-NEXT:    vmv1r.v v20, v15
+; ZIP-NEXT:    vsseg5e16.v v17, (a1)
+; ZIP-NEXT:    vl1re16.v v16, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v17, (a6)
+; ZIP-NEXT:    add a6, a3, a2
+; ZIP-NEXT:    vl1re16.v v10, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v11, (a6)
+; ZIP-NEXT:    vl1re16.v v8, (a0)
+; ZIP-NEXT:    vl1re16.v v9, (a3)
+; ZIP-NEXT:    vl1re16.v v14, (a4)
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    li a3, 10
+; ZIP-NEXT:    mul a0, a0, a3
+; ZIP-NEXT:    add a0, sp, a0
+; ZIP-NEXT:    addi a0, a0, 64
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v15, (a5)
+; ZIP-NEXT:    vl1re16.v v12, (a6)
+; ZIP-NEXT:    vl1re16.v v13, (a1)
+; ZIP-NEXT:    slli a2, a2, 3
+; ZIP-NEXT:    add a2, a0, a2
+; ZIP-NEXT:    vs2r.v v16, (a2)
+; ZIP-NEXT:    vs8r.v v8, (a0)
+; ZIP-NEXT:    vl8re16.v v16, (a2)
+; ZIP-NEXT:    vl8re16.v v8, (a0)
+; ZIP-NEXT:    addi sp, s0, -80
+; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    addi sp, sp, 80
 ; ZIP-NEXT:    ret
-  %res = call <vscale x 64 x bfloat> @llvm.vector.interleave2.nxv64bf16(<vscale x 32 x bfloat> %a, <vscale x 32 x bfloat> %b)
-  ret <vscale x 64 x bfloat> %res
+  %res = call <vscale x 40 x half> @llvm.vector.interleave5.nxv40f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4)
+  ret <vscale x 40 x half> %res
 }
 
-define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b) {
-; V-LABEL: vector_interleave_nxv64f16_nxv32f16:
-; V:       # %bb.0:
-; V-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; V-NEXT:    vmv8r.v v24, v8
-; V-NEXT:    vwaddu.vv v8, v24, v16
-; V-NEXT:    li a0, -1
-; V-NEXT:    vwaddu.vv v0, v28, v20
-; V-NEXT:    vwmaccu.vx v8, a0, v16
-; V-NEXT:    vwmaccu.vx v0, a0, v20
-; V-NEXT:    vmv8r.v v16, v0
-; V-NEXT:    ret
+define <vscale x 10 x bfloat> @vector_interleave_nxv10bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4) nounwind {
+; CHECK-LABEL: vector_interleave_nxv10bf16_nxv2bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    add a4, a3, a2
+; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsseg5e16.v v8, (a0)
+; CHECK-NEXT:    add a5, a4, a2
+; CHECK-NEXT:    vle16.v v8, (a5)
+; CHECK-NEXT:    vle16.v v9, (a4)
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a4, a1, a1
+; CHECK-NEXT:    vle16.v v10, (a3)
+; CHECK-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v8, a1
+; CHECK-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v10, a1
+; CHECK-NEXT:    add a2, a5, a2
+; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv64f16_nxv32f16:
+; ZVBB-LABEL: vector_interleave_nxv10bf16_nxv2bf16:
 ; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZVBB-NEXT:    vwsll.vi v24, v16, 16
-; ZVBB-NEXT:    vwsll.vi v0, v20, 16
-; ZVBB-NEXT:    vwaddu.wv v24, v24, v8
-; ZVBB-NEXT:    vwaddu.wv v0, v0, v12
-; ZVBB-NEXT:    vmv8r.v v8, v24
-; ZVBB-NEXT:    vmv8r.v v16, v0
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a2, a1, 1
+; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    add a4, a3, a2
+; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsseg5e16.v v8, (a0)
+; ZVBB-NEXT:    add a5, a4, a2
+; ZVBB-NEXT:    vle16.v v8, (a5)
+; ZVBB-NEXT:    vle16.v v9, (a4)
+; ZVBB-NEXT:    srli a1, a1, 2
+; ZVBB-NEXT:    add a4, a1, a1
+; ZVBB-NEXT:    vle16.v v10, (a3)
+; ZVBB-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v9, v8, a1
+; ZVBB-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v8, (a0)
+; ZVBB-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v10, a1
+; ZVBB-NEXT:    add a2, a5, a2
+; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v10, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-;
-; ZIP-LABEL: vector_interleave_nxv64f16_nxv32f16:
-; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetvli a0, zero, e16, m4, ta, ma
-; ZIP-NEXT:    ri.vzip2b.vv v28, v8, v16
-; ZIP-NEXT:    ri.vzip2b.vv v4, v12, v20
-; ZIP-NEXT:    ri.vzip2a.vv v24, v8, v16
-; ZIP-NEXT:    ri.vzip2a.vv v0, v12, v20
-; ZIP-NEXT:    vmv8r.v v8, v24
-; ZIP-NEXT:    vmv8r.v v16, v0
-; ZIP-NEXT:    ret
-  %res = call <vscale x 64 x half> @llvm.vector.interleave2.nxv64f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b)
-  ret <vscale x 64 x half> %res
+  %res = call <vscale x 10 x bfloat> @llvm.vector.interleave5.nxv10bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4)
+  ret <vscale x 10 x bfloat> %res
 }
 
-define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b) {
-; V-LABEL: vector_interleave_nxv32f32_nxv16f32:
-; V:       # %bb.0:
-; V-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; V-NEXT:    vmv8r.v v24, v8
-; V-NEXT:    vwaddu.vv v8, v24, v16
-; V-NEXT:    li a0, -1
-; V-NEXT:    vwaddu.vv v0, v28, v20
-; V-NEXT:    vwmaccu.vx v8, a0, v16
-; V-NEXT:    vwmaccu.vx v0, a0, v20
-; V-NEXT:    vmv8r.v v16, v0
-; V-NEXT:    ret
+define <vscale x 20 x bfloat> @vector_interleave_nxv20bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4) nounwind {
+; CHECK-LABEL: vector_interleave_nxv20bf16_nxv4bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 2
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsseg5e16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v10, (a3)
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    vl1re16.v v11, (a3)
+; CHECK-NEXT:    vl1re16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v9, (a2)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl1re16.v v12, (a1)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a1, a0, 2
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv32f32_nxv16f32:
+; ZVBB-LABEL: vector_interleave_nxv20bf16_nxv4bf16:
 ; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    li a0, 32
-; ZVBB-NEXT:    vsetvli a1, zero, e32, m4, ta, ma
-; ZVBB-NEXT:    vwsll.vx v24, v16, a0
-; ZVBB-NEXT:    vwsll.vx v0, v20, a0
-; ZVBB-NEXT:    vwaddu.wv v24, v24, v8
-; ZVBB-NEXT:    vwaddu.wv v0, v0, v12
-; ZVBB-NEXT:    vmv8r.v v8, v24
-; ZVBB-NEXT:    vmv8r.v v16, v0
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 2
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vsseg5e16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v10, (a3)
+; ZVBB-NEXT:    add a3, a3, a1
+; ZVBB-NEXT:    vl1re16.v v11, (a3)
+; ZVBB-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v9, (a2)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl1re16.v v12, (a1)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a1, a0, 2
+; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-;
-; ZIP-LABEL: vector_interleave_nxv32f32_nxv16f32:
-; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetvli a0, zero, e32, m4, ta, ma
-; ZIP-NEXT:    ri.vzip2b.vv v28, v8, v16
-; ZIP-NEXT:    ri.vzip2b.vv v4, v12, v20
-; ZIP-NEXT:    ri.vzip2a.vv v24, v8, v16
-; ZIP-NEXT:    ri.vzip2a.vv v0, v12, v20
-; ZIP-NEXT:    vmv8r.v v8, v24
-; ZIP-NEXT:    vmv8r.v v16, v0
-; ZIP-NEXT:    ret
-  %res = call <vscale x 32 x float> @llvm.vector.interleave2.nxv32f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b)
-  ret <vscale x 32 x float> %res
+  %res = call <vscale x 20 x bfloat> @llvm.vector.interleave5.nxv20bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4)
+  ret <vscale x 20 x bfloat> %res
 }
 
-define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b) {
-; V-LABEL: vector_interleave_nxv16f64_nxv8f64:
-; V:       # %bb.0:
-; V-NEXT:    csrr a0, vlenb
-; V-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
-; V-NEXT:    vid.v v6
-; V-NEXT:    vmv8r.v v24, v8
-; V-NEXT:    srli a0, a0, 1
-; V-NEXT:    vmv4r.v v28, v16
-; V-NEXT:    vmv4r.v v16, v12
-; V-NEXT:    vand.vi v8, v6, 1
-; V-NEXT:    vmsne.vi v0, v8, 0
-; V-NEXT:    vsrl.vi v6, v6, 1
-; V-NEXT:    vadd.vx v6, v6, a0, v0.t
-; V-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; V-NEXT:    vrgatherei16.vv v8, v24, v6
-; V-NEXT:    vrgatherei16.vv v24, v16, v6
-; V-NEXT:    vmv.v.v v16, v24
-; V-NEXT:    ret
+define <vscale x 40 x bfloat> @vector_interleave_nxv40bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4) nounwind {
+; RV32-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -80
+; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi s0, sp, 80
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 28
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT:    vmv2r.v v20, v16
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vmv2r.v v18, v12
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a2, a1, 2
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    vmv2r.v v16, v8
+; RV32-NEXT:    vmv2r.v v22, v16
+; RV32-NEXT:    vmv2r.v v24, v18
+; RV32-NEXT:    vmv1r.v v26, v20
+; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    vmv1r.v v23, v10
+; RV32-NEXT:    add a4, a1, a2
+; RV32-NEXT:    add a5, a4, a2
+; RV32-NEXT:    vmv1r.v v25, v14
+; RV32-NEXT:    add a6, a5, a2
+; RV32-NEXT:    vmv1r.v v18, v11
+; RV32-NEXT:    vsseg5e16.v v22, (a0)
+; RV32-NEXT:    vmv1r.v v20, v15
+; RV32-NEXT:    vsseg5e16.v v17, (a1)
+; RV32-NEXT:    vl1re16.v v16, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v17, (a6)
+; RV32-NEXT:    add a6, a3, a2
+; RV32-NEXT:    vl1re16.v v10, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v11, (a6)
+; RV32-NEXT:    vl1re16.v v8, (a0)
+; RV32-NEXT:    vl1re16.v v9, (a3)
+; RV32-NEXT:    vl1re16.v v14, (a4)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a3, 10
+; RV32-NEXT:    mul a0, a0, a3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v15, (a5)
+; RV32-NEXT:    vl1re16.v v12, (a6)
+; RV32-NEXT:    vl1re16.v v13, (a1)
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    vs2r.v v16, (a2)
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    vl8re16.v v16, (a2)
+; RV32-NEXT:    vl8re16.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -80
+; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 80
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -80
+; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi s0, sp, 80
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 28
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    sub sp, sp, a0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT:    vmv2r.v v20, v16
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vmv2r.v v18, v12
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 2
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 64
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    vmv2r.v v16, v8
+; RV64-NEXT:    vmv2r.v v22, v16
+; RV64-NEXT:    vmv2r.v v24, v18
+; RV64-NEXT:    vmv1r.v v26, v20
+; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    vmv1r.v v23, v10
+; RV64-NEXT:    add a4, a1, a2
+; RV64-NEXT:    add a5, a4, a2
+; RV64-NEXT:    vmv1r.v v25, v14
+; RV64-NEXT:    add a6, a5, a2
+; RV64-NEXT:    vmv1r.v v18, v11
+; RV64-NEXT:    vsseg5e16.v v22, (a0)
+; RV64-NEXT:    vmv1r.v v20, v15
+; RV64-NEXT:    vsseg5e16.v v17, (a1)
+; RV64-NEXT:    vl1re16.v v16, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v17, (a6)
+; RV64-NEXT:    add a6, a3, a2
+; RV64-NEXT:    vl1re16.v v10, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v11, (a6)
+; RV64-NEXT:    vl1re16.v v8, (a0)
+; RV64-NEXT:    vl1re16.v v9, (a3)
+; RV64-NEXT:    vl1re16.v v14, (a4)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a3, 10
+; RV64-NEXT:    mul a0, a0, a3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v15, (a5)
+; RV64-NEXT:    vl1re16.v v12, (a6)
+; RV64-NEXT:    vl1re16.v v13, (a1)
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, a0, a2
+; RV64-NEXT:    vs2r.v v16, (a2)
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    vl8re16.v v16, (a2)
+; RV64-NEXT:    vl8re16.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -80
+; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 80
+; RV64-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv16f64_nxv8f64:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    vsetvli a1, zero, e16, m2, ta, mu
-; ZVBB-NEXT:    vid.v v6
-; ZVBB-NEXT:    vmv8r.v v24, v8
-; ZVBB-NEXT:    srli a0, a0, 1
-; ZVBB-NEXT:    vmv4r.v v28, v16
-; ZVBB-NEXT:    vmv4r.v v16, v12
-; ZVBB-NEXT:    vand.vi v8, v6, 1
-; ZVBB-NEXT:    vmsne.vi v0, v8, 0
-; ZVBB-NEXT:    vsrl.vi v6, v6, 1
-; ZVBB-NEXT:    vadd.vx v6, v6, a0, v0.t
-; ZVBB-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
-; ZVBB-NEXT:    vrgatherei16.vv v8, v24, v6
-; ZVBB-NEXT:    vrgatherei16.vv v24, v16, v6
-; ZVBB-NEXT:    vmv.v.v v16, v24
-; ZVBB-NEXT:    ret
+; ZVBB-RV32-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; ZVBB-RV32:       # %bb.0:
+; ZVBB-RV32-NEXT:    addi sp, sp, -80
+; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    addi s0, sp, 80
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    li a1, 28
+; ZVBB-RV32-NEXT:    mul a0, a0, a1
+; ZVBB-RV32-NEXT:    sub sp, sp, a0
+; ZVBB-RV32-NEXT:    andi sp, sp, -64
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV32-NEXT:    vmv2r.v v20, v16
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
+; ZVBB-RV32-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV32-NEXT:    csrr a1, vlenb
+; ZVBB-RV32-NEXT:    slli a2, a1, 2
+; ZVBB-RV32-NEXT:    add a1, a2, a1
+; ZVBB-RV32-NEXT:    add a1, sp, a1
+; ZVBB-RV32-NEXT:    addi a1, a1, 64
+; ZVBB-RV32-NEXT:    csrr a2, vlenb
+; ZVBB-RV32-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV32-NEXT:    vmv2r.v v22, v16
+; ZVBB-RV32-NEXT:    vmv2r.v v24, v18
+; ZVBB-RV32-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV32-NEXT:    add a3, a0, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v23, v10
+; ZVBB-RV32-NEXT:    add a4, a1, a2
+; ZVBB-RV32-NEXT:    add a5, a4, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV32-NEXT:    add a6, a5, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v18, v11
+; ZVBB-RV32-NEXT:    vsseg5e16.v v22, (a0)
+; ZVBB-RV32-NEXT:    vmv1r.v v20, v15
+; ZVBB-RV32-NEXT:    vsseg5e16.v v17, (a1)
+; ZVBB-RV32-NEXT:    vl1re16.v v16, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v17, (a6)
+; ZVBB-RV32-NEXT:    add a6, a3, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v10, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re16.v v9, (a3)
+; ZVBB-RV32-NEXT:    vl1re16.v v14, (a4)
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    li a3, 10
+; ZVBB-RV32-NEXT:    mul a0, a0, a3
+; ZVBB-RV32-NEXT:    add a0, sp, a0
+; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v15, (a5)
+; ZVBB-RV32-NEXT:    vl1re16.v v12, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v13, (a1)
+; ZVBB-RV32-NEXT:    slli a2, a2, 3
+; ZVBB-RV32-NEXT:    add a2, a0, a2
+; ZVBB-RV32-NEXT:    vs2r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8re16.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT:    addi sp, s0, -80
+; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    addi sp, sp, 80
+; ZVBB-RV32-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv16f64_nxv8f64:
+; ZVBB-RV64-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; ZVBB-RV64:       # %bb.0:
+; ZVBB-RV64-NEXT:    addi sp, sp, -80
+; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    addi s0, sp, 80
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    li a1, 28
+; ZVBB-RV64-NEXT:    mul a0, a0, a1
+; ZVBB-RV64-NEXT:    sub sp, sp, a0
+; ZVBB-RV64-NEXT:    andi sp, sp, -64
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV64-NEXT:    vmv2r.v v20, v16
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
+; ZVBB-RV64-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV64-NEXT:    csrr a1, vlenb
+; ZVBB-RV64-NEXT:    slli a2, a1, 2
+; ZVBB-RV64-NEXT:    add a1, a2, a1
+; ZVBB-RV64-NEXT:    add a1, sp, a1
+; ZVBB-RV64-NEXT:    addi a1, a1, 64
+; ZVBB-RV64-NEXT:    csrr a2, vlenb
+; ZVBB-RV64-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV64-NEXT:    vmv2r.v v22, v16
+; ZVBB-RV64-NEXT:    vmv2r.v v24, v18
+; ZVBB-RV64-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV64-NEXT:    add a3, a0, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v23, v10
+; ZVBB-RV64-NEXT:    add a4, a1, a2
+; ZVBB-RV64-NEXT:    add a5, a4, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV64-NEXT:    add a6, a5, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v18, v11
+; ZVBB-RV64-NEXT:    vsseg5e16.v v22, (a0)
+; ZVBB-RV64-NEXT:    vmv1r.v v20, v15
+; ZVBB-RV64-NEXT:    vsseg5e16.v v17, (a1)
+; ZVBB-RV64-NEXT:    vl1re16.v v16, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v17, (a6)
+; ZVBB-RV64-NEXT:    add a6, a3, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v10, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re16.v v9, (a3)
+; ZVBB-RV64-NEXT:    vl1re16.v v14, (a4)
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    li a3, 10
+; ZVBB-RV64-NEXT:    mul a0, a0, a3
+; ZVBB-RV64-NEXT:    add a0, sp, a0
+; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v15, (a5)
+; ZVBB-RV64-NEXT:    vl1re16.v v12, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v13, (a1)
+; ZVBB-RV64-NEXT:    slli a2, a2, 3
+; ZVBB-RV64-NEXT:    add a2, a0, a2
+; ZVBB-RV64-NEXT:    vs2r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8re16.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT:    addi sp, s0, -80
+; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    addi sp, sp, 80
+; ZVBB-RV64-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
 ; ZIP:       # %bb.0:
-; ZIP-NEXT:    vsetvli a0, zero, e64, m4, ta, ma
-; ZIP-NEXT:    ri.vzip2b.vv v28, v8, v16
-; ZIP-NEXT:    ri.vzip2b.vv v4, v12, v20
-; ZIP-NEXT:    ri.vzip2a.vv v24, v8, v16
-; ZIP-NEXT:    ri.vzip2a.vv v0, v12, v20
-; ZIP-NEXT:    vmv8r.v v8, v24
-; ZIP-NEXT:    vmv8r.v v16, v0
+; ZIP-NEXT:    addi sp, sp, -80
+; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    addi s0, sp, 80
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    li a1, 28
+; ZIP-NEXT:    mul a0, a0, a1
+; ZIP-NEXT:    sub sp, sp, a0
+; ZIP-NEXT:    andi sp, sp, -64
+; ZIP-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT:    vmv2r.v v20, v16
+; ZIP-NEXT:    addi a0, sp, 64
+; ZIP-NEXT:    vmv2r.v v18, v12
+; ZIP-NEXT:    csrr a1, vlenb
+; ZIP-NEXT:    slli a2, a1, 2
+; ZIP-NEXT:    add a1, a2, a1
+; ZIP-NEXT:    add a1, sp, a1
+; ZIP-NEXT:    addi a1, a1, 64
+; ZIP-NEXT:    csrr a2, vlenb
+; ZIP-NEXT:    vmv2r.v v16, v8
+; ZIP-NEXT:    vmv2r.v v22, v16
+; ZIP-NEXT:    vmv2r.v v24, v18
+; ZIP-NEXT:    vmv1r.v v26, v20
+; ZIP-NEXT:    add a3, a0, a2
+; ZIP-NEXT:    vmv1r.v v23, v10
+; ZIP-NEXT:    add a4, a1, a2
+; ZIP-NEXT:    add a5, a4, a2
+; ZIP-NEXT:    vmv1r.v v25, v14
+; ZIP-NEXT:    add a6, a5, a2
+; ZIP-NEXT:    vmv1r.v v18, v11
+; ZIP-NEXT:    vsseg5e16.v v22, (a0)
+; ZIP-NEXT:    vmv1r.v v20, v15
+; ZIP-NEXT:    vsseg5e16.v v17, (a1)
+; ZIP-NEXT:    vl1re16.v v16, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v17, (a6)
+; ZIP-NEXT:    add a6, a3, a2
+; ZIP-NEXT:    vl1re16.v v10, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v11, (a6)
+; ZIP-NEXT:    vl1re16.v v8, (a0)
+; ZIP-NEXT:    vl1re16.v v9, (a3)
+; ZIP-NEXT:    vl1re16.v v14, (a4)
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    li a3, 10
+; ZIP-NEXT:    mul a0, a0, a3
+; ZIP-NEXT:    add a0, sp, a0
+; ZIP-NEXT:    addi a0, a0, 64
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v15, (a5)
+; ZIP-NEXT:    vl1re16.v v12, (a6)
+; ZIP-NEXT:    vl1re16.v v13, (a1)
+; ZIP-NEXT:    slli a2, a2, 3
+; ZIP-NEXT:    add a2, a0, a2
+; ZIP-NEXT:    vs2r.v v16, (a2)
+; ZIP-NEXT:    vs8r.v v8, (a0)
+; ZIP-NEXT:    vl8re16.v v16, (a2)
+; ZIP-NEXT:    vl8re16.v v8, (a0)
+; ZIP-NEXT:    addi sp, s0, -80
+; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    addi sp, sp, 80
 ; ZIP-NEXT:    ret
-  %res = call <vscale x 16 x double> @llvm.vector.interleave2.nxv16f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b)
-  ret <vscale x 16 x double> %res
+  %res = call <vscale x 40 x bfloat> @llvm.vector.interleave5.nxv40bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4)
+  ret <vscale x 40 x bfloat> %res
 }
 
-define <vscale x 6 x half> @vector_interleave_nxv6f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv6f16_nxv2f16:
+define <vscale x 5 x float> @vector_interleave_nxv5f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4) nounwind {
+; CHECK-LABEL: vector_interleave_nxv5f32_nxv1f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 1
-; CHECK-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vsseg3e16.v v8, (a0)
 ; CHECK-NEXT:    add a3, a0, a2
-; CHECK-NEXT:    vle16.v v9, (a3)
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    add a0, a1, a1
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v9, a1
-; CHECK-NEXT:    add a2, a3, a2
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v9, (a2)
+; CHECK-NEXT:    add a4, a3, a2
+; CHECK-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsseg5e32.v v8, (a0)
+; CHECK-NEXT:    add a5, a4, a2
+; CHECK-NEXT:    vle32.v v8, (a5)
+; CHECK-NEXT:    vle32.v v9, (a4)
+; CHECK-NEXT:    srli a1, a1, 3
+; CHECK-NEXT:    add a4, a1, a1
+; CHECK-NEXT:    vle32.v v10, (a3)
+; CHECK-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v8, a1
+; CHECK-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v10, a1
+; CHECK-NEXT:    add a2, a5, a2
+; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v10, (a2)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv6f16_nxv2f16:
+; ZVBB-LABEL: vector_interleave_nxv5f32_nxv1f32:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    add a0, a1, a0
 ; ZVBB-NEXT:    sub sp, sp, a0
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a2, a1, 1
-; ZVBB-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
-; ZVBB-NEXT:    vsseg3e16.v v8, (a0)
 ; ZVBB-NEXT:    add a3, a0, a2
-; ZVBB-NEXT:    vle16.v v9, (a3)
-; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    srli a1, a1, 2
-; ZVBB-NEXT:    add a0, a1, a1
-; ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v8, v9, a1
-; ZVBB-NEXT:    add a2, a3, a2
-; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZVBB-NEXT:    vle16.v v9, (a2)
+; ZVBB-NEXT:    add a4, a3, a2
+; ZVBB-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vsseg5e32.v v8, (a0)
+; ZVBB-NEXT:    add a5, a4, a2
+; ZVBB-NEXT:    vle32.v v8, (a5)
+; ZVBB-NEXT:    vle32.v v9, (a4)
+; ZVBB-NEXT:    srli a1, a1, 3
+; ZVBB-NEXT:    add a4, a1, a1
+; ZVBB-NEXT:    vle32.v v10, (a3)
+; ZVBB-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v9, v8, a1
+; ZVBB-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vle32.v v8, (a0)
+; ZVBB-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v10, a1
+; ZVBB-NEXT:    add a2, a5, a2
+; ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vle32.v v10, (a2)
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a0, a0, 1
+; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    add a0, a1, a0
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 6 x half> @llvm.vector.interleave3.nxv6f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2)
-  ret <vscale x 6 x half> %res
+  %res = call <vscale x 5 x float> @llvm.vector.interleave5.nxv5f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4)
+  ret <vscale x 5 x float> %res
 }
 
-define <vscale x 12 x half> @vector_interleave_nxv12f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv12f16_nxv4f16:
+define <vscale x 10 x float> @vector_interleave_nxv10f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4) nounwind {
+; CHECK-LABEL: vector_interleave_nxv10f32_nxv2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    slli a1, a0, 2
 ; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vsseg3e16.v v8, (a0)
-; CHECK-NEXT:    vl1re16.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl1re16.v v9, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl1re16.v v10, (a0)
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsseg5e32.v v8, (a0)
+; CHECK-NEXT:    vl1re32.v v10, (a3)
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    vl1re32.v v11, (a3)
+; CHECK-NEXT:    vl1re32.v v8, (a0)
+; CHECK-NEXT:    vl1re32.v v9, (a2)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl1re32.v v12, (a1)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    slli a1, a0, 2
 ; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv12f16_nxv4f16:
+; ZVBB-LABEL: vector_interleave_nxv10f32_nxv2f32:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    slli a1, a0, 2
 ; ZVBB-NEXT:    add a0, a1, a0
 ; ZVBB-NEXT:    sub sp, sp, a0
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
-; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVBB-NEXT:    vsseg3e16.v v8, (a0)
-; ZVBB-NEXT:    vl1re16.v v8, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl1re16.v v9, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl1re16.v v10, (a0)
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
+; ZVBB-NEXT:    vsseg5e32.v v8, (a0)
+; ZVBB-NEXT:    vl1re32.v v10, (a3)
+; ZVBB-NEXT:    add a3, a3, a1
+; ZVBB-NEXT:    vl1re32.v v11, (a3)
+; ZVBB-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-NEXT:    vl1re32.v v9, (a2)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl1re32.v v12, (a1)
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    slli a1, a0, 2
 ; ZVBB-NEXT:    add a0, a1, a0
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 12 x half> @llvm.vector.interleave3.nxv12f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2)
-  ret <vscale x 12 x half> %res
+  %res = call <vscale x 10 x float> @llvm.vector.interleave5.nxv10f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4)
+  ret <vscale x 10 x float> %res
 }
 
-define <vscale x 24 x half> @vector_interleave_nxv24f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv24f16_nxv8f16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 6
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vsseg3e16.v v8, (a0)
-; CHECK-NEXT:    vl2re16.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl2re16.v v10, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl2re16.v v12, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 6
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+define <vscale x 20 x float> @vector_interleave_nxv20f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4) nounwind {
+; RV32-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -80
+; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi s0, sp, 80
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 28
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv2r.v v20, v16
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vmv2r.v v18, v12
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a2, a1, 2
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    vmv2r.v v16, v8
+; RV32-NEXT:    vmv2r.v v22, v16
+; RV32-NEXT:    vmv2r.v v24, v18
+; RV32-NEXT:    vmv1r.v v26, v20
+; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    vmv1r.v v23, v10
+; RV32-NEXT:    add a4, a1, a2
+; RV32-NEXT:    add a5, a4, a2
+; RV32-NEXT:    vmv1r.v v25, v14
+; RV32-NEXT:    add a6, a5, a2
+; RV32-NEXT:    vmv1r.v v18, v11
+; RV32-NEXT:    vsseg5e32.v v22, (a0)
+; RV32-NEXT:    vmv1r.v v20, v15
+; RV32-NEXT:    vsseg5e32.v v17, (a1)
+; RV32-NEXT:    vl1re32.v v16, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re32.v v17, (a6)
+; RV32-NEXT:    add a6, a3, a2
+; RV32-NEXT:    vl1re32.v v10, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re32.v v11, (a6)
+; RV32-NEXT:    vl1re32.v v8, (a0)
+; RV32-NEXT:    vl1re32.v v9, (a3)
+; RV32-NEXT:    vl1re32.v v14, (a4)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a3, 10
+; RV32-NEXT:    mul a0, a0, a3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re32.v v15, (a5)
+; RV32-NEXT:    vl1re32.v v12, (a6)
+; RV32-NEXT:    vl1re32.v v13, (a1)
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    vs2r.v v16, (a2)
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    vl8re32.v v16, (a2)
+; RV32-NEXT:    vl8re32.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -80
+; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 80
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -80
+; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi s0, sp, 80
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 28
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    sub sp, sp, a0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV64-NEXT:    vmv2r.v v20, v16
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vmv2r.v v18, v12
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 2
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 64
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    vmv2r.v v16, v8
+; RV64-NEXT:    vmv2r.v v22, v16
+; RV64-NEXT:    vmv2r.v v24, v18
+; RV64-NEXT:    vmv1r.v v26, v20
+; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    vmv1r.v v23, v10
+; RV64-NEXT:    add a4, a1, a2
+; RV64-NEXT:    add a5, a4, a2
+; RV64-NEXT:    vmv1r.v v25, v14
+; RV64-NEXT:    add a6, a5, a2
+; RV64-NEXT:    vmv1r.v v18, v11
+; RV64-NEXT:    vsseg5e32.v v22, (a0)
+; RV64-NEXT:    vmv1r.v v20, v15
+; RV64-NEXT:    vsseg5e32.v v17, (a1)
+; RV64-NEXT:    vl1re32.v v16, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re32.v v17, (a6)
+; RV64-NEXT:    add a6, a3, a2
+; RV64-NEXT:    vl1re32.v v10, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re32.v v11, (a6)
+; RV64-NEXT:    vl1re32.v v8, (a0)
+; RV64-NEXT:    vl1re32.v v9, (a3)
+; RV64-NEXT:    vl1re32.v v14, (a4)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a3, 10
+; RV64-NEXT:    mul a0, a0, a3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re32.v v15, (a5)
+; RV64-NEXT:    vl1re32.v v12, (a6)
+; RV64-NEXT:    vl1re32.v v13, (a1)
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, a0, a2
+; RV64-NEXT:    vs2r.v v16, (a2)
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    vl8re32.v v16, (a2)
+; RV64-NEXT:    vl8re32.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -80
+; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 80
+; RV64-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv24f16_nxv8f16:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    li a1, 6
-; ZVBB-NEXT:    mul a0, a0, a1
-; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    addi a0, sp, 16
-; ZVBB-NEXT:    csrr a1, vlenb
-; ZVBB-NEXT:    slli a1, a1, 1
-; ZVBB-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVBB-NEXT:    vsseg3e16.v v8, (a0)
-; ZVBB-NEXT:    vl2re16.v v8, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl2re16.v v10, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl2re16.v v12, (a0)
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    li a1, 6
-; ZVBB-NEXT:    mul a0, a0, a1
-; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    ret
-  %res = call <vscale x 24 x half> @llvm.vector.interleave3.nxv24f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2)
-  ret <vscale x 24 x half> %res
-}
-
-define <vscale x 6 x bfloat> @vector_interleave_nxv6bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv6bf16_nxv2bf16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 1
-; CHECK-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vsseg3e16.v v8, (a0)
-; CHECK-NEXT:    add a3, a0, a2
-; CHECK-NEXT:    vle16.v v9, (a3)
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    add a0, a1, a1
-; CHECK-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v9, a1
-; CHECK-NEXT:    add a2, a3, a2
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v9, (a2)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+; ZVBB-RV32-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; ZVBB-RV32:       # %bb.0:
+; ZVBB-RV32-NEXT:    addi sp, sp, -80
+; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    addi s0, sp, 80
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    li a1, 28
+; ZVBB-RV32-NEXT:    mul a0, a0, a1
+; ZVBB-RV32-NEXT:    sub sp, sp, a0
+; ZVBB-RV32-NEXT:    andi sp, sp, -64
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV32-NEXT:    vmv2r.v v20, v16
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
+; ZVBB-RV32-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV32-NEXT:    csrr a1, vlenb
+; ZVBB-RV32-NEXT:    slli a2, a1, 2
+; ZVBB-RV32-NEXT:    add a1, a2, a1
+; ZVBB-RV32-NEXT:    add a1, sp, a1
+; ZVBB-RV32-NEXT:    addi a1, a1, 64
+; ZVBB-RV32-NEXT:    csrr a2, vlenb
+; ZVBB-RV32-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV32-NEXT:    vmv2r.v v22, v16
+; ZVBB-RV32-NEXT:    vmv2r.v v24, v18
+; ZVBB-RV32-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV32-NEXT:    add a3, a0, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v23, v10
+; ZVBB-RV32-NEXT:    add a4, a1, a2
+; ZVBB-RV32-NEXT:    add a5, a4, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV32-NEXT:    add a6, a5, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v18, v11
+; ZVBB-RV32-NEXT:    vsseg5e32.v v22, (a0)
+; ZVBB-RV32-NEXT:    vmv1r.v v20, v15
+; ZVBB-RV32-NEXT:    vsseg5e32.v v17, (a1)
+; ZVBB-RV32-NEXT:    vl1re32.v v16, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v17, (a6)
+; ZVBB-RV32-NEXT:    add a6, a3, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v10, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re32.v v9, (a3)
+; ZVBB-RV32-NEXT:    vl1re32.v v14, (a4)
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    li a3, 10
+; ZVBB-RV32-NEXT:    mul a0, a0, a3
+; ZVBB-RV32-NEXT:    add a0, sp, a0
+; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v15, (a5)
+; ZVBB-RV32-NEXT:    vl1re32.v v12, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v13, (a1)
+; ZVBB-RV32-NEXT:    slli a2, a2, 3
+; ZVBB-RV32-NEXT:    add a2, a0, a2
+; ZVBB-RV32-NEXT:    vs2r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8re32.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8re32.v v8, (a0)
+; ZVBB-RV32-NEXT:    addi sp, s0, -80
+; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    addi sp, sp, 80
+; ZVBB-RV32-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv6bf16_nxv2bf16:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a0, a0, 1
-; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    addi a0, sp, 16
-; ZVBB-NEXT:    csrr a1, vlenb
-; ZVBB-NEXT:    srli a2, a1, 1
-; ZVBB-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
-; ZVBB-NEXT:    vsseg3e16.v v8, (a0)
-; ZVBB-NEXT:    add a3, a0, a2
-; ZVBB-NEXT:    vle16.v v9, (a3)
-; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    srli a1, a1, 2
-; ZVBB-NEXT:    add a0, a1, a1
-; ZVBB-NEXT:    vsetvli zero, a0, e16, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v8, v9, a1
-; ZVBB-NEXT:    add a2, a3, a2
-; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZVBB-NEXT:    vle16.v v9, (a2)
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a0, a0, 1
-; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    ret
-  %res = call <vscale x 6 x bfloat> @llvm.vector.interleave3.nxv6bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2)
-  ret <vscale x 6 x bfloat> %res
+; ZVBB-RV64-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; ZVBB-RV64:       # %bb.0:
+; ZVBB-RV64-NEXT:    addi sp, sp, -80
+; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    addi s0, sp, 80
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    li a1, 28
+; ZVBB-RV64-NEXT:    mul a0, a0, a1
+; ZVBB-RV64-NEXT:    sub sp, sp, a0
+; ZVBB-RV64-NEXT:    andi sp, sp, -64
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV64-NEXT:    vmv2r.v v20, v16
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
+; ZVBB-RV64-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV64-NEXT:    csrr a1, vlenb
+; ZVBB-RV64-NEXT:    slli a2, a1, 2
+; ZVBB-RV64-NEXT:    add a1, a2, a1
+; ZVBB-RV64-NEXT:    add a1, sp, a1
+; ZVBB-RV64-NEXT:    addi a1, a1, 64
+; ZVBB-RV64-NEXT:    csrr a2, vlenb
+; ZVBB-RV64-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV64-NEXT:    vmv2r.v v22, v16
+; ZVBB-RV64-NEXT:    vmv2r.v v24, v18
+; ZVBB-RV64-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV64-NEXT:    add a3, a0, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v23, v10
+; ZVBB-RV64-NEXT:    add a4, a1, a2
+; ZVBB-RV64-NEXT:    add a5, a4, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV64-NEXT:    add a6, a5, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v18, v11
+; ZVBB-RV64-NEXT:    vsseg5e32.v v22, (a0)
+; ZVBB-RV64-NEXT:    vmv1r.v v20, v15
+; ZVBB-RV64-NEXT:    vsseg5e32.v v17, (a1)
+; ZVBB-RV64-NEXT:    vl1re32.v v16, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v17, (a6)
+; ZVBB-RV64-NEXT:    add a6, a3, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v10, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re32.v v9, (a3)
+; ZVBB-RV64-NEXT:    vl1re32.v v14, (a4)
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    li a3, 10
+; ZVBB-RV64-NEXT:    mul a0, a0, a3
+; ZVBB-RV64-NEXT:    add a0, sp, a0
+; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v15, (a5)
+; ZVBB-RV64-NEXT:    vl1re32.v v12, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v13, (a1)
+; ZVBB-RV64-NEXT:    slli a2, a2, 3
+; ZVBB-RV64-NEXT:    add a2, a0, a2
+; ZVBB-RV64-NEXT:    vs2r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8re32.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8re32.v v8, (a0)
+; ZVBB-RV64-NEXT:    addi sp, s0, -80
+; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    addi sp, sp, 80
+; ZVBB-RV64-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    addi sp, sp, -80
+; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    addi s0, sp, 80
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    li a1, 28
+; ZIP-NEXT:    mul a0, a0, a1
+; ZIP-NEXT:    sub sp, sp, a0
+; ZIP-NEXT:    andi sp, sp, -64
+; ZIP-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZIP-NEXT:    vmv2r.v v20, v16
+; ZIP-NEXT:    addi a0, sp, 64
+; ZIP-NEXT:    vmv2r.v v18, v12
+; ZIP-NEXT:    csrr a1, vlenb
+; ZIP-NEXT:    slli a2, a1, 2
+; ZIP-NEXT:    add a1, a2, a1
+; ZIP-NEXT:    add a1, sp, a1
+; ZIP-NEXT:    addi a1, a1, 64
+; ZIP-NEXT:    csrr a2, vlenb
+; ZIP-NEXT:    vmv2r.v v16, v8
+; ZIP-NEXT:    vmv2r.v v22, v16
+; ZIP-NEXT:    vmv2r.v v24, v18
+; ZIP-NEXT:    vmv1r.v v26, v20
+; ZIP-NEXT:    add a3, a0, a2
+; ZIP-NEXT:    vmv1r.v v23, v10
+; ZIP-NEXT:    add a4, a1, a2
+; ZIP-NEXT:    add a5, a4, a2
+; ZIP-NEXT:    vmv1r.v v25, v14
+; ZIP-NEXT:    add a6, a5, a2
+; ZIP-NEXT:    vmv1r.v v18, v11
+; ZIP-NEXT:    vsseg5e32.v v22, (a0)
+; ZIP-NEXT:    vmv1r.v v20, v15
+; ZIP-NEXT:    vsseg5e32.v v17, (a1)
+; ZIP-NEXT:    vl1re32.v v16, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re32.v v17, (a6)
+; ZIP-NEXT:    add a6, a3, a2
+; ZIP-NEXT:    vl1re32.v v10, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re32.v v11, (a6)
+; ZIP-NEXT:    vl1re32.v v8, (a0)
+; ZIP-NEXT:    vl1re32.v v9, (a3)
+; ZIP-NEXT:    vl1re32.v v14, (a4)
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    li a3, 10
+; ZIP-NEXT:    mul a0, a0, a3
+; ZIP-NEXT:    add a0, sp, a0
+; ZIP-NEXT:    addi a0, a0, 64
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re32.v v15, (a5)
+; ZIP-NEXT:    vl1re32.v v12, (a6)
+; ZIP-NEXT:    vl1re32.v v13, (a1)
+; ZIP-NEXT:    slli a2, a2, 3
+; ZIP-NEXT:    add a2, a0, a2
+; ZIP-NEXT:    vs2r.v v16, (a2)
+; ZIP-NEXT:    vs8r.v v8, (a0)
+; ZIP-NEXT:    vl8re32.v v16, (a2)
+; ZIP-NEXT:    vl8re32.v v8, (a0)
+; ZIP-NEXT:    addi sp, s0, -80
+; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    addi sp, sp, 80
+; ZIP-NEXT:    ret
+  %res = call <vscale x 20 x float> @llvm.vector.interleave5.nxv20f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4)
+  ret <vscale x 20 x float> %res
 }
 
-define <vscale x 12 x bfloat> @vector_interleave_nxv12bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv12bf16_nxv4bf16:
+define <vscale x 5 x double> @vector_interleave_nxv5f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4) nounwind {
+; CHECK-LABEL: vector_interleave_nxv5f64_nxv1f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    slli a1, a0, 2
 ; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vsseg3e16.v v8, (a0)
-; CHECK-NEXT:    vl1re16.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl1re16.v v9, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl1re16.v v10, (a0)
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsseg5e64.v v8, (a0)
+; CHECK-NEXT:    vl1re64.v v10, (a3)
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    vl1re64.v v11, (a3)
+; CHECK-NEXT:    vl1re64.v v8, (a0)
+; CHECK-NEXT:    vl1re64.v v9, (a2)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl1re64.v v12, (a1)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    slli a1, a0, 2
 ; CHECK-NEXT:    add a0, a1, a0
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv12bf16_nxv4bf16:
+; ZVBB-LABEL: vector_interleave_nxv5f64_nxv1f64:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    slli a1, a0, 2
 ; ZVBB-NEXT:    add a0, a1, a0
 ; ZVBB-NEXT:    sub sp, sp, a0
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
-; ZVBB-NEXT:    vsetvli a2, zero, e16, m1, ta, ma
-; ZVBB-NEXT:    vsseg3e16.v v8, (a0)
-; ZVBB-NEXT:    vl1re16.v v8, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl1re16.v v9, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl1re16.v v10, (a0)
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; ZVBB-NEXT:    vsseg5e64.v v8, (a0)
+; ZVBB-NEXT:    vl1re64.v v10, (a3)
+; ZVBB-NEXT:    add a3, a3, a1
+; ZVBB-NEXT:    vl1re64.v v11, (a3)
+; ZVBB-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-NEXT:    vl1re64.v v9, (a2)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl1re64.v v12, (a1)
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 1
+; ZVBB-NEXT:    slli a1, a0, 2
 ; ZVBB-NEXT:    add a0, a1, a0
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 12 x bfloat> @llvm.vector.interleave3.nxv12bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2)
-  ret <vscale x 12 x bfloat> %res
+  %res = call <vscale x 5 x double> @llvm.vector.interleave5.nxv5f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4)
+  ret <vscale x 5 x double> %res
 }
 
-define <vscale x 24 x bfloat> @vector_interleave_nxv24bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv24bf16_nxv8bf16:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 6
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; CHECK-NEXT:    vsseg3e16.v v8, (a0)
-; CHECK-NEXT:    vl2re16.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl2re16.v v10, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl2re16.v v12, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 6
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+define <vscale x 10 x double> @vector_interleave_nxv10f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4) nounwind {
+; RV32-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -80
+; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi s0, sp, 80
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 28
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT:    vmv2r.v v20, v16
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vmv2r.v v18, v12
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    slli a2, a1, 2
+; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    vmv2r.v v16, v8
+; RV32-NEXT:    vmv2r.v v22, v16
+; RV32-NEXT:    vmv2r.v v24, v18
+; RV32-NEXT:    vmv1r.v v26, v20
+; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    vmv1r.v v23, v10
+; RV32-NEXT:    add a4, a1, a2
+; RV32-NEXT:    add a5, a4, a2
+; RV32-NEXT:    vmv1r.v v25, v14
+; RV32-NEXT:    add a6, a5, a2
+; RV32-NEXT:    vmv1r.v v18, v11
+; RV32-NEXT:    vsseg5e64.v v22, (a0)
+; RV32-NEXT:    vmv1r.v v20, v15
+; RV32-NEXT:    vsseg5e64.v v17, (a1)
+; RV32-NEXT:    vl1re64.v v16, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re64.v v17, (a6)
+; RV32-NEXT:    add a6, a3, a2
+; RV32-NEXT:    vl1re64.v v10, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re64.v v11, (a6)
+; RV32-NEXT:    vl1re64.v v8, (a0)
+; RV32-NEXT:    vl1re64.v v9, (a3)
+; RV32-NEXT:    vl1re64.v v14, (a4)
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a3, 10
+; RV32-NEXT:    mul a0, a0, a3
+; RV32-NEXT:    add a0, sp, a0
+; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re64.v v15, (a5)
+; RV32-NEXT:    vl1re64.v v12, (a6)
+; RV32-NEXT:    vl1re64.v v13, (a1)
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, a0, a2
+; RV32-NEXT:    vs2r.v v16, (a2)
+; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    vl8re64.v v16, (a2)
+; RV32-NEXT:    vl8re64.v v8, (a0)
+; RV32-NEXT:    addi sp, s0, -80
+; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 80
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -80
+; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi s0, sp, 80
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 28
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    sub sp, sp, a0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv2r.v v20, v16
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vmv2r.v v18, v12
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    slli a2, a1, 2
+; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 64
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    vmv2r.v v16, v8
+; RV64-NEXT:    vmv2r.v v22, v16
+; RV64-NEXT:    vmv2r.v v24, v18
+; RV64-NEXT:    vmv1r.v v26, v20
+; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    vmv1r.v v23, v10
+; RV64-NEXT:    add a4, a1, a2
+; RV64-NEXT:    add a5, a4, a2
+; RV64-NEXT:    vmv1r.v v25, v14
+; RV64-NEXT:    add a6, a5, a2
+; RV64-NEXT:    vmv1r.v v18, v11
+; RV64-NEXT:    vsseg5e64.v v22, (a0)
+; RV64-NEXT:    vmv1r.v v20, v15
+; RV64-NEXT:    vsseg5e64.v v17, (a1)
+; RV64-NEXT:    vl1re64.v v16, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re64.v v17, (a6)
+; RV64-NEXT:    add a6, a3, a2
+; RV64-NEXT:    vl1re64.v v10, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re64.v v11, (a6)
+; RV64-NEXT:    vl1re64.v v8, (a0)
+; RV64-NEXT:    vl1re64.v v9, (a3)
+; RV64-NEXT:    vl1re64.v v14, (a4)
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a3, 10
+; RV64-NEXT:    mul a0, a0, a3
+; RV64-NEXT:    add a0, sp, a0
+; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re64.v v15, (a5)
+; RV64-NEXT:    vl1re64.v v12, (a6)
+; RV64-NEXT:    vl1re64.v v13, (a1)
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, a0, a2
+; RV64-NEXT:    vs2r.v v16, (a2)
+; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    vl8re64.v v16, (a2)
+; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    addi sp, s0, -80
+; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 80
+; RV64-NEXT:    ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; ZVBB-RV32:       # %bb.0:
+; ZVBB-RV32-NEXT:    addi sp, sp, -80
+; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    addi s0, sp, 80
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    li a1, 28
+; ZVBB-RV32-NEXT:    mul a0, a0, a1
+; ZVBB-RV32-NEXT:    sub sp, sp, a0
+; ZVBB-RV32-NEXT:    andi sp, sp, -64
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV32-NEXT:    vmv2r.v v20, v16
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
+; ZVBB-RV32-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV32-NEXT:    csrr a1, vlenb
+; ZVBB-RV32-NEXT:    slli a2, a1, 2
+; ZVBB-RV32-NEXT:    add a1, a2, a1
+; ZVBB-RV32-NEXT:    add a1, sp, a1
+; ZVBB-RV32-NEXT:    addi a1, a1, 64
+; ZVBB-RV32-NEXT:    csrr a2, vlenb
+; ZVBB-RV32-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV32-NEXT:    vmv2r.v v22, v16
+; ZVBB-RV32-NEXT:    vmv2r.v v24, v18
+; ZVBB-RV32-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV32-NEXT:    add a3, a0, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v23, v10
+; ZVBB-RV32-NEXT:    add a4, a1, a2
+; ZVBB-RV32-NEXT:    add a5, a4, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV32-NEXT:    add a6, a5, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v18, v11
+; ZVBB-RV32-NEXT:    vsseg5e64.v v22, (a0)
+; ZVBB-RV32-NEXT:    vmv1r.v v20, v15
+; ZVBB-RV32-NEXT:    vsseg5e64.v v17, (a1)
+; ZVBB-RV32-NEXT:    vl1re64.v v16, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v17, (a6)
+; ZVBB-RV32-NEXT:    add a6, a3, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v10, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re64.v v9, (a3)
+; ZVBB-RV32-NEXT:    vl1re64.v v14, (a4)
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    li a3, 10
+; ZVBB-RV32-NEXT:    mul a0, a0, a3
+; ZVBB-RV32-NEXT:    add a0, sp, a0
+; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v15, (a5)
+; ZVBB-RV32-NEXT:    vl1re64.v v12, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v13, (a1)
+; ZVBB-RV32-NEXT:    slli a2, a2, 3
+; ZVBB-RV32-NEXT:    add a2, a0, a2
+; ZVBB-RV32-NEXT:    vs2r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8re64.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8re64.v v8, (a0)
+; ZVBB-RV32-NEXT:    addi sp, s0, -80
+; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    addi sp, sp, 80
+; ZVBB-RV32-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv24bf16_nxv8bf16:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    li a1, 6
-; ZVBB-NEXT:    mul a0, a0, a1
-; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    addi a0, sp, 16
-; ZVBB-NEXT:    csrr a1, vlenb
-; ZVBB-NEXT:    slli a1, a1, 1
-; ZVBB-NEXT:    vsetvli a2, zero, e16, m2, ta, ma
-; ZVBB-NEXT:    vsseg3e16.v v8, (a0)
-; ZVBB-NEXT:    vl2re16.v v8, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl2re16.v v10, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl2re16.v v12, (a0)
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    li a1, 6
-; ZVBB-NEXT:    mul a0, a0, a1
-; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    ret
-  %res = call <vscale x 24 x bfloat> @llvm.vector.interleave3.nxv24bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2)
-  ret <vscale x 24 x bfloat> %res
-}
-
-define <vscale x 3 x float> @vector_interleave_nxv3f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv3f32_nxv1f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 1
-; CHECK-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsseg3e32.v v8, (a0)
-; CHECK-NEXT:    add a3, a0, a2
-; CHECK-NEXT:    vle32.v v9, (a3)
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    add a0, a1, a1
-; CHECK-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v9, a1
-; CHECK-NEXT:    add a2, a3, a2
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v9, (a2)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+; ZVBB-RV64-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; ZVBB-RV64:       # %bb.0:
+; ZVBB-RV64-NEXT:    addi sp, sp, -80
+; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    addi s0, sp, 80
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    li a1, 28
+; ZVBB-RV64-NEXT:    mul a0, a0, a1
+; ZVBB-RV64-NEXT:    sub sp, sp, a0
+; ZVBB-RV64-NEXT:    andi sp, sp, -64
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV64-NEXT:    vmv2r.v v20, v16
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
+; ZVBB-RV64-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV64-NEXT:    csrr a1, vlenb
+; ZVBB-RV64-NEXT:    slli a2, a1, 2
+; ZVBB-RV64-NEXT:    add a1, a2, a1
+; ZVBB-RV64-NEXT:    add a1, sp, a1
+; ZVBB-RV64-NEXT:    addi a1, a1, 64
+; ZVBB-RV64-NEXT:    csrr a2, vlenb
+; ZVBB-RV64-NEXT:    vmv2r.v v16, v8
+; ZVBB-RV64-NEXT:    vmv2r.v v22, v16
+; ZVBB-RV64-NEXT:    vmv2r.v v24, v18
+; ZVBB-RV64-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV64-NEXT:    add a3, a0, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v23, v10
+; ZVBB-RV64-NEXT:    add a4, a1, a2
+; ZVBB-RV64-NEXT:    add a5, a4, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV64-NEXT:    add a6, a5, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v18, v11
+; ZVBB-RV64-NEXT:    vsseg5e64.v v22, (a0)
+; ZVBB-RV64-NEXT:    vmv1r.v v20, v15
+; ZVBB-RV64-NEXT:    vsseg5e64.v v17, (a1)
+; ZVBB-RV64-NEXT:    vl1re64.v v16, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v17, (a6)
+; ZVBB-RV64-NEXT:    add a6, a3, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v10, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re64.v v9, (a3)
+; ZVBB-RV64-NEXT:    vl1re64.v v14, (a4)
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    li a3, 10
+; ZVBB-RV64-NEXT:    mul a0, a0, a3
+; ZVBB-RV64-NEXT:    add a0, sp, a0
+; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v15, (a5)
+; ZVBB-RV64-NEXT:    vl1re64.v v12, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v13, (a1)
+; ZVBB-RV64-NEXT:    slli a2, a2, 3
+; ZVBB-RV64-NEXT:    add a2, a0, a2
+; ZVBB-RV64-NEXT:    vs2r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8re64.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8re64.v v8, (a0)
+; ZVBB-RV64-NEXT:    addi sp, s0, -80
+; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    addi sp, sp, 80
+; ZVBB-RV64-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv3f32_nxv1f32:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a0, a0, 1
-; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    addi a0, sp, 16
-; ZVBB-NEXT:    csrr a1, vlenb
-; ZVBB-NEXT:    srli a2, a1, 1
-; ZVBB-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
-; ZVBB-NEXT:    vsseg3e32.v v8, (a0)
-; ZVBB-NEXT:    add a3, a0, a2
-; ZVBB-NEXT:    vle32.v v9, (a3)
-; ZVBB-NEXT:    vle32.v v8, (a0)
-; ZVBB-NEXT:    srli a1, a1, 3
-; ZVBB-NEXT:    add a0, a1, a1
-; ZVBB-NEXT:    vsetvli zero, a0, e32, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v8, v9, a1
-; ZVBB-NEXT:    add a2, a3, a2
-; ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; ZVBB-NEXT:    vle32.v v9, (a2)
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a0, a0, 1
-; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    ret
-  %res = call <vscale x 3 x float> @llvm.vector.interleave3.nxv3f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2)
-  ret <vscale x 3 x float> %res
+; ZIP-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    addi sp, sp, -80
+; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    addi s0, sp, 80
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    li a1, 28
+; ZIP-NEXT:    mul a0, a0, a1
+; ZIP-NEXT:    sub sp, sp, a0
+; ZIP-NEXT:    andi sp, sp, -64
+; ZIP-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZIP-NEXT:    vmv2r.v v20, v16
+; ZIP-NEXT:    addi a0, sp, 64
+; ZIP-NEXT:    vmv2r.v v18, v12
+; ZIP-NEXT:    csrr a1, vlenb
+; ZIP-NEXT:    slli a2, a1, 2
+; ZIP-NEXT:    add a1, a2, a1
+; ZIP-NEXT:    add a1, sp, a1
+; ZIP-NEXT:    addi a1, a1, 64
+; ZIP-NEXT:    csrr a2, vlenb
+; ZIP-NEXT:    vmv2r.v v16, v8
+; ZIP-NEXT:    vmv2r.v v22, v16
+; ZIP-NEXT:    vmv2r.v v24, v18
+; ZIP-NEXT:    vmv1r.v v26, v20
+; ZIP-NEXT:    add a3, a0, a2
+; ZIP-NEXT:    vmv1r.v v23, v10
+; ZIP-NEXT:    add a4, a1, a2
+; ZIP-NEXT:    add a5, a4, a2
+; ZIP-NEXT:    vmv1r.v v25, v14
+; ZIP-NEXT:    add a6, a5, a2
+; ZIP-NEXT:    vmv1r.v v18, v11
+; ZIP-NEXT:    vsseg5e64.v v22, (a0)
+; ZIP-NEXT:    vmv1r.v v20, v15
+; ZIP-NEXT:    vsseg5e64.v v17, (a1)
+; ZIP-NEXT:    vl1re64.v v16, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re64.v v17, (a6)
+; ZIP-NEXT:    add a6, a3, a2
+; ZIP-NEXT:    vl1re64.v v10, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re64.v v11, (a6)
+; ZIP-NEXT:    vl1re64.v v8, (a0)
+; ZIP-NEXT:    vl1re64.v v9, (a3)
+; ZIP-NEXT:    vl1re64.v v14, (a4)
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    li a3, 10
+; ZIP-NEXT:    mul a0, a0, a3
+; ZIP-NEXT:    add a0, sp, a0
+; ZIP-NEXT:    addi a0, a0, 64
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re64.v v15, (a5)
+; ZIP-NEXT:    vl1re64.v v12, (a6)
+; ZIP-NEXT:    vl1re64.v v13, (a1)
+; ZIP-NEXT:    slli a2, a2, 3
+; ZIP-NEXT:    add a2, a0, a2
+; ZIP-NEXT:    vs2r.v v16, (a2)
+; ZIP-NEXT:    vs8r.v v8, (a0)
+; ZIP-NEXT:    vl8re64.v v16, (a2)
+; ZIP-NEXT:    vl8re64.v v8, (a0)
+; ZIP-NEXT:    addi sp, s0, -80
+; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    addi sp, sp, 80
+; ZIP-NEXT:    ret
+  %res = call <vscale x 10 x double> @llvm.vector.interleave5.nxv10f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4)
+  ret <vscale x 10 x double> %res
 }
 
-define <vscale x 6 x float> @vector_interleave_nxv6f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv6f32_nxv2f32:
+define <vscale x 12 x half> @vector_interleave_nxv12f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5) nounwind {
+; CHECK-LABEL: vector_interleave_nxv12f16_nxv2f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -4757,13 +9785,30 @@ define <vscale x 6 x float> @vector_interleave_nxv6f32_nxv2f32(<vscale x 2 x flo
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vsseg3e32.v v8, (a0)
-; CHECK-NEXT:    vl1re32.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl1re32.v v9, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl1re32.v v10, (a0)
+; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    add a4, a3, a2
+; CHECK-NEXT:    add a5, a4, a2
+; CHECK-NEXT:    vsetvli a6, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsseg6e16.v v8, (a0)
+; CHECK-NEXT:    add a6, a5, a2
+; CHECK-NEXT:    add a2, a6, a2
+; CHECK-NEXT:    vle16.v v10, (a6)
+; CHECK-NEXT:    vle16.v v8, (a2)
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a2, a1, a1
+; CHECK-NEXT:    vle16.v v11, (a5)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v8, a1
+; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v9, (a4)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v11, a1
+; CHECK-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v11, (a3)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v11, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a1, a0, 1
 ; CHECK-NEXT:    add a0, a1, a0
@@ -4771,7 +9816,7 @@ define <vscale x 6 x float> @vector_interleave_nxv6f32_nxv2f32(<vscale x 2 x flo
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv6f32_nxv2f32:
+; ZVBB-LABEL: vector_interleave_nxv12f16_nxv2f16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
@@ -4780,25 +9825,42 @@ define <vscale x 6 x float> @vector_interleave_nxv6f32_nxv2f32(<vscale x 2 x flo
 ; ZVBB-NEXT:    sub sp, sp, a0
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
-; ZVBB-NEXT:    vsetvli a2, zero, e32, m1, ta, ma
-; ZVBB-NEXT:    vsseg3e32.v v8, (a0)
-; ZVBB-NEXT:    vl1re32.v v8, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl1re32.v v9, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl1re32.v v10, (a0)
+; ZVBB-NEXT:    srli a2, a1, 1
+; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    add a4, a3, a2
+; ZVBB-NEXT:    add a5, a4, a2
+; ZVBB-NEXT:    vsetvli a6, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsseg6e16.v v8, (a0)
+; ZVBB-NEXT:    add a6, a5, a2
+; ZVBB-NEXT:    add a2, a6, a2
+; ZVBB-NEXT:    vle16.v v10, (a6)
+; ZVBB-NEXT:    vle16.v v8, (a2)
+; ZVBB-NEXT:    srli a1, a1, 2
+; ZVBB-NEXT:    add a2, a1, a1
+; ZVBB-NEXT:    vle16.v v11, (a5)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v10, v8, a1
+; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v9, (a4)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v9, v11, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v11, (a3)
+; ZVBB-NEXT:    vle16.v v8, (a0)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v11, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a1, a0, 1
 ; ZVBB-NEXT:    add a0, a1, a0
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 6 x float> @llvm.vector.interleave3.nxv6f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2)
-  ret <vscale x 6 x float> %res
+  %res = call <vscale x 12 x half> @llvm.vector.interleave6.nxv12f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5)
+  ret <vscale x 12 x half> %res
 }
 
-define <vscale x 12 x float> @vector_interleave_nxv12f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv12f32_nxv4f32:
+define <vscale x 24 x half> @vector_interleave_nxv24f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5) nounwind {
+; CHECK-LABEL: vector_interleave_nxv24f16_nxv4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -4807,14 +9869,19 @@ define <vscale x 12 x float> @vector_interleave_nxv12f32_nxv4f32(<vscale x 4 x f
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; CHECK-NEXT:    vsseg3e32.v v8, (a0)
-; CHECK-NEXT:    vl2re32.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl2re32.v v10, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl2re32.v v12, (a0)
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsseg6e16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v10, (a3)
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    vl1re16.v v11, (a3)
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    vl1re16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v9, (a2)
+; CHECK-NEXT:    vl1re16.v v12, (a3)
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vl1re16.v v13, (a1)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    li a1, 6
 ; CHECK-NEXT:    mul a0, a0, a1
@@ -4822,7 +9889,7 @@ define <vscale x 12 x float> @vector_interleave_nxv12f32_nxv4f32(<vscale x 4 x f
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv12f32_nxv4f32:
+; ZVBB-LABEL: vector_interleave_nxv24f16_nxv4f16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
@@ -4831,128 +9898,400 @@ define <vscale x 12 x float> @vector_interleave_nxv12f32_nxv4f32(<vscale x 4 x f
 ; ZVBB-NEXT:    sub sp, sp, a0
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
-; ZVBB-NEXT:    slli a1, a1, 1
-; ZVBB-NEXT:    vsetvli a2, zero, e32, m2, ta, ma
-; ZVBB-NEXT:    vsseg3e32.v v8, (a0)
-; ZVBB-NEXT:    vl2re32.v v8, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl2re32.v v10, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl2re32.v v12, (a0)
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vsseg6e16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v10, (a3)
+; ZVBB-NEXT:    add a3, a3, a1
+; ZVBB-NEXT:    vl1re16.v v11, (a3)
+; ZVBB-NEXT:    add a3, a3, a1
+; ZVBB-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v9, (a2)
+; ZVBB-NEXT:    vl1re16.v v12, (a3)
+; ZVBB-NEXT:    add a1, a3, a1
+; ZVBB-NEXT:    vl1re16.v v13, (a1)
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    li a1, 6
 ; ZVBB-NEXT:    mul a0, a0, a1
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 12 x float> @llvm.vector.interleave3.nxv12f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2)
-  ret <vscale x 12 x float> %res
+  %res = call <vscale x 24 x half> @llvm.vector.interleave6.nxv24f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5)
+  ret <vscale x 24 x half> %res
 }
 
-define <vscale x 3 x double> @vector_interleave_nxv3f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv3f64_nxv1f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 1
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vsseg3e64.v v8, (a0)
-; CHECK-NEXT:    vl1re64.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl1re64.v v9, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl1re64.v v10, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 1
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+define <vscale x 48 x half> @vector_interleave_nxv48f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5) nounwind {
+; RV32-LABEL: vector_interleave_nxv48f16_nxv8f16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -80
+; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi s0, sp, 80
+; RV32-NEXT:    csrr a0, vlenb
+; RV32-NEXT:    li a1, 28
+; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    sub sp, sp, a0
+; RV32-NEXT:    andi sp, sp, -64
+; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT:    vmv2r.v v20, v14
+; RV32-NEXT:    vmv2r.v v22, v12
+; RV32-NEXT:    vmv2r.v v24, v10
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    li a0, 6
+; RV32-NEXT:    mul a1, a1, a0
+; RV32-NEXT:    add a1, sp, a1
+; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    vmv1r.v v10, v25
+; RV32-NEXT:    vmv1r.v v11, v23
+; RV32-NEXT:    vmv1r.v v12, v21
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vmv1r.v v13, v17
+; RV32-NEXT:    csrr a2, vlenb
+; RV32-NEXT:    vmv1r.v v14, v19
+; RV32-NEXT:    vsseg6e16.v v9, (a1)
+; RV32-NEXT:    vmv1r.v v9, v24
+; RV32-NEXT:    add a5, a1, a2
+; RV32-NEXT:    vmv1r.v v10, v22
+; RV32-NEXT:    add a3, a0, a2
+; RV32-NEXT:    vmv1r.v v11, v20
+; RV32-NEXT:    add a4, a3, a2
+; RV32-NEXT:    vmv1r.v v12, v16
+; RV32-NEXT:    add a6, a5, a2
+; RV32-NEXT:    vmv1r.v v13, v18
+; RV32-NEXT:    vsseg6e16.v v8, (a0)
+; RV32-NEXT:    vl1re16.v v14, (a1)
+; RV32-NEXT:    add a1, a6, a2
+; RV32-NEXT:    vl1re16.v v15, (a5)
+; RV32-NEXT:    add a5, a1, a2
+; RV32-NEXT:    vl1re16.v v18, (a5)
+; RV32-NEXT:    add a5, a5, a2
+; RV32-NEXT:    vl1re16.v v19, (a5)
+; RV32-NEXT:    add a5, a4, a2
+; RV32-NEXT:    vl1re16.v v16, (a6)
+; RV32-NEXT:    add a6, a5, a2
+; RV32-NEXT:    vl1re16.v v12, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v13, (a6)
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    li a7, 12
+; RV32-NEXT:    mul a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 64
+; RV32-NEXT:    vl1re16.v v17, (a1)
+; RV32-NEXT:    vl1re16.v v10, (a4)
+; RV32-NEXT:    vl1re16.v v11, (a5)
+; RV32-NEXT:    vl1re16.v v8, (a0)
+; RV32-NEXT:    vl1re16.v v9, (a3)
+; RV32-NEXT:    slli a2, a2, 3
+; RV32-NEXT:    add a2, a6, a2
+; RV32-NEXT:    vs4r.v v16, (a2)
+; RV32-NEXT:    vs8r.v v8, (a6)
+; RV32-NEXT:    vl8re16.v v16, (a2)
+; RV32-NEXT:    vl8re16.v v8, (a6)
+; RV32-NEXT:    addi sp, s0, -80
+; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; RV32-NEXT:    addi sp, sp, 80
+; RV32-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv3f64_nxv1f64:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 1
-; ZVBB-NEXT:    add a0, a1, a0
-; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    addi a0, sp, 16
-; ZVBB-NEXT:    csrr a1, vlenb
-; ZVBB-NEXT:    vsetvli a2, zero, e64, m1, ta, ma
-; ZVBB-NEXT:    vsseg3e64.v v8, (a0)
-; ZVBB-NEXT:    vl1re64.v v8, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl1re64.v v9, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl1re64.v v10, (a0)
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 1
-; ZVBB-NEXT:    add a0, a1, a0
-; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    ret
-  %res = call <vscale x 3 x double> @llvm.vector.interleave3.nxv3f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2)
-  ret <vscale x 3 x double> %res
-}
-
-define <vscale x 6 x double> @vector_interleave_nxv6f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2) nounwind {
-; CHECK-LABEL: vector_interleave_nxv6f64_nxv2f64:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 6
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    slli a1, a1, 1
-; CHECK-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
-; CHECK-NEXT:    vsseg3e64.v v8, (a0)
-; CHECK-NEXT:    vl2re64.v v8, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl2re64.v v10, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl2re64.v v12, (a0)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    li a1, 6
-; CHECK-NEXT:    mul a0, a0, a1
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
+; RV64-LABEL: vector_interleave_nxv48f16_nxv8f16:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -80
+; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; RV64-NEXT:    addi s0, sp, 80
+; RV64-NEXT:    csrr a0, vlenb
+; RV64-NEXT:    li a1, 28
+; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    sub sp, sp, a0
+; RV64-NEXT:    andi sp, sp, -64
+; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT:    vmv2r.v v20, v14
+; RV64-NEXT:    vmv2r.v v22, v12
+; RV64-NEXT:    vmv2r.v v24, v10
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    li a0, 6
+; RV64-NEXT:    mul a1, a1, a0
+; RV64-NEXT:    add a1, sp, a1
+; RV64-NEXT:    addi a1, a1, 64
+; RV64-NEXT:    vmv1r.v v10, v25
+; RV64-NEXT:    vmv1r.v v11, v23
+; RV64-NEXT:    vmv1r.v v12, v21
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vmv1r.v v13, v17
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    vmv1r.v v14, v19
+; RV64-NEXT:    vsseg6e16.v v9, (a1)
+; RV64-NEXT:    vmv1r.v v9, v24
+; RV64-NEXT:    add a5, a1, a2
+; RV64-NEXT:    vmv1r.v v10, v22
+; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    vmv1r.v v11, v20
+; RV64-NEXT:    add a4, a3, a2
+; RV64-NEXT:    vmv1r.v v12, v16
+; RV64-NEXT:    add a6, a5, a2
+; RV64-NEXT:    vmv1r.v v13, v18
+; RV64-NEXT:    vsseg6e16.v v8, (a0)
+; RV64-NEXT:    vl1re16.v v14, (a1)
+; RV64-NEXT:    add a1, a6, a2
+; RV64-NEXT:    vl1re16.v v15, (a5)
+; RV64-NEXT:    add a5, a1, a2
+; RV64-NEXT:    vl1re16.v v18, (a5)
+; RV64-NEXT:    add a5, a5, a2
+; RV64-NEXT:    vl1re16.v v19, (a5)
+; RV64-NEXT:    add a5, a4, a2
+; RV64-NEXT:    vl1re16.v v16, (a6)
+; RV64-NEXT:    add a6, a5, a2
+; RV64-NEXT:    vl1re16.v v12, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v13, (a6)
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    li a7, 12
+; RV64-NEXT:    mul a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 64
+; RV64-NEXT:    vl1re16.v v17, (a1)
+; RV64-NEXT:    vl1re16.v v10, (a4)
+; RV64-NEXT:    vl1re16.v v11, (a5)
+; RV64-NEXT:    vl1re16.v v8, (a0)
+; RV64-NEXT:    vl1re16.v v9, (a3)
+; RV64-NEXT:    slli a2, a2, 3
+; RV64-NEXT:    add a2, a6, a2
+; RV64-NEXT:    vs4r.v v16, (a2)
+; RV64-NEXT:    vs8r.v v8, (a6)
+; RV64-NEXT:    vl8re16.v v16, (a2)
+; RV64-NEXT:    vl8re16.v v8, (a6)
+; RV64-NEXT:    addi sp, s0, -80
+; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 80
+; RV64-NEXT:    ret
+;
+; ZVBB-RV32-LABEL: vector_interleave_nxv48f16_nxv8f16:
+; ZVBB-RV32:       # %bb.0:
+; ZVBB-RV32-NEXT:    addi sp, sp, -80
+; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
+; ZVBB-RV32-NEXT:    addi s0, sp, 80
+; ZVBB-RV32-NEXT:    csrr a0, vlenb
+; ZVBB-RV32-NEXT:    li a1, 28
+; ZVBB-RV32-NEXT:    mul a0, a0, a1
+; ZVBB-RV32-NEXT:    sub sp, sp, a0
+; ZVBB-RV32-NEXT:    andi sp, sp, -64
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV32-NEXT:    vmv2r.v v20, v14
+; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV32-NEXT:    vmv2r.v v24, v10
+; ZVBB-RV32-NEXT:    csrr a1, vlenb
+; ZVBB-RV32-NEXT:    li a0, 6
+; ZVBB-RV32-NEXT:    mul a1, a1, a0
+; ZVBB-RV32-NEXT:    add a1, sp, a1
+; ZVBB-RV32-NEXT:    addi a1, a1, 64
+; ZVBB-RV32-NEXT:    vmv1r.v v10, v25
+; ZVBB-RV32-NEXT:    vmv1r.v v11, v23
+; ZVBB-RV32-NEXT:    vmv1r.v v12, v21
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
+; ZVBB-RV32-NEXT:    vmv1r.v v13, v17
+; ZVBB-RV32-NEXT:    csrr a2, vlenb
+; ZVBB-RV32-NEXT:    vmv1r.v v14, v19
+; ZVBB-RV32-NEXT:    vsseg6e16.v v9, (a1)
+; ZVBB-RV32-NEXT:    vmv1r.v v9, v24
+; ZVBB-RV32-NEXT:    add a5, a1, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v10, v22
+; ZVBB-RV32-NEXT:    add a3, a0, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v11, v20
+; ZVBB-RV32-NEXT:    add a4, a3, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v12, v16
+; ZVBB-RV32-NEXT:    add a6, a5, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v13, v18
+; ZVBB-RV32-NEXT:    vsseg6e16.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re16.v v14, (a1)
+; ZVBB-RV32-NEXT:    add a1, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v15, (a5)
+; ZVBB-RV32-NEXT:    add a5, a1, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v18, (a5)
+; ZVBB-RV32-NEXT:    add a5, a5, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v19, (a5)
+; ZVBB-RV32-NEXT:    add a5, a4, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v16, (a6)
+; ZVBB-RV32-NEXT:    add a6, a5, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v12, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-RV32-NEXT:    csrr a6, vlenb
+; ZVBB-RV32-NEXT:    li a7, 12
+; ZVBB-RV32-NEXT:    mul a6, a6, a7
+; ZVBB-RV32-NEXT:    add a6, sp, a6
+; ZVBB-RV32-NEXT:    addi a6, a6, 64
+; ZVBB-RV32-NEXT:    vl1re16.v v17, (a1)
+; ZVBB-RV32-NEXT:    vl1re16.v v10, (a4)
+; ZVBB-RV32-NEXT:    vl1re16.v v11, (a5)
+; ZVBB-RV32-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re16.v v9, (a3)
+; ZVBB-RV32-NEXT:    slli a2, a2, 3
+; ZVBB-RV32-NEXT:    add a2, a6, a2
+; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a6)
+; ZVBB-RV32-NEXT:    vl8re16.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8re16.v v8, (a6)
+; ZVBB-RV32-NEXT:    addi sp, s0, -80
+; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
+; ZVBB-RV32-NEXT:    addi sp, sp, 80
+; ZVBB-RV32-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv6f64_nxv2f64:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    li a1, 6
-; ZVBB-NEXT:    mul a0, a0, a1
-; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    addi a0, sp, 16
-; ZVBB-NEXT:    csrr a1, vlenb
-; ZVBB-NEXT:    slli a1, a1, 1
-; ZVBB-NEXT:    vsetvli a2, zero, e64, m2, ta, ma
-; ZVBB-NEXT:    vsseg3e64.v v8, (a0)
-; ZVBB-NEXT:    vl2re64.v v8, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl2re64.v v10, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl2re64.v v12, (a0)
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    li a1, 6
-; ZVBB-NEXT:    mul a0, a0, a1
-; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    ret
-  %res = call <vscale x 6 x double> @llvm.vector.interleave3.nxv6f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2)
-  ret <vscale x 6 x double> %res
+; ZVBB-RV64-LABEL: vector_interleave_nxv48f16_nxv8f16:
+; ZVBB-RV64:       # %bb.0:
+; ZVBB-RV64-NEXT:    addi sp, sp, -80
+; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZVBB-RV64-NEXT:    addi s0, sp, 80
+; ZVBB-RV64-NEXT:    csrr a0, vlenb
+; ZVBB-RV64-NEXT:    li a1, 28
+; ZVBB-RV64-NEXT:    mul a0, a0, a1
+; ZVBB-RV64-NEXT:    sub sp, sp, a0
+; ZVBB-RV64-NEXT:    andi sp, sp, -64
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV64-NEXT:    vmv2r.v v20, v14
+; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV64-NEXT:    vmv2r.v v24, v10
+; ZVBB-RV64-NEXT:    csrr a1, vlenb
+; ZVBB-RV64-NEXT:    li a0, 6
+; ZVBB-RV64-NEXT:    mul a1, a1, a0
+; ZVBB-RV64-NEXT:    add a1, sp, a1
+; ZVBB-RV64-NEXT:    addi a1, a1, 64
+; ZVBB-RV64-NEXT:    vmv1r.v v10, v25
+; ZVBB-RV64-NEXT:    vmv1r.v v11, v23
+; ZVBB-RV64-NEXT:    vmv1r.v v12, v21
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
+; ZVBB-RV64-NEXT:    vmv1r.v v13, v17
+; ZVBB-RV64-NEXT:    csrr a2, vlenb
+; ZVBB-RV64-NEXT:    vmv1r.v v14, v19
+; ZVBB-RV64-NEXT:    vsseg6e16.v v9, (a1)
+; ZVBB-RV64-NEXT:    vmv1r.v v9, v24
+; ZVBB-RV64-NEXT:    add a5, a1, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v10, v22
+; ZVBB-RV64-NEXT:    add a3, a0, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v11, v20
+; ZVBB-RV64-NEXT:    add a4, a3, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v12, v16
+; ZVBB-RV64-NEXT:    add a6, a5, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v13, v18
+; ZVBB-RV64-NEXT:    vsseg6e16.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re16.v v14, (a1)
+; ZVBB-RV64-NEXT:    add a1, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v15, (a5)
+; ZVBB-RV64-NEXT:    add a5, a1, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v18, (a5)
+; ZVBB-RV64-NEXT:    add a5, a5, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v19, (a5)
+; ZVBB-RV64-NEXT:    add a5, a4, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v16, (a6)
+; ZVBB-RV64-NEXT:    add a6, a5, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v12, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-RV64-NEXT:    csrr a6, vlenb
+; ZVBB-RV64-NEXT:    li a7, 12
+; ZVBB-RV64-NEXT:    mul a6, a6, a7
+; ZVBB-RV64-NEXT:    add a6, sp, a6
+; ZVBB-RV64-NEXT:    addi a6, a6, 64
+; ZVBB-RV64-NEXT:    vl1re16.v v17, (a1)
+; ZVBB-RV64-NEXT:    vl1re16.v v10, (a4)
+; ZVBB-RV64-NEXT:    vl1re16.v v11, (a5)
+; ZVBB-RV64-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re16.v v9, (a3)
+; ZVBB-RV64-NEXT:    slli a2, a2, 3
+; ZVBB-RV64-NEXT:    add a2, a6, a2
+; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a6)
+; ZVBB-RV64-NEXT:    vl8re16.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8re16.v v8, (a6)
+; ZVBB-RV64-NEXT:    addi sp, s0, -80
+; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZVBB-RV64-NEXT:    addi sp, sp, 80
+; ZVBB-RV64-NEXT:    ret
+;
+; ZIP-LABEL: vector_interleave_nxv48f16_nxv8f16:
+; ZIP:       # %bb.0:
+; ZIP-NEXT:    addi sp, sp, -80
+; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
+; ZIP-NEXT:    addi s0, sp, 80
+; ZIP-NEXT:    csrr a0, vlenb
+; ZIP-NEXT:    li a1, 28
+; ZIP-NEXT:    mul a0, a0, a1
+; ZIP-NEXT:    sub sp, sp, a0
+; ZIP-NEXT:    andi sp, sp, -64
+; ZIP-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT:    vmv2r.v v20, v14
+; ZIP-NEXT:    vmv2r.v v22, v12
+; ZIP-NEXT:    vmv2r.v v24, v10
+; ZIP-NEXT:    csrr a1, vlenb
+; ZIP-NEXT:    li a0, 6
+; ZIP-NEXT:    mul a1, a1, a0
+; ZIP-NEXT:    add a1, sp, a1
+; ZIP-NEXT:    addi a1, a1, 64
+; ZIP-NEXT:    vmv1r.v v10, v25
+; ZIP-NEXT:    vmv1r.v v11, v23
+; ZIP-NEXT:    vmv1r.v v12, v21
+; ZIP-NEXT:    addi a0, sp, 64
+; ZIP-NEXT:    vmv1r.v v13, v17
+; ZIP-NEXT:    csrr a2, vlenb
+; ZIP-NEXT:    vmv1r.v v14, v19
+; ZIP-NEXT:    vsseg6e16.v v9, (a1)
+; ZIP-NEXT:    vmv1r.v v9, v24
+; ZIP-NEXT:    add a5, a1, a2
+; ZIP-NEXT:    vmv1r.v v10, v22
+; ZIP-NEXT:    add a3, a0, a2
+; ZIP-NEXT:    vmv1r.v v11, v20
+; ZIP-NEXT:    add a4, a3, a2
+; ZIP-NEXT:    vmv1r.v v12, v16
+; ZIP-NEXT:    add a6, a5, a2
+; ZIP-NEXT:    vmv1r.v v13, v18
+; ZIP-NEXT:    vsseg6e16.v v8, (a0)
+; ZIP-NEXT:    vl1re16.v v14, (a1)
+; ZIP-NEXT:    add a1, a6, a2
+; ZIP-NEXT:    vl1re16.v v15, (a5)
+; ZIP-NEXT:    add a5, a1, a2
+; ZIP-NEXT:    vl1re16.v v18, (a5)
+; ZIP-NEXT:    add a5, a5, a2
+; ZIP-NEXT:    vl1re16.v v19, (a5)
+; ZIP-NEXT:    add a5, a4, a2
+; ZIP-NEXT:    vl1re16.v v16, (a6)
+; ZIP-NEXT:    add a6, a5, a2
+; ZIP-NEXT:    vl1re16.v v12, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v13, (a6)
+; ZIP-NEXT:    csrr a6, vlenb
+; ZIP-NEXT:    li a7, 12
+; ZIP-NEXT:    mul a6, a6, a7
+; ZIP-NEXT:    add a6, sp, a6
+; ZIP-NEXT:    addi a6, a6, 64
+; ZIP-NEXT:    vl1re16.v v17, (a1)
+; ZIP-NEXT:    vl1re16.v v10, (a4)
+; ZIP-NEXT:    vl1re16.v v11, (a5)
+; ZIP-NEXT:    vl1re16.v v8, (a0)
+; ZIP-NEXT:    vl1re16.v v9, (a3)
+; ZIP-NEXT:    slli a2, a2, 3
+; ZIP-NEXT:    add a2, a6, a2
+; ZIP-NEXT:    vs4r.v v16, (a2)
+; ZIP-NEXT:    vs8r.v v8, (a6)
+; ZIP-NEXT:    vl8re16.v v16, (a2)
+; ZIP-NEXT:    vl8re16.v v8, (a6)
+; ZIP-NEXT:    addi sp, s0, -80
+; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
+; ZIP-NEXT:    addi sp, sp, 80
+; ZIP-NEXT:    ret
+  %res = call <vscale x 48 x half> @llvm.vector.interleave6.nxv48f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5)
+  ret <vscale x 48 x half> %res
 }
 
-define <vscale x 10 x half> @vector_interleave_nxv10f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4) nounwind {
-; CHECK-LABEL: vector_interleave_nxv10f16_nxv2f16:
+define <vscale x 12 x bfloat> @vector_interleave_nxv12bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5) nounwind {
+; CHECK-LABEL: vector_interleave_nxv12bf16_nxv2bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -4964,23 +10303,27 @@ define <vscale x 10 x half> @vector_interleave_nxv10f16_nxv2f16(<vscale x 2 x ha
 ; CHECK-NEXT:    srli a2, a1, 1
 ; CHECK-NEXT:    add a3, a0, a2
 ; CHECK-NEXT:    add a4, a3, a2
-; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vsseg5e16.v v8, (a0)
 ; CHECK-NEXT:    add a5, a4, a2
-; CHECK-NEXT:    vle16.v v8, (a5)
-; CHECK-NEXT:    vle16.v v9, (a4)
+; CHECK-NEXT:    vsetvli a6, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsseg6e16.v v8, (a0)
+; CHECK-NEXT:    add a6, a5, a2
+; CHECK-NEXT:    add a2, a6, a2
+; CHECK-NEXT:    vle16.v v10, (a6)
+; CHECK-NEXT:    vle16.v v8, (a2)
 ; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    add a4, a1, a1
-; CHECK-NEXT:    vle16.v v10, (a3)
-; CHECK-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v8, a1
-; CHECK-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    add a2, a1, a1
+; CHECK-NEXT:    vle16.v v11, (a5)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v8, a1
+; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v9, (a4)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v11, a1
+; CHECK-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v11, (a3)
 ; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v10, a1
-; CHECK-NEXT:    add a2, a5, a2
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v11, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a1, a0, 1
 ; CHECK-NEXT:    add a0, a1, a0
@@ -4988,7 +10331,7 @@ define <vscale x 10 x half> @vector_interleave_nxv10f16_nxv2f16(<vscale x 2 x ha
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv10f16_nxv2f16:
+; ZVBB-LABEL: vector_interleave_nxv12bf16_nxv2bf16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
@@ -5000,93 +10343,101 @@ define <vscale x 10 x half> @vector_interleave_nxv10f16_nxv2f16(<vscale x 2 x ha
 ; ZVBB-NEXT:    srli a2, a1, 1
 ; ZVBB-NEXT:    add a3, a0, a2
 ; ZVBB-NEXT:    add a4, a3, a2
-; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
-; ZVBB-NEXT:    vsseg5e16.v v8, (a0)
 ; ZVBB-NEXT:    add a5, a4, a2
-; ZVBB-NEXT:    vle16.v v8, (a5)
-; ZVBB-NEXT:    vle16.v v9, (a4)
+; ZVBB-NEXT:    vsetvli a6, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsseg6e16.v v8, (a0)
+; ZVBB-NEXT:    add a6, a5, a2
+; ZVBB-NEXT:    add a2, a6, a2
+; ZVBB-NEXT:    vle16.v v10, (a6)
+; ZVBB-NEXT:    vle16.v v8, (a2)
 ; ZVBB-NEXT:    srli a1, a1, 2
-; ZVBB-NEXT:    add a4, a1, a1
-; ZVBB-NEXT:    vle16.v v10, (a3)
-; ZVBB-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v9, v8, a1
-; ZVBB-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    add a2, a1, a1
+; ZVBB-NEXT:    vle16.v v11, (a5)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v10, v8, a1
+; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v9, (a4)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v9, v11, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v11, (a3)
 ; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v8, v10, a1
-; ZVBB-NEXT:    add a2, a5, a2
-; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZVBB-NEXT:    vle16.v v10, (a2)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v11, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a1, a0, 1
 ; ZVBB-NEXT:    add a0, a1, a0
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 10 x half> @llvm.vector.interleave5.nxv10f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4)
-  ret <vscale x 10 x half> %res
+  %res = call <vscale x 12 x bfloat> @llvm.vector.interleave6.nxv12bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5)
+  ret <vscale x 12 x bfloat> %res
 }
 
-define <vscale x 20 x half> @vector_interleave_nxv20f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4) nounwind {
-; CHECK-LABEL: vector_interleave_nxv20f16_nxv4f16:
+define <vscale x 24 x bfloat> @vector_interleave_nxv24bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5) nounwind {
+; CHECK-LABEL: vector_interleave_nxv24bf16_nxv4bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 2
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    add a3, a2, a1
 ; CHECK-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vsseg5e16.v v8, (a0)
+; CHECK-NEXT:    vsseg6e16.v v8, (a0)
 ; CHECK-NEXT:    vl1re16.v v10, (a3)
 ; CHECK-NEXT:    add a3, a3, a1
 ; CHECK-NEXT:    vl1re16.v v11, (a3)
+; CHECK-NEXT:    add a3, a3, a1
 ; CHECK-NEXT:    vl1re16.v v8, (a0)
 ; CHECK-NEXT:    vl1re16.v v9, (a2)
+; CHECK-NEXT:    vl1re16.v v12, (a3)
 ; CHECK-NEXT:    add a1, a3, a1
-; CHECK-NEXT:    vl1re16.v v12, (a1)
+; CHECK-NEXT:    vl1re16.v v13, (a1)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 2
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv20f16_nxv4f16:
+; ZVBB-LABEL: vector_interleave_nxv24bf16_nxv4bf16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 2
-; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
 ; ZVBB-NEXT:    sub sp, sp, a0
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    add a2, a0, a1
 ; ZVBB-NEXT:    add a3, a2, a1
 ; ZVBB-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
-; ZVBB-NEXT:    vsseg5e16.v v8, (a0)
+; ZVBB-NEXT:    vsseg6e16.v v8, (a0)
 ; ZVBB-NEXT:    vl1re16.v v10, (a3)
 ; ZVBB-NEXT:    add a3, a3, a1
 ; ZVBB-NEXT:    vl1re16.v v11, (a3)
+; ZVBB-NEXT:    add a3, a3, a1
 ; ZVBB-NEXT:    vl1re16.v v8, (a0)
 ; ZVBB-NEXT:    vl1re16.v v9, (a2)
+; ZVBB-NEXT:    vl1re16.v v12, (a3)
 ; ZVBB-NEXT:    add a1, a3, a1
-; ZVBB-NEXT:    vl1re16.v v12, (a1)
+; ZVBB-NEXT:    vl1re16.v v13, (a1)
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 2
-; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 20 x half> @llvm.vector.interleave5.nxv20f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4)
-  ret <vscale x 20 x half> %res
+  %res = call <vscale x 24 x bfloat> @llvm.vector.interleave6.nxv24bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5)
+  ret <vscale x 24 x bfloat> %res
 }
 
-define <vscale x 40 x half> @vector_interleave_nxv40f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4) nounwind {
-; RV32-LABEL: vector_interleave_nxv40f16_nxv8f16:
+define <vscale x 48 x bfloat> @vector_interleave_nxv48bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5) nounwind {
+; RV32-LABEL: vector_interleave_nxv48bf16_nxv8bf16:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -80
 ; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
@@ -5098,61 +10449,68 @@ define <vscale x 40 x half> @vector_interleave_nxv40f16_nxv8f16(<vscale x 8 x ha
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -64
 ; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV32-NEXT:    vmv2r.v v20, v16
-; RV32-NEXT:    addi a0, sp, 64
-; RV32-NEXT:    vmv2r.v v18, v12
+; RV32-NEXT:    vmv2r.v v20, v14
+; RV32-NEXT:    vmv2r.v v22, v12
+; RV32-NEXT:    vmv2r.v v24, v10
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 2
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    li a0, 6
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    vmv1r.v v10, v25
+; RV32-NEXT:    vmv1r.v v11, v23
+; RV32-NEXT:    vmv1r.v v12, v21
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vmv1r.v v13, v17
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    vmv2r.v v16, v8
-; RV32-NEXT:    vmv2r.v v22, v16
-; RV32-NEXT:    vmv2r.v v24, v18
-; RV32-NEXT:    vmv1r.v v26, v20
+; RV32-NEXT:    vmv1r.v v14, v19
+; RV32-NEXT:    vsseg6e16.v v9, (a1)
+; RV32-NEXT:    vmv1r.v v9, v24
+; RV32-NEXT:    add a5, a1, a2
+; RV32-NEXT:    vmv1r.v v10, v22
 ; RV32-NEXT:    add a3, a0, a2
-; RV32-NEXT:    vmv1r.v v23, v10
-; RV32-NEXT:    add a4, a1, a2
-; RV32-NEXT:    add a5, a4, a2
-; RV32-NEXT:    vmv1r.v v25, v14
+; RV32-NEXT:    vmv1r.v v11, v20
+; RV32-NEXT:    add a4, a3, a2
+; RV32-NEXT:    vmv1r.v v12, v16
 ; RV32-NEXT:    add a6, a5, a2
-; RV32-NEXT:    vmv1r.v v18, v11
-; RV32-NEXT:    vsseg5e16.v v22, (a0)
-; RV32-NEXT:    vmv1r.v v20, v15
-; RV32-NEXT:    vsseg5e16.v v17, (a1)
+; RV32-NEXT:    vmv1r.v v13, v18
+; RV32-NEXT:    vsseg6e16.v v8, (a0)
+; RV32-NEXT:    vl1re16.v v14, (a1)
+; RV32-NEXT:    add a1, a6, a2
+; RV32-NEXT:    vl1re16.v v15, (a5)
+; RV32-NEXT:    add a5, a1, a2
+; RV32-NEXT:    vl1re16.v v18, (a5)
+; RV32-NEXT:    add a5, a5, a2
+; RV32-NEXT:    vl1re16.v v19, (a5)
+; RV32-NEXT:    add a5, a4, a2
 ; RV32-NEXT:    vl1re16.v v16, (a6)
+; RV32-NEXT:    add a6, a5, a2
+; RV32-NEXT:    vl1re16.v v12, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v17, (a6)
-; RV32-NEXT:    add a6, a3, a2
-; RV32-NEXT:    vl1re16.v v10, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v11, (a6)
+; RV32-NEXT:    vl1re16.v v13, (a6)
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    li a7, 12
+; RV32-NEXT:    mul a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 64
+; RV32-NEXT:    vl1re16.v v17, (a1)
+; RV32-NEXT:    vl1re16.v v10, (a4)
+; RV32-NEXT:    vl1re16.v v11, (a5)
 ; RV32-NEXT:    vl1re16.v v8, (a0)
 ; RV32-NEXT:    vl1re16.v v9, (a3)
-; RV32-NEXT:    vl1re16.v v14, (a4)
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a3, 10
-; RV32-NEXT:    mul a0, a0, a3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 64
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v15, (a5)
-; RV32-NEXT:    vl1re16.v v12, (a6)
-; RV32-NEXT:    vl1re16.v v13, (a1)
 ; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, a0, a2
-; RV32-NEXT:    vs2r.v v16, (a2)
-; RV32-NEXT:    vs8r.v v8, (a0)
+; RV32-NEXT:    add a2, a6, a2
+; RV32-NEXT:    vs4r.v v16, (a2)
+; RV32-NEXT:    vs8r.v v8, (a6)
 ; RV32-NEXT:    vl8re16.v v16, (a2)
-; RV32-NEXT:    vl8re16.v v8, (a0)
+; RV32-NEXT:    vl8re16.v v8, (a6)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; RV64-LABEL: vector_interleave_nxv48bf16_nxv8bf16:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -80
 ; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -5164,61 +10522,68 @@ define <vscale x 40 x half> @vector_interleave_nxv40f16_nxv8f16(<vscale x 8 x ha
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -64
 ; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV64-NEXT:    vmv2r.v v20, v16
-; RV64-NEXT:    addi a0, sp, 64
-; RV64-NEXT:    vmv2r.v v18, v12
+; RV64-NEXT:    vmv2r.v v20, v14
+; RV64-NEXT:    vmv2r.v v22, v12
+; RV64-NEXT:    vmv2r.v v24, v10
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 2
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    li a0, 6
+; RV64-NEXT:    mul a1, a1, a0
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 64
+; RV64-NEXT:    vmv1r.v v10, v25
+; RV64-NEXT:    vmv1r.v v11, v23
+; RV64-NEXT:    vmv1r.v v12, v21
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vmv1r.v v13, v17
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    vmv2r.v v16, v8
-; RV64-NEXT:    vmv2r.v v22, v16
-; RV64-NEXT:    vmv2r.v v24, v18
-; RV64-NEXT:    vmv1r.v v26, v20
+; RV64-NEXT:    vmv1r.v v14, v19
+; RV64-NEXT:    vsseg6e16.v v9, (a1)
+; RV64-NEXT:    vmv1r.v v9, v24
+; RV64-NEXT:    add a5, a1, a2
+; RV64-NEXT:    vmv1r.v v10, v22
 ; RV64-NEXT:    add a3, a0, a2
-; RV64-NEXT:    vmv1r.v v23, v10
-; RV64-NEXT:    add a4, a1, a2
-; RV64-NEXT:    add a5, a4, a2
-; RV64-NEXT:    vmv1r.v v25, v14
+; RV64-NEXT:    vmv1r.v v11, v20
+; RV64-NEXT:    add a4, a3, a2
+; RV64-NEXT:    vmv1r.v v12, v16
 ; RV64-NEXT:    add a6, a5, a2
-; RV64-NEXT:    vmv1r.v v18, v11
-; RV64-NEXT:    vsseg5e16.v v22, (a0)
-; RV64-NEXT:    vmv1r.v v20, v15
-; RV64-NEXT:    vsseg5e16.v v17, (a1)
+; RV64-NEXT:    vmv1r.v v13, v18
+; RV64-NEXT:    vsseg6e16.v v8, (a0)
+; RV64-NEXT:    vl1re16.v v14, (a1)
+; RV64-NEXT:    add a1, a6, a2
+; RV64-NEXT:    vl1re16.v v15, (a5)
+; RV64-NEXT:    add a5, a1, a2
+; RV64-NEXT:    vl1re16.v v18, (a5)
+; RV64-NEXT:    add a5, a5, a2
+; RV64-NEXT:    vl1re16.v v19, (a5)
+; RV64-NEXT:    add a5, a4, a2
 ; RV64-NEXT:    vl1re16.v v16, (a6)
+; RV64-NEXT:    add a6, a5, a2
+; RV64-NEXT:    vl1re16.v v12, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v17, (a6)
-; RV64-NEXT:    add a6, a3, a2
-; RV64-NEXT:    vl1re16.v v10, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v11, (a6)
+; RV64-NEXT:    vl1re16.v v13, (a6)
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    li a7, 12
+; RV64-NEXT:    mul a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 64
+; RV64-NEXT:    vl1re16.v v17, (a1)
+; RV64-NEXT:    vl1re16.v v10, (a4)
+; RV64-NEXT:    vl1re16.v v11, (a5)
 ; RV64-NEXT:    vl1re16.v v8, (a0)
 ; RV64-NEXT:    vl1re16.v v9, (a3)
-; RV64-NEXT:    vl1re16.v v14, (a4)
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a3, 10
-; RV64-NEXT:    mul a0, a0, a3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 64
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v15, (a5)
-; RV64-NEXT:    vl1re16.v v12, (a6)
-; RV64-NEXT:    vl1re16.v v13, (a1)
 ; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, a0, a2
-; RV64-NEXT:    vs2r.v v16, (a2)
-; RV64-NEXT:    vs8r.v v8, (a0)
+; RV64-NEXT:    add a2, a6, a2
+; RV64-NEXT:    vs4r.v v16, (a2)
+; RV64-NEXT:    vs8r.v v8, (a6)
 ; RV64-NEXT:    vl8re16.v v16, (a2)
-; RV64-NEXT:    vl8re16.v v8, (a0)
+; RV64-NEXT:    vl8re16.v v8, (a6)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 80
 ; RV64-NEXT:    ret
 ;
-; ZVBB-RV32-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; ZVBB-RV32-LABEL: vector_interleave_nxv48bf16_nxv8bf16:
 ; ZVBB-RV32:       # %bb.0:
 ; ZVBB-RV32-NEXT:    addi sp, sp, -80
 ; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
@@ -5230,61 +10595,68 @@ define <vscale x 40 x half> @vector_interleave_nxv40f16_nxv8f16(<vscale x 8 x ha
 ; ZVBB-RV32-NEXT:    sub sp, sp, a0
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
 ; ZVBB-RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-RV32-NEXT:    vmv2r.v v20, v16
-; ZVBB-RV32-NEXT:    addi a0, sp, 64
-; ZVBB-RV32-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV32-NEXT:    vmv2r.v v20, v14
+; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV32-NEXT:    vmv2r.v v24, v10
 ; ZVBB-RV32-NEXT:    csrr a1, vlenb
-; ZVBB-RV32-NEXT:    slli a2, a1, 2
-; ZVBB-RV32-NEXT:    add a1, a2, a1
+; ZVBB-RV32-NEXT:    li a0, 6
+; ZVBB-RV32-NEXT:    mul a1, a1, a0
 ; ZVBB-RV32-NEXT:    add a1, sp, a1
 ; ZVBB-RV32-NEXT:    addi a1, a1, 64
+; ZVBB-RV32-NEXT:    vmv1r.v v10, v25
+; ZVBB-RV32-NEXT:    vmv1r.v v11, v23
+; ZVBB-RV32-NEXT:    vmv1r.v v12, v21
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
+; ZVBB-RV32-NEXT:    vmv1r.v v13, v17
 ; ZVBB-RV32-NEXT:    csrr a2, vlenb
-; ZVBB-RV32-NEXT:    vmv2r.v v16, v8
-; ZVBB-RV32-NEXT:    vmv2r.v v22, v16
-; ZVBB-RV32-NEXT:    vmv2r.v v24, v18
-; ZVBB-RV32-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV32-NEXT:    vmv1r.v v14, v19
+; ZVBB-RV32-NEXT:    vsseg6e16.v v9, (a1)
+; ZVBB-RV32-NEXT:    vmv1r.v v9, v24
+; ZVBB-RV32-NEXT:    add a5, a1, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v10, v22
 ; ZVBB-RV32-NEXT:    add a3, a0, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v23, v10
-; ZVBB-RV32-NEXT:    add a4, a1, a2
-; ZVBB-RV32-NEXT:    add a5, a4, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV32-NEXT:    vmv1r.v v11, v20
+; ZVBB-RV32-NEXT:    add a4, a3, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v12, v16
 ; ZVBB-RV32-NEXT:    add a6, a5, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v18, v11
-; ZVBB-RV32-NEXT:    vsseg5e16.v v22, (a0)
-; ZVBB-RV32-NEXT:    vmv1r.v v20, v15
-; ZVBB-RV32-NEXT:    vsseg5e16.v v17, (a1)
+; ZVBB-RV32-NEXT:    vmv1r.v v13, v18
+; ZVBB-RV32-NEXT:    vsseg6e16.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re16.v v14, (a1)
+; ZVBB-RV32-NEXT:    add a1, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v15, (a5)
+; ZVBB-RV32-NEXT:    add a5, a1, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v18, (a5)
+; ZVBB-RV32-NEXT:    add a5, a5, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v19, (a5)
+; ZVBB-RV32-NEXT:    add a5, a4, a2
 ; ZVBB-RV32-NEXT:    vl1re16.v v16, (a6)
+; ZVBB-RV32-NEXT:    add a6, a5, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v12, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v17, (a6)
-; ZVBB-RV32-NEXT:    add a6, a3, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v10, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-RV32-NEXT:    csrr a6, vlenb
+; ZVBB-RV32-NEXT:    li a7, 12
+; ZVBB-RV32-NEXT:    mul a6, a6, a7
+; ZVBB-RV32-NEXT:    add a6, sp, a6
+; ZVBB-RV32-NEXT:    addi a6, a6, 64
+; ZVBB-RV32-NEXT:    vl1re16.v v17, (a1)
+; ZVBB-RV32-NEXT:    vl1re16.v v10, (a4)
+; ZVBB-RV32-NEXT:    vl1re16.v v11, (a5)
 ; ZVBB-RV32-NEXT:    vl1re16.v v8, (a0)
 ; ZVBB-RV32-NEXT:    vl1re16.v v9, (a3)
-; ZVBB-RV32-NEXT:    vl1re16.v v14, (a4)
-; ZVBB-RV32-NEXT:    csrr a0, vlenb
-; ZVBB-RV32-NEXT:    li a3, 10
-; ZVBB-RV32-NEXT:    mul a0, a0, a3
-; ZVBB-RV32-NEXT:    add a0, sp, a0
-; ZVBB-RV32-NEXT:    addi a0, a0, 64
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v15, (a5)
-; ZVBB-RV32-NEXT:    vl1re16.v v12, (a6)
-; ZVBB-RV32-NEXT:    vl1re16.v v13, (a1)
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
-; ZVBB-RV32-NEXT:    add a2, a0, a2
-; ZVBB-RV32-NEXT:    vs2r.v v16, (a2)
-; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV32-NEXT:    add a2, a6, a2
+; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a6)
 ; ZVBB-RV32-NEXT:    vl8re16.v v16, (a2)
-; ZVBB-RV32-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8re16.v v8, (a6)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
 ; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    addi sp, sp, 80
 ; ZVBB-RV32-NEXT:    ret
 ;
-; ZVBB-RV64-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; ZVBB-RV64-LABEL: vector_interleave_nxv48bf16_nxv8bf16:
 ; ZVBB-RV64:       # %bb.0:
 ; ZVBB-RV64-NEXT:    addi sp, sp, -80
 ; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -5296,61 +10668,68 @@ define <vscale x 40 x half> @vector_interleave_nxv40f16_nxv8f16(<vscale x 8 x ha
 ; ZVBB-RV64-NEXT:    sub sp, sp, a0
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
 ; ZVBB-RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-RV64-NEXT:    vmv2r.v v20, v16
-; ZVBB-RV64-NEXT:    addi a0, sp, 64
-; ZVBB-RV64-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV64-NEXT:    vmv2r.v v20, v14
+; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV64-NEXT:    vmv2r.v v24, v10
 ; ZVBB-RV64-NEXT:    csrr a1, vlenb
-; ZVBB-RV64-NEXT:    slli a2, a1, 2
-; ZVBB-RV64-NEXT:    add a1, a2, a1
+; ZVBB-RV64-NEXT:    li a0, 6
+; ZVBB-RV64-NEXT:    mul a1, a1, a0
 ; ZVBB-RV64-NEXT:    add a1, sp, a1
 ; ZVBB-RV64-NEXT:    addi a1, a1, 64
+; ZVBB-RV64-NEXT:    vmv1r.v v10, v25
+; ZVBB-RV64-NEXT:    vmv1r.v v11, v23
+; ZVBB-RV64-NEXT:    vmv1r.v v12, v21
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
+; ZVBB-RV64-NEXT:    vmv1r.v v13, v17
 ; ZVBB-RV64-NEXT:    csrr a2, vlenb
-; ZVBB-RV64-NEXT:    vmv2r.v v16, v8
-; ZVBB-RV64-NEXT:    vmv2r.v v22, v16
-; ZVBB-RV64-NEXT:    vmv2r.v v24, v18
-; ZVBB-RV64-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV64-NEXT:    vmv1r.v v14, v19
+; ZVBB-RV64-NEXT:    vsseg6e16.v v9, (a1)
+; ZVBB-RV64-NEXT:    vmv1r.v v9, v24
+; ZVBB-RV64-NEXT:    add a5, a1, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v10, v22
 ; ZVBB-RV64-NEXT:    add a3, a0, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v23, v10
-; ZVBB-RV64-NEXT:    add a4, a1, a2
-; ZVBB-RV64-NEXT:    add a5, a4, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV64-NEXT:    vmv1r.v v11, v20
+; ZVBB-RV64-NEXT:    add a4, a3, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v12, v16
 ; ZVBB-RV64-NEXT:    add a6, a5, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v18, v11
-; ZVBB-RV64-NEXT:    vsseg5e16.v v22, (a0)
-; ZVBB-RV64-NEXT:    vmv1r.v v20, v15
-; ZVBB-RV64-NEXT:    vsseg5e16.v v17, (a1)
+; ZVBB-RV64-NEXT:    vmv1r.v v13, v18
+; ZVBB-RV64-NEXT:    vsseg6e16.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re16.v v14, (a1)
+; ZVBB-RV64-NEXT:    add a1, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v15, (a5)
+; ZVBB-RV64-NEXT:    add a5, a1, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v18, (a5)
+; ZVBB-RV64-NEXT:    add a5, a5, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v19, (a5)
+; ZVBB-RV64-NEXT:    add a5, a4, a2
 ; ZVBB-RV64-NEXT:    vl1re16.v v16, (a6)
+; ZVBB-RV64-NEXT:    add a6, a5, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v12, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v17, (a6)
-; ZVBB-RV64-NEXT:    add a6, a3, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v10, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-RV64-NEXT:    csrr a6, vlenb
+; ZVBB-RV64-NEXT:    li a7, 12
+; ZVBB-RV64-NEXT:    mul a6, a6, a7
+; ZVBB-RV64-NEXT:    add a6, sp, a6
+; ZVBB-RV64-NEXT:    addi a6, a6, 64
+; ZVBB-RV64-NEXT:    vl1re16.v v17, (a1)
+; ZVBB-RV64-NEXT:    vl1re16.v v10, (a4)
+; ZVBB-RV64-NEXT:    vl1re16.v v11, (a5)
 ; ZVBB-RV64-NEXT:    vl1re16.v v8, (a0)
 ; ZVBB-RV64-NEXT:    vl1re16.v v9, (a3)
-; ZVBB-RV64-NEXT:    vl1re16.v v14, (a4)
-; ZVBB-RV64-NEXT:    csrr a0, vlenb
-; ZVBB-RV64-NEXT:    li a3, 10
-; ZVBB-RV64-NEXT:    mul a0, a0, a3
-; ZVBB-RV64-NEXT:    add a0, sp, a0
-; ZVBB-RV64-NEXT:    addi a0, a0, 64
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v15, (a5)
-; ZVBB-RV64-NEXT:    vl1re16.v v12, (a6)
-; ZVBB-RV64-NEXT:    vl1re16.v v13, (a1)
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
-; ZVBB-RV64-NEXT:    add a2, a0, a2
-; ZVBB-RV64-NEXT:    vs2r.v v16, (a2)
-; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
+; ZVBB-RV64-NEXT:    add a2, a6, a2
+; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a6)
 ; ZVBB-RV64-NEXT:    vl8re16.v v16, (a2)
-; ZVBB-RV64-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8re16.v v8, (a6)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
 ; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    addi sp, sp, 80
 ; ZVBB-RV64-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv40f16_nxv8f16:
+; ZIP-LABEL: vector_interleave_nxv48bf16_nxv8bf16:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -80
 ; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -5362,65 +10741,72 @@ define <vscale x 40 x half> @vector_interleave_nxv40f16_nxv8f16(<vscale x 8 x ha
 ; ZIP-NEXT:    sub sp, sp, a0
 ; ZIP-NEXT:    andi sp, sp, -64
 ; ZIP-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZIP-NEXT:    vmv2r.v v20, v16
-; ZIP-NEXT:    addi a0, sp, 64
-; ZIP-NEXT:    vmv2r.v v18, v12
+; ZIP-NEXT:    vmv2r.v v20, v14
+; ZIP-NEXT:    vmv2r.v v22, v12
+; ZIP-NEXT:    vmv2r.v v24, v10
 ; ZIP-NEXT:    csrr a1, vlenb
-; ZIP-NEXT:    slli a2, a1, 2
-; ZIP-NEXT:    add a1, a2, a1
+; ZIP-NEXT:    li a0, 6
+; ZIP-NEXT:    mul a1, a1, a0
 ; ZIP-NEXT:    add a1, sp, a1
 ; ZIP-NEXT:    addi a1, a1, 64
+; ZIP-NEXT:    vmv1r.v v10, v25
+; ZIP-NEXT:    vmv1r.v v11, v23
+; ZIP-NEXT:    vmv1r.v v12, v21
+; ZIP-NEXT:    addi a0, sp, 64
+; ZIP-NEXT:    vmv1r.v v13, v17
 ; ZIP-NEXT:    csrr a2, vlenb
-; ZIP-NEXT:    vmv2r.v v16, v8
-; ZIP-NEXT:    vmv2r.v v22, v16
-; ZIP-NEXT:    vmv2r.v v24, v18
-; ZIP-NEXT:    vmv1r.v v26, v20
+; ZIP-NEXT:    vmv1r.v v14, v19
+; ZIP-NEXT:    vsseg6e16.v v9, (a1)
+; ZIP-NEXT:    vmv1r.v v9, v24
+; ZIP-NEXT:    add a5, a1, a2
+; ZIP-NEXT:    vmv1r.v v10, v22
 ; ZIP-NEXT:    add a3, a0, a2
-; ZIP-NEXT:    vmv1r.v v23, v10
-; ZIP-NEXT:    add a4, a1, a2
-; ZIP-NEXT:    add a5, a4, a2
-; ZIP-NEXT:    vmv1r.v v25, v14
+; ZIP-NEXT:    vmv1r.v v11, v20
+; ZIP-NEXT:    add a4, a3, a2
+; ZIP-NEXT:    vmv1r.v v12, v16
 ; ZIP-NEXT:    add a6, a5, a2
-; ZIP-NEXT:    vmv1r.v v18, v11
-; ZIP-NEXT:    vsseg5e16.v v22, (a0)
-; ZIP-NEXT:    vmv1r.v v20, v15
-; ZIP-NEXT:    vsseg5e16.v v17, (a1)
+; ZIP-NEXT:    vmv1r.v v13, v18
+; ZIP-NEXT:    vsseg6e16.v v8, (a0)
+; ZIP-NEXT:    vl1re16.v v14, (a1)
+; ZIP-NEXT:    add a1, a6, a2
+; ZIP-NEXT:    vl1re16.v v15, (a5)
+; ZIP-NEXT:    add a5, a1, a2
+; ZIP-NEXT:    vl1re16.v v18, (a5)
+; ZIP-NEXT:    add a5, a5, a2
+; ZIP-NEXT:    vl1re16.v v19, (a5)
+; ZIP-NEXT:    add a5, a4, a2
 ; ZIP-NEXT:    vl1re16.v v16, (a6)
+; ZIP-NEXT:    add a6, a5, a2
+; ZIP-NEXT:    vl1re16.v v12, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v17, (a6)
-; ZIP-NEXT:    add a6, a3, a2
-; ZIP-NEXT:    vl1re16.v v10, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v11, (a6)
+; ZIP-NEXT:    vl1re16.v v13, (a6)
+; ZIP-NEXT:    csrr a6, vlenb
+; ZIP-NEXT:    li a7, 12
+; ZIP-NEXT:    mul a6, a6, a7
+; ZIP-NEXT:    add a6, sp, a6
+; ZIP-NEXT:    addi a6, a6, 64
+; ZIP-NEXT:    vl1re16.v v17, (a1)
+; ZIP-NEXT:    vl1re16.v v10, (a4)
+; ZIP-NEXT:    vl1re16.v v11, (a5)
 ; ZIP-NEXT:    vl1re16.v v8, (a0)
 ; ZIP-NEXT:    vl1re16.v v9, (a3)
-; ZIP-NEXT:    vl1re16.v v14, (a4)
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    li a3, 10
-; ZIP-NEXT:    mul a0, a0, a3
-; ZIP-NEXT:    add a0, sp, a0
-; ZIP-NEXT:    addi a0, a0, 64
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v15, (a5)
-; ZIP-NEXT:    vl1re16.v v12, (a6)
-; ZIP-NEXT:    vl1re16.v v13, (a1)
 ; ZIP-NEXT:    slli a2, a2, 3
-; ZIP-NEXT:    add a2, a0, a2
-; ZIP-NEXT:    vs2r.v v16, (a2)
-; ZIP-NEXT:    vs8r.v v8, (a0)
+; ZIP-NEXT:    add a2, a6, a2
+; ZIP-NEXT:    vs4r.v v16, (a2)
+; ZIP-NEXT:    vs8r.v v8, (a6)
 ; ZIP-NEXT:    vl8re16.v v16, (a2)
-; ZIP-NEXT:    vl8re16.v v8, (a0)
+; ZIP-NEXT:    vl8re16.v v8, (a6)
 ; ZIP-NEXT:    addi sp, s0, -80
 ; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    addi sp, sp, 80
 ; ZIP-NEXT:    ret
-  %res = call <vscale x 40 x half> @llvm.vector.interleave5.nxv40f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4)
-  ret <vscale x 40 x half> %res
+  %res = call <vscale x 48 x bfloat> @llvm.vector.interleave6.nxv48bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5)
+  ret <vscale x 48 x bfloat> %res
 }
 
-define <vscale x 10 x bfloat> @vector_interleave_nxv10bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4) nounwind {
-; CHECK-LABEL: vector_interleave_nxv10bf16_nxv2bf16:
+define <vscale x 6 x float> @vector_interleave_nxv6f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5) nounwind {
+; CHECK-LABEL: vector_interleave_nxv6f32_nxv1f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -5432,23 +10818,27 @@ define <vscale x 10 x bfloat> @vector_interleave_nxv10bf16_nxv2bf16(<vscale x 2
 ; CHECK-NEXT:    srli a2, a1, 1
 ; CHECK-NEXT:    add a3, a0, a2
 ; CHECK-NEXT:    add a4, a3, a2
-; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vsseg5e16.v v8, (a0)
 ; CHECK-NEXT:    add a5, a4, a2
-; CHECK-NEXT:    vle16.v v8, (a5)
-; CHECK-NEXT:    vle16.v v9, (a4)
-; CHECK-NEXT:    srli a1, a1, 2
-; CHECK-NEXT:    add a4, a1, a1
-; CHECK-NEXT:    vle16.v v10, (a3)
-; CHECK-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v8, a1
-; CHECK-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v10, a1
-; CHECK-NEXT:    add a2, a5, a2
-; CHECK-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v10, (a2)
+; CHECK-NEXT:    vsetvli a6, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsseg6e32.v v8, (a0)
+; CHECK-NEXT:    add a6, a5, a2
+; CHECK-NEXT:    add a2, a6, a2
+; CHECK-NEXT:    vle32.v v10, (a6)
+; CHECK-NEXT:    vle32.v v8, (a2)
+; CHECK-NEXT:    srli a1, a1, 3
+; CHECK-NEXT:    add a2, a1, a1
+; CHECK-NEXT:    vle32.v v11, (a5)
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v8, a1
+; CHECK-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v9, (a4)
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v11, a1
+; CHECK-NEXT:    vsetvli a4, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v11, (a3)
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v11, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a1, a0, 1
 ; CHECK-NEXT:    add a0, a1, a0
@@ -5456,7 +10846,7 @@ define <vscale x 10 x bfloat> @vector_interleave_nxv10bf16_nxv2bf16(<vscale x 2
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv10bf16_nxv2bf16:
+; ZVBB-LABEL: vector_interleave_nxv6f32_nxv1f32:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
@@ -5468,93 +10858,101 @@ define <vscale x 10 x bfloat> @vector_interleave_nxv10bf16_nxv2bf16(<vscale x 2
 ; ZVBB-NEXT:    srli a2, a1, 1
 ; ZVBB-NEXT:    add a3, a0, a2
 ; ZVBB-NEXT:    add a4, a3, a2
-; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
-; ZVBB-NEXT:    vsseg5e16.v v8, (a0)
 ; ZVBB-NEXT:    add a5, a4, a2
-; ZVBB-NEXT:    vle16.v v8, (a5)
-; ZVBB-NEXT:    vle16.v v9, (a4)
-; ZVBB-NEXT:    srli a1, a1, 2
-; ZVBB-NEXT:    add a4, a1, a1
-; ZVBB-NEXT:    vle16.v v10, (a3)
-; ZVBB-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v9, v8, a1
-; ZVBB-NEXT:    vsetvli a3, zero, e16, mf2, ta, ma
-; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a4, e16, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v8, v10, a1
-; ZVBB-NEXT:    add a2, a5, a2
-; ZVBB-NEXT:    vsetvli a0, zero, e16, mf2, ta, ma
-; ZVBB-NEXT:    vle16.v v10, (a2)
+; ZVBB-NEXT:    vsetvli a6, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vsseg6e32.v v8, (a0)
+; ZVBB-NEXT:    add a6, a5, a2
+; ZVBB-NEXT:    add a2, a6, a2
+; ZVBB-NEXT:    vle32.v v10, (a6)
+; ZVBB-NEXT:    vle32.v v8, (a2)
+; ZVBB-NEXT:    srli a1, a1, 3
+; ZVBB-NEXT:    add a2, a1, a1
+; ZVBB-NEXT:    vle32.v v11, (a5)
+; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v10, v8, a1
+; ZVBB-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vle32.v v9, (a4)
+; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v9, v11, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vle32.v v11, (a3)
+; ZVBB-NEXT:    vle32.v v8, (a0)
+; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v11, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a1, a0, 1
 ; ZVBB-NEXT:    add a0, a1, a0
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 10 x bfloat> @llvm.vector.interleave5.nxv10bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4)
-  ret <vscale x 10 x bfloat> %res
+  %res = call <vscale x 6 x float> @llvm.vector.interleave6.nxv6f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5)
+  ret <vscale x 6 x float> %res
 }
 
-define <vscale x 20 x bfloat> @vector_interleave_nxv20bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4) nounwind {
-; CHECK-LABEL: vector_interleave_nxv20bf16_nxv4bf16:
+define <vscale x 12 x float> @vector_interleave_nxv12f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5) nounwind {
+; CHECK-LABEL: vector_interleave_nxv12f32_nxv2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 2
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    add a3, a2, a1
-; CHECK-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vsseg5e16.v v8, (a0)
-; CHECK-NEXT:    vl1re16.v v10, (a3)
+; CHECK-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsseg6e32.v v8, (a0)
+; CHECK-NEXT:    vl1re32.v v10, (a3)
 ; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    vl1re16.v v11, (a3)
-; CHECK-NEXT:    vl1re16.v v8, (a0)
-; CHECK-NEXT:    vl1re16.v v9, (a2)
+; CHECK-NEXT:    vl1re32.v v11, (a3)
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    vl1re32.v v8, (a0)
+; CHECK-NEXT:    vl1re32.v v9, (a2)
+; CHECK-NEXT:    vl1re32.v v12, (a3)
 ; CHECK-NEXT:    add a1, a3, a1
-; CHECK-NEXT:    vl1re16.v v12, (a1)
+; CHECK-NEXT:    vl1re32.v v13, (a1)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 2
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv20bf16_nxv4bf16:
+; ZVBB-LABEL: vector_interleave_nxv12f32_nxv2f32:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 2
-; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
 ; ZVBB-NEXT:    sub sp, sp, a0
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    add a2, a0, a1
 ; ZVBB-NEXT:    add a3, a2, a1
-; ZVBB-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
-; ZVBB-NEXT:    vsseg5e16.v v8, (a0)
-; ZVBB-NEXT:    vl1re16.v v10, (a3)
+; ZVBB-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
+; ZVBB-NEXT:    vsseg6e32.v v8, (a0)
+; ZVBB-NEXT:    vl1re32.v v10, (a3)
 ; ZVBB-NEXT:    add a3, a3, a1
-; ZVBB-NEXT:    vl1re16.v v11, (a3)
-; ZVBB-NEXT:    vl1re16.v v8, (a0)
-; ZVBB-NEXT:    vl1re16.v v9, (a2)
+; ZVBB-NEXT:    vl1re32.v v11, (a3)
+; ZVBB-NEXT:    add a3, a3, a1
+; ZVBB-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-NEXT:    vl1re32.v v9, (a2)
+; ZVBB-NEXT:    vl1re32.v v12, (a3)
 ; ZVBB-NEXT:    add a1, a3, a1
-; ZVBB-NEXT:    vl1re16.v v12, (a1)
+; ZVBB-NEXT:    vl1re32.v v13, (a1)
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 2
-; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 20 x bfloat> @llvm.vector.interleave5.nxv20bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4)
-  ret <vscale x 20 x bfloat> %res
+  %res = call <vscale x 12 x float> @llvm.vector.interleave6.nxv12f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5)
+  ret <vscale x 12 x float> %res
 }
 
-define <vscale x 40 x bfloat> @vector_interleave_nxv40bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4) nounwind {
-; RV32-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+define <vscale x 24 x float> @vector_interleave_nxv24f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5) nounwind {
+; RV32-LABEL: vector_interleave_nxv24f32_nxv4f32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -80
 ; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
@@ -5565,62 +10963,69 @@ define <vscale x 40 x bfloat> @vector_interleave_nxv40bf16_nxv8bf16(<vscale x 8
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV32-NEXT:    vmv2r.v v20, v16
-; RV32-NEXT:    addi a0, sp, 64
-; RV32-NEXT:    vmv2r.v v18, v12
+; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT:    vmv2r.v v20, v14
+; RV32-NEXT:    vmv2r.v v22, v12
+; RV32-NEXT:    vmv2r.v v24, v10
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 2
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    li a0, 6
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    vmv1r.v v10, v25
+; RV32-NEXT:    vmv1r.v v11, v23
+; RV32-NEXT:    vmv1r.v v12, v21
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vmv1r.v v13, v17
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    vmv2r.v v16, v8
-; RV32-NEXT:    vmv2r.v v22, v16
-; RV32-NEXT:    vmv2r.v v24, v18
-; RV32-NEXT:    vmv1r.v v26, v20
+; RV32-NEXT:    vmv1r.v v14, v19
+; RV32-NEXT:    vsseg6e32.v v9, (a1)
+; RV32-NEXT:    vmv1r.v v9, v24
+; RV32-NEXT:    add a5, a1, a2
+; RV32-NEXT:    vmv1r.v v10, v22
 ; RV32-NEXT:    add a3, a0, a2
-; RV32-NEXT:    vmv1r.v v23, v10
-; RV32-NEXT:    add a4, a1, a2
+; RV32-NEXT:    vmv1r.v v11, v20
+; RV32-NEXT:    add a4, a3, a2
+; RV32-NEXT:    vmv1r.v v12, v16
+; RV32-NEXT:    add a6, a5, a2
+; RV32-NEXT:    vmv1r.v v13, v18
+; RV32-NEXT:    vsseg6e32.v v8, (a0)
+; RV32-NEXT:    vl1re32.v v14, (a1)
+; RV32-NEXT:    add a1, a6, a2
+; RV32-NEXT:    vl1re32.v v15, (a5)
+; RV32-NEXT:    add a5, a1, a2
+; RV32-NEXT:    vl1re32.v v18, (a5)
+; RV32-NEXT:    add a5, a5, a2
+; RV32-NEXT:    vl1re32.v v19, (a5)
 ; RV32-NEXT:    add a5, a4, a2
-; RV32-NEXT:    vmv1r.v v25, v14
+; RV32-NEXT:    vl1re32.v v16, (a6)
 ; RV32-NEXT:    add a6, a5, a2
-; RV32-NEXT:    vmv1r.v v18, v11
-; RV32-NEXT:    vsseg5e16.v v22, (a0)
-; RV32-NEXT:    vmv1r.v v20, v15
-; RV32-NEXT:    vsseg5e16.v v17, (a1)
-; RV32-NEXT:    vl1re16.v v16, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v17, (a6)
-; RV32-NEXT:    add a6, a3, a2
-; RV32-NEXT:    vl1re16.v v10, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v11, (a6)
-; RV32-NEXT:    vl1re16.v v8, (a0)
-; RV32-NEXT:    vl1re16.v v9, (a3)
-; RV32-NEXT:    vl1re16.v v14, (a4)
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a3, 10
-; RV32-NEXT:    mul a0, a0, a3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    vl1re32.v v12, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v15, (a5)
-; RV32-NEXT:    vl1re16.v v12, (a6)
-; RV32-NEXT:    vl1re16.v v13, (a1)
+; RV32-NEXT:    vl1re32.v v13, (a6)
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    li a7, 12
+; RV32-NEXT:    mul a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 64
+; RV32-NEXT:    vl1re32.v v17, (a1)
+; RV32-NEXT:    vl1re32.v v10, (a4)
+; RV32-NEXT:    vl1re32.v v11, (a5)
+; RV32-NEXT:    vl1re32.v v8, (a0)
+; RV32-NEXT:    vl1re32.v v9, (a3)
 ; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, a0, a2
-; RV32-NEXT:    vs2r.v v16, (a2)
-; RV32-NEXT:    vs8r.v v8, (a0)
-; RV32-NEXT:    vl8re16.v v16, (a2)
-; RV32-NEXT:    vl8re16.v v8, (a0)
+; RV32-NEXT:    add a2, a6, a2
+; RV32-NEXT:    vs4r.v v16, (a2)
+; RV32-NEXT:    vs8r.v v8, (a6)
+; RV32-NEXT:    vl8re32.v v16, (a2)
+; RV32-NEXT:    vl8re32.v v8, (a6)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; RV64-LABEL: vector_interleave_nxv24f32_nxv4f32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -80
 ; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -5631,62 +11036,69 @@ define <vscale x 40 x bfloat> @vector_interleave_nxv40bf16_nxv8bf16(<vscale x 8
 ; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; RV64-NEXT:    vmv2r.v v20, v16
-; RV64-NEXT:    addi a0, sp, 64
-; RV64-NEXT:    vmv2r.v v18, v12
+; RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV64-NEXT:    vmv2r.v v20, v14
+; RV64-NEXT:    vmv2r.v v22, v12
+; RV64-NEXT:    vmv2r.v v24, v10
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 2
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    li a0, 6
+; RV64-NEXT:    mul a1, a1, a0
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 64
+; RV64-NEXT:    vmv1r.v v10, v25
+; RV64-NEXT:    vmv1r.v v11, v23
+; RV64-NEXT:    vmv1r.v v12, v21
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vmv1r.v v13, v17
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    vmv2r.v v16, v8
-; RV64-NEXT:    vmv2r.v v22, v16
-; RV64-NEXT:    vmv2r.v v24, v18
-; RV64-NEXT:    vmv1r.v v26, v20
+; RV64-NEXT:    vmv1r.v v14, v19
+; RV64-NEXT:    vsseg6e32.v v9, (a1)
+; RV64-NEXT:    vmv1r.v v9, v24
+; RV64-NEXT:    add a5, a1, a2
+; RV64-NEXT:    vmv1r.v v10, v22
 ; RV64-NEXT:    add a3, a0, a2
-; RV64-NEXT:    vmv1r.v v23, v10
-; RV64-NEXT:    add a4, a1, a2
+; RV64-NEXT:    vmv1r.v v11, v20
+; RV64-NEXT:    add a4, a3, a2
+; RV64-NEXT:    vmv1r.v v12, v16
+; RV64-NEXT:    add a6, a5, a2
+; RV64-NEXT:    vmv1r.v v13, v18
+; RV64-NEXT:    vsseg6e32.v v8, (a0)
+; RV64-NEXT:    vl1re32.v v14, (a1)
+; RV64-NEXT:    add a1, a6, a2
+; RV64-NEXT:    vl1re32.v v15, (a5)
+; RV64-NEXT:    add a5, a1, a2
+; RV64-NEXT:    vl1re32.v v18, (a5)
+; RV64-NEXT:    add a5, a5, a2
+; RV64-NEXT:    vl1re32.v v19, (a5)
 ; RV64-NEXT:    add a5, a4, a2
-; RV64-NEXT:    vmv1r.v v25, v14
+; RV64-NEXT:    vl1re32.v v16, (a6)
 ; RV64-NEXT:    add a6, a5, a2
-; RV64-NEXT:    vmv1r.v v18, v11
-; RV64-NEXT:    vsseg5e16.v v22, (a0)
-; RV64-NEXT:    vmv1r.v v20, v15
-; RV64-NEXT:    vsseg5e16.v v17, (a1)
-; RV64-NEXT:    vl1re16.v v16, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v17, (a6)
-; RV64-NEXT:    add a6, a3, a2
-; RV64-NEXT:    vl1re16.v v10, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v11, (a6)
-; RV64-NEXT:    vl1re16.v v8, (a0)
-; RV64-NEXT:    vl1re16.v v9, (a3)
-; RV64-NEXT:    vl1re16.v v14, (a4)
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a3, 10
-; RV64-NEXT:    mul a0, a0, a3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    vl1re32.v v12, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v15, (a5)
-; RV64-NEXT:    vl1re16.v v12, (a6)
-; RV64-NEXT:    vl1re16.v v13, (a1)
+; RV64-NEXT:    vl1re32.v v13, (a6)
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    li a7, 12
+; RV64-NEXT:    mul a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 64
+; RV64-NEXT:    vl1re32.v v17, (a1)
+; RV64-NEXT:    vl1re32.v v10, (a4)
+; RV64-NEXT:    vl1re32.v v11, (a5)
+; RV64-NEXT:    vl1re32.v v8, (a0)
+; RV64-NEXT:    vl1re32.v v9, (a3)
 ; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, a0, a2
-; RV64-NEXT:    vs2r.v v16, (a2)
-; RV64-NEXT:    vs8r.v v8, (a0)
-; RV64-NEXT:    vl8re16.v v16, (a2)
-; RV64-NEXT:    vl8re16.v v8, (a0)
+; RV64-NEXT:    add a2, a6, a2
+; RV64-NEXT:    vs4r.v v16, (a2)
+; RV64-NEXT:    vs8r.v v8, (a6)
+; RV64-NEXT:    vl8re32.v v16, (a2)
+; RV64-NEXT:    vl8re32.v v8, (a6)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 80
 ; RV64-NEXT:    ret
 ;
-; ZVBB-RV32-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; ZVBB-RV32-LABEL: vector_interleave_nxv24f32_nxv4f32:
 ; ZVBB-RV32:       # %bb.0:
 ; ZVBB-RV32-NEXT:    addi sp, sp, -80
 ; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
@@ -5697,62 +11109,69 @@ define <vscale x 40 x bfloat> @vector_interleave_nxv40bf16_nxv8bf16(<vscale x 8
 ; ZVBB-RV32-NEXT:    mul a0, a0, a1
 ; ZVBB-RV32-NEXT:    sub sp, sp, a0
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
-; ZVBB-RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-RV32-NEXT:    vmv2r.v v20, v16
-; ZVBB-RV32-NEXT:    addi a0, sp, 64
-; ZVBB-RV32-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV32-NEXT:    vmv2r.v v20, v14
+; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV32-NEXT:    vmv2r.v v24, v10
 ; ZVBB-RV32-NEXT:    csrr a1, vlenb
-; ZVBB-RV32-NEXT:    slli a2, a1, 2
-; ZVBB-RV32-NEXT:    add a1, a2, a1
+; ZVBB-RV32-NEXT:    li a0, 6
+; ZVBB-RV32-NEXT:    mul a1, a1, a0
 ; ZVBB-RV32-NEXT:    add a1, sp, a1
 ; ZVBB-RV32-NEXT:    addi a1, a1, 64
+; ZVBB-RV32-NEXT:    vmv1r.v v10, v25
+; ZVBB-RV32-NEXT:    vmv1r.v v11, v23
+; ZVBB-RV32-NEXT:    vmv1r.v v12, v21
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
+; ZVBB-RV32-NEXT:    vmv1r.v v13, v17
 ; ZVBB-RV32-NEXT:    csrr a2, vlenb
-; ZVBB-RV32-NEXT:    vmv2r.v v16, v8
-; ZVBB-RV32-NEXT:    vmv2r.v v22, v16
-; ZVBB-RV32-NEXT:    vmv2r.v v24, v18
-; ZVBB-RV32-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV32-NEXT:    vmv1r.v v14, v19
+; ZVBB-RV32-NEXT:    vsseg6e32.v v9, (a1)
+; ZVBB-RV32-NEXT:    vmv1r.v v9, v24
+; ZVBB-RV32-NEXT:    add a5, a1, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v10, v22
 ; ZVBB-RV32-NEXT:    add a3, a0, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v23, v10
-; ZVBB-RV32-NEXT:    add a4, a1, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v11, v20
+; ZVBB-RV32-NEXT:    add a4, a3, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v12, v16
+; ZVBB-RV32-NEXT:    add a6, a5, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v13, v18
+; ZVBB-RV32-NEXT:    vsseg6e32.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re32.v v14, (a1)
+; ZVBB-RV32-NEXT:    add a1, a6, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v15, (a5)
+; ZVBB-RV32-NEXT:    add a5, a1, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v18, (a5)
+; ZVBB-RV32-NEXT:    add a5, a5, a2
+; ZVBB-RV32-NEXT:    vl1re32.v v19, (a5)
 ; ZVBB-RV32-NEXT:    add a5, a4, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV32-NEXT:    vl1re32.v v16, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a5, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v18, v11
-; ZVBB-RV32-NEXT:    vsseg5e16.v v22, (a0)
-; ZVBB-RV32-NEXT:    vmv1r.v v20, v15
-; ZVBB-RV32-NEXT:    vsseg5e16.v v17, (a1)
-; ZVBB-RV32-NEXT:    vl1re16.v v16, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v17, (a6)
-; ZVBB-RV32-NEXT:    add a6, a3, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v10, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v11, (a6)
-; ZVBB-RV32-NEXT:    vl1re16.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl1re16.v v9, (a3)
-; ZVBB-RV32-NEXT:    vl1re16.v v14, (a4)
-; ZVBB-RV32-NEXT:    csrr a0, vlenb
-; ZVBB-RV32-NEXT:    li a3, 10
-; ZVBB-RV32-NEXT:    mul a0, a0, a3
-; ZVBB-RV32-NEXT:    add a0, sp, a0
-; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    vl1re32.v v12, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v15, (a5)
-; ZVBB-RV32-NEXT:    vl1re16.v v12, (a6)
-; ZVBB-RV32-NEXT:    vl1re16.v v13, (a1)
+; ZVBB-RV32-NEXT:    vl1re32.v v13, (a6)
+; ZVBB-RV32-NEXT:    csrr a6, vlenb
+; ZVBB-RV32-NEXT:    li a7, 12
+; ZVBB-RV32-NEXT:    mul a6, a6, a7
+; ZVBB-RV32-NEXT:    add a6, sp, a6
+; ZVBB-RV32-NEXT:    addi a6, a6, 64
+; ZVBB-RV32-NEXT:    vl1re32.v v17, (a1)
+; ZVBB-RV32-NEXT:    vl1re32.v v10, (a4)
+; ZVBB-RV32-NEXT:    vl1re32.v v11, (a5)
+; ZVBB-RV32-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re32.v v9, (a3)
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
-; ZVBB-RV32-NEXT:    add a2, a0, a2
-; ZVBB-RV32-NEXT:    vs2r.v v16, (a2)
-; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl8re16.v v16, (a2)
-; ZVBB-RV32-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT:    add a2, a6, a2
+; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a6)
+; ZVBB-RV32-NEXT:    vl8re32.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8re32.v v8, (a6)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
 ; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    addi sp, sp, 80
 ; ZVBB-RV32-NEXT:    ret
 ;
-; ZVBB-RV64-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; ZVBB-RV64-LABEL: vector_interleave_nxv24f32_nxv4f32:
 ; ZVBB-RV64:       # %bb.0:
 ; ZVBB-RV64-NEXT:    addi sp, sp, -80
 ; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -5763,62 +11182,69 @@ define <vscale x 40 x bfloat> @vector_interleave_nxv40bf16_nxv8bf16(<vscale x 8
 ; ZVBB-RV64-NEXT:    mul a0, a0, a1
 ; ZVBB-RV64-NEXT:    sub sp, sp, a0
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
-; ZVBB-RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZVBB-RV64-NEXT:    vmv2r.v v20, v16
-; ZVBB-RV64-NEXT:    addi a0, sp, 64
-; ZVBB-RV64-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV64-NEXT:    vmv2r.v v20, v14
+; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV64-NEXT:    vmv2r.v v24, v10
 ; ZVBB-RV64-NEXT:    csrr a1, vlenb
-; ZVBB-RV64-NEXT:    slli a2, a1, 2
-; ZVBB-RV64-NEXT:    add a1, a2, a1
+; ZVBB-RV64-NEXT:    li a0, 6
+; ZVBB-RV64-NEXT:    mul a1, a1, a0
 ; ZVBB-RV64-NEXT:    add a1, sp, a1
 ; ZVBB-RV64-NEXT:    addi a1, a1, 64
+; ZVBB-RV64-NEXT:    vmv1r.v v10, v25
+; ZVBB-RV64-NEXT:    vmv1r.v v11, v23
+; ZVBB-RV64-NEXT:    vmv1r.v v12, v21
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
+; ZVBB-RV64-NEXT:    vmv1r.v v13, v17
 ; ZVBB-RV64-NEXT:    csrr a2, vlenb
-; ZVBB-RV64-NEXT:    vmv2r.v v16, v8
-; ZVBB-RV64-NEXT:    vmv2r.v v22, v16
-; ZVBB-RV64-NEXT:    vmv2r.v v24, v18
-; ZVBB-RV64-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV64-NEXT:    vmv1r.v v14, v19
+; ZVBB-RV64-NEXT:    vsseg6e32.v v9, (a1)
+; ZVBB-RV64-NEXT:    vmv1r.v v9, v24
+; ZVBB-RV64-NEXT:    add a5, a1, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v10, v22
 ; ZVBB-RV64-NEXT:    add a3, a0, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v23, v10
-; ZVBB-RV64-NEXT:    add a4, a1, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v11, v20
+; ZVBB-RV64-NEXT:    add a4, a3, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v12, v16
+; ZVBB-RV64-NEXT:    add a6, a5, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v13, v18
+; ZVBB-RV64-NEXT:    vsseg6e32.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re32.v v14, (a1)
+; ZVBB-RV64-NEXT:    add a1, a6, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v15, (a5)
+; ZVBB-RV64-NEXT:    add a5, a1, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v18, (a5)
+; ZVBB-RV64-NEXT:    add a5, a5, a2
+; ZVBB-RV64-NEXT:    vl1re32.v v19, (a5)
 ; ZVBB-RV64-NEXT:    add a5, a4, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV64-NEXT:    vl1re32.v v16, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a5, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v18, v11
-; ZVBB-RV64-NEXT:    vsseg5e16.v v22, (a0)
-; ZVBB-RV64-NEXT:    vmv1r.v v20, v15
-; ZVBB-RV64-NEXT:    vsseg5e16.v v17, (a1)
-; ZVBB-RV64-NEXT:    vl1re16.v v16, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v17, (a6)
-; ZVBB-RV64-NEXT:    add a6, a3, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v10, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v11, (a6)
-; ZVBB-RV64-NEXT:    vl1re16.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl1re16.v v9, (a3)
-; ZVBB-RV64-NEXT:    vl1re16.v v14, (a4)
-; ZVBB-RV64-NEXT:    csrr a0, vlenb
-; ZVBB-RV64-NEXT:    li a3, 10
-; ZVBB-RV64-NEXT:    mul a0, a0, a3
-; ZVBB-RV64-NEXT:    add a0, sp, a0
-; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    vl1re32.v v12, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v15, (a5)
-; ZVBB-RV64-NEXT:    vl1re16.v v12, (a6)
-; ZVBB-RV64-NEXT:    vl1re16.v v13, (a1)
+; ZVBB-RV64-NEXT:    vl1re32.v v13, (a6)
+; ZVBB-RV64-NEXT:    csrr a6, vlenb
+; ZVBB-RV64-NEXT:    li a7, 12
+; ZVBB-RV64-NEXT:    mul a6, a6, a7
+; ZVBB-RV64-NEXT:    add a6, sp, a6
+; ZVBB-RV64-NEXT:    addi a6, a6, 64
+; ZVBB-RV64-NEXT:    vl1re32.v v17, (a1)
+; ZVBB-RV64-NEXT:    vl1re32.v v10, (a4)
+; ZVBB-RV64-NEXT:    vl1re32.v v11, (a5)
+; ZVBB-RV64-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re32.v v9, (a3)
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
-; ZVBB-RV64-NEXT:    add a2, a0, a2
-; ZVBB-RV64-NEXT:    vs2r.v v16, (a2)
-; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl8re16.v v16, (a2)
-; ZVBB-RV64-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT:    add a2, a6, a2
+; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a6)
+; ZVBB-RV64-NEXT:    vl8re32.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8re32.v v8, (a6)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
 ; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    addi sp, sp, 80
 ; ZVBB-RV64-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv40bf16_nxv8bf16:
+; ZIP-LABEL: vector_interleave_nxv24f32_nxv4f32:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -80
 ; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -5829,200 +11255,135 @@ define <vscale x 40 x bfloat> @vector_interleave_nxv40bf16_nxv8bf16(<vscale x 8
 ; ZIP-NEXT:    mul a0, a0, a1
 ; ZIP-NEXT:    sub sp, sp, a0
 ; ZIP-NEXT:    andi sp, sp, -64
-; ZIP-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
-; ZIP-NEXT:    vmv2r.v v20, v16
-; ZIP-NEXT:    addi a0, sp, 64
-; ZIP-NEXT:    vmv2r.v v18, v12
+; ZIP-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZIP-NEXT:    vmv2r.v v20, v14
+; ZIP-NEXT:    vmv2r.v v22, v12
+; ZIP-NEXT:    vmv2r.v v24, v10
 ; ZIP-NEXT:    csrr a1, vlenb
-; ZIP-NEXT:    slli a2, a1, 2
-; ZIP-NEXT:    add a1, a2, a1
+; ZIP-NEXT:    li a0, 6
+; ZIP-NEXT:    mul a1, a1, a0
 ; ZIP-NEXT:    add a1, sp, a1
 ; ZIP-NEXT:    addi a1, a1, 64
+; ZIP-NEXT:    vmv1r.v v10, v25
+; ZIP-NEXT:    vmv1r.v v11, v23
+; ZIP-NEXT:    vmv1r.v v12, v21
+; ZIP-NEXT:    addi a0, sp, 64
+; ZIP-NEXT:    vmv1r.v v13, v17
 ; ZIP-NEXT:    csrr a2, vlenb
-; ZIP-NEXT:    vmv2r.v v16, v8
-; ZIP-NEXT:    vmv2r.v v22, v16
-; ZIP-NEXT:    vmv2r.v v24, v18
-; ZIP-NEXT:    vmv1r.v v26, v20
+; ZIP-NEXT:    vmv1r.v v14, v19
+; ZIP-NEXT:    vsseg6e32.v v9, (a1)
+; ZIP-NEXT:    vmv1r.v v9, v24
+; ZIP-NEXT:    add a5, a1, a2
+; ZIP-NEXT:    vmv1r.v v10, v22
 ; ZIP-NEXT:    add a3, a0, a2
-; ZIP-NEXT:    vmv1r.v v23, v10
-; ZIP-NEXT:    add a4, a1, a2
+; ZIP-NEXT:    vmv1r.v v11, v20
+; ZIP-NEXT:    add a4, a3, a2
+; ZIP-NEXT:    vmv1r.v v12, v16
+; ZIP-NEXT:    add a6, a5, a2
+; ZIP-NEXT:    vmv1r.v v13, v18
+; ZIP-NEXT:    vsseg6e32.v v8, (a0)
+; ZIP-NEXT:    vl1re32.v v14, (a1)
+; ZIP-NEXT:    add a1, a6, a2
+; ZIP-NEXT:    vl1re32.v v15, (a5)
+; ZIP-NEXT:    add a5, a1, a2
+; ZIP-NEXT:    vl1re32.v v18, (a5)
+; ZIP-NEXT:    add a5, a5, a2
+; ZIP-NEXT:    vl1re32.v v19, (a5)
 ; ZIP-NEXT:    add a5, a4, a2
-; ZIP-NEXT:    vmv1r.v v25, v14
+; ZIP-NEXT:    vl1re32.v v16, (a6)
 ; ZIP-NEXT:    add a6, a5, a2
-; ZIP-NEXT:    vmv1r.v v18, v11
-; ZIP-NEXT:    vsseg5e16.v v22, (a0)
-; ZIP-NEXT:    vmv1r.v v20, v15
-; ZIP-NEXT:    vsseg5e16.v v17, (a1)
-; ZIP-NEXT:    vl1re16.v v16, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v17, (a6)
-; ZIP-NEXT:    add a6, a3, a2
-; ZIP-NEXT:    vl1re16.v v10, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v11, (a6)
-; ZIP-NEXT:    vl1re16.v v8, (a0)
-; ZIP-NEXT:    vl1re16.v v9, (a3)
-; ZIP-NEXT:    vl1re16.v v14, (a4)
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    li a3, 10
-; ZIP-NEXT:    mul a0, a0, a3
-; ZIP-NEXT:    add a0, sp, a0
-; ZIP-NEXT:    addi a0, a0, 64
+; ZIP-NEXT:    vl1re32.v v12, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v15, (a5)
-; ZIP-NEXT:    vl1re16.v v12, (a6)
-; ZIP-NEXT:    vl1re16.v v13, (a1)
+; ZIP-NEXT:    vl1re32.v v13, (a6)
+; ZIP-NEXT:    csrr a6, vlenb
+; ZIP-NEXT:    li a7, 12
+; ZIP-NEXT:    mul a6, a6, a7
+; ZIP-NEXT:    add a6, sp, a6
+; ZIP-NEXT:    addi a6, a6, 64
+; ZIP-NEXT:    vl1re32.v v17, (a1)
+; ZIP-NEXT:    vl1re32.v v10, (a4)
+; ZIP-NEXT:    vl1re32.v v11, (a5)
+; ZIP-NEXT:    vl1re32.v v8, (a0)
+; ZIP-NEXT:    vl1re32.v v9, (a3)
 ; ZIP-NEXT:    slli a2, a2, 3
-; ZIP-NEXT:    add a2, a0, a2
-; ZIP-NEXT:    vs2r.v v16, (a2)
-; ZIP-NEXT:    vs8r.v v8, (a0)
-; ZIP-NEXT:    vl8re16.v v16, (a2)
-; ZIP-NEXT:    vl8re16.v v8, (a0)
+; ZIP-NEXT:    add a2, a6, a2
+; ZIP-NEXT:    vs4r.v v16, (a2)
+; ZIP-NEXT:    vs8r.v v8, (a6)
+; ZIP-NEXT:    vl8re32.v v16, (a2)
+; ZIP-NEXT:    vl8re32.v v8, (a6)
 ; ZIP-NEXT:    addi sp, s0, -80
 ; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    addi sp, sp, 80
 ; ZIP-NEXT:    ret
-  %res = call <vscale x 40 x bfloat> @llvm.vector.interleave5.nxv40bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4)
-  ret <vscale x 40 x bfloat> %res
-}
-
-define <vscale x 5 x float> @vector_interleave_nxv5f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4) nounwind {
-; CHECK-LABEL: vector_interleave_nxv5f32_nxv1f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 1
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 1
-; CHECK-NEXT:    add a3, a0, a2
-; CHECK-NEXT:    add a4, a3, a2
-; CHECK-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsseg5e32.v v8, (a0)
-; CHECK-NEXT:    add a5, a4, a2
-; CHECK-NEXT:    vle32.v v8, (a5)
-; CHECK-NEXT:    vle32.v v9, (a4)
-; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    add a4, a1, a1
-; CHECK-NEXT:    vle32.v v10, (a3)
-; CHECK-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v8, a1
-; CHECK-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v10, a1
-; CHECK-NEXT:    add a2, a5, a2
-; CHECK-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v10, (a2)
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 1
-; CHECK-NEXT:    add a0, a1, a0
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-;
-; ZVBB-LABEL: vector_interleave_nxv5f32_nxv1f32:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 1
-; ZVBB-NEXT:    add a0, a1, a0
-; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    addi a0, sp, 16
-; ZVBB-NEXT:    csrr a1, vlenb
-; ZVBB-NEXT:    srli a2, a1, 1
-; ZVBB-NEXT:    add a3, a0, a2
-; ZVBB-NEXT:    add a4, a3, a2
-; ZVBB-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
-; ZVBB-NEXT:    vsseg5e32.v v8, (a0)
-; ZVBB-NEXT:    add a5, a4, a2
-; ZVBB-NEXT:    vle32.v v8, (a5)
-; ZVBB-NEXT:    vle32.v v9, (a4)
-; ZVBB-NEXT:    srli a1, a1, 3
-; ZVBB-NEXT:    add a4, a1, a1
-; ZVBB-NEXT:    vle32.v v10, (a3)
-; ZVBB-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v9, v8, a1
-; ZVBB-NEXT:    vsetvli a3, zero, e32, mf2, ta, ma
-; ZVBB-NEXT:    vle32.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a4, e32, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v8, v10, a1
-; ZVBB-NEXT:    add a2, a5, a2
-; ZVBB-NEXT:    vsetvli a0, zero, e32, mf2, ta, ma
-; ZVBB-NEXT:    vle32.v v10, (a2)
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 1
-; ZVBB-NEXT:    add a0, a1, a0
-; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    ret
-  %res = call <vscale x 5 x float> @llvm.vector.interleave5.nxv5f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4)
-  ret <vscale x 5 x float> %res
+  %res = call <vscale x 24 x float> @llvm.vector.interleave6.nxv24f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5)
+  ret <vscale x 24 x float> %res
 }
 
-define <vscale x 10 x float> @vector_interleave_nxv10f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4) nounwind {
-; CHECK-LABEL: vector_interleave_nxv10f32_nxv2f32:
+define <vscale x 6 x double> @vector_interleave_nxv6f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5) nounwind {
+; CHECK-LABEL: vector_interleave_nxv6f64_nxv1f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 2
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    add a3, a2, a1
-; CHECK-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vsseg5e32.v v8, (a0)
-; CHECK-NEXT:    vl1re32.v v10, (a3)
+; CHECK-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsseg6e64.v v8, (a0)
+; CHECK-NEXT:    vl1re64.v v10, (a3)
 ; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    vl1re32.v v11, (a3)
-; CHECK-NEXT:    vl1re32.v v8, (a0)
-; CHECK-NEXT:    vl1re32.v v9, (a2)
+; CHECK-NEXT:    vl1re64.v v11, (a3)
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    vl1re64.v v8, (a0)
+; CHECK-NEXT:    vl1re64.v v9, (a2)
+; CHECK-NEXT:    vl1re64.v v12, (a3)
 ; CHECK-NEXT:    add a1, a3, a1
-; CHECK-NEXT:    vl1re32.v v12, (a1)
+; CHECK-NEXT:    vl1re64.v v13, (a1)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 2
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    li a1, 6
+; CHECK-NEXT:    mul a0, a0, a1
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv10f32_nxv2f32:
+; ZVBB-LABEL: vector_interleave_nxv6f64_nxv1f64:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 2
-; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
 ; ZVBB-NEXT:    sub sp, sp, a0
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    add a2, a0, a1
 ; ZVBB-NEXT:    add a3, a2, a1
-; ZVBB-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
-; ZVBB-NEXT:    vsseg5e32.v v8, (a0)
-; ZVBB-NEXT:    vl1re32.v v10, (a3)
+; ZVBB-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; ZVBB-NEXT:    vsseg6e64.v v8, (a0)
+; ZVBB-NEXT:    vl1re64.v v10, (a3)
 ; ZVBB-NEXT:    add a3, a3, a1
-; ZVBB-NEXT:    vl1re32.v v11, (a3)
-; ZVBB-NEXT:    vl1re32.v v8, (a0)
-; ZVBB-NEXT:    vl1re32.v v9, (a2)
+; ZVBB-NEXT:    vl1re64.v v11, (a3)
+; ZVBB-NEXT:    add a3, a3, a1
+; ZVBB-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-NEXT:    vl1re64.v v9, (a2)
+; ZVBB-NEXT:    vl1re64.v v12, (a3)
 ; ZVBB-NEXT:    add a1, a3, a1
-; ZVBB-NEXT:    vl1re32.v v12, (a1)
+; ZVBB-NEXT:    vl1re64.v v13, (a1)
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 2
-; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    li a1, 6
+; ZVBB-NEXT:    mul a0, a0, a1
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 10 x float> @llvm.vector.interleave5.nxv10f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4)
-  ret <vscale x 10 x float> %res
+  %res = call <vscale x 6 x double> @llvm.vector.interleave6.nxv6f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5)
+  ret <vscale x 6 x double> %res
 }
 
-define <vscale x 20 x float> @vector_interleave_nxv20f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4) nounwind {
-; RV32-LABEL: vector_interleave_nxv20f32_nxv4f32:
+define <vscale x 12 x double> @vector_interleave_nxv12f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5) nounwind {
+; RV32-LABEL: vector_interleave_nxv12f64_nxv2f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -80
 ; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
@@ -6033,62 +11394,69 @@ define <vscale x 20 x float> @vector_interleave_nxv20f32_nxv4f32(<vscale x 4 x f
 ; RV32-NEXT:    mul a0, a0, a1
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; RV32-NEXT:    vmv2r.v v20, v16
-; RV32-NEXT:    addi a0, sp, 64
-; RV32-NEXT:    vmv2r.v v18, v12
+; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV32-NEXT:    vmv2r.v v20, v14
+; RV32-NEXT:    vmv2r.v v22, v12
+; RV32-NEXT:    vmv2r.v v24, v10
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 2
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    li a0, 6
+; RV32-NEXT:    mul a1, a1, a0
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    vmv1r.v v10, v25
+; RV32-NEXT:    vmv1r.v v11, v23
+; RV32-NEXT:    vmv1r.v v12, v21
+; RV32-NEXT:    addi a0, sp, 64
+; RV32-NEXT:    vmv1r.v v13, v17
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    vmv2r.v v16, v8
-; RV32-NEXT:    vmv2r.v v22, v16
-; RV32-NEXT:    vmv2r.v v24, v18
-; RV32-NEXT:    vmv1r.v v26, v20
+; RV32-NEXT:    vmv1r.v v14, v19
+; RV32-NEXT:    vsseg6e64.v v9, (a1)
+; RV32-NEXT:    vmv1r.v v9, v24
+; RV32-NEXT:    add a5, a1, a2
+; RV32-NEXT:    vmv1r.v v10, v22
 ; RV32-NEXT:    add a3, a0, a2
-; RV32-NEXT:    vmv1r.v v23, v10
-; RV32-NEXT:    add a4, a1, a2
+; RV32-NEXT:    vmv1r.v v11, v20
+; RV32-NEXT:    add a4, a3, a2
+; RV32-NEXT:    vmv1r.v v12, v16
+; RV32-NEXT:    add a6, a5, a2
+; RV32-NEXT:    vmv1r.v v13, v18
+; RV32-NEXT:    vsseg6e64.v v8, (a0)
+; RV32-NEXT:    vl1re64.v v14, (a1)
+; RV32-NEXT:    add a1, a6, a2
+; RV32-NEXT:    vl1re64.v v15, (a5)
+; RV32-NEXT:    add a5, a1, a2
+; RV32-NEXT:    vl1re64.v v18, (a5)
+; RV32-NEXT:    add a5, a5, a2
+; RV32-NEXT:    vl1re64.v v19, (a5)
 ; RV32-NEXT:    add a5, a4, a2
-; RV32-NEXT:    vmv1r.v v25, v14
+; RV32-NEXT:    vl1re64.v v16, (a6)
 ; RV32-NEXT:    add a6, a5, a2
-; RV32-NEXT:    vmv1r.v v18, v11
-; RV32-NEXT:    vsseg5e32.v v22, (a0)
-; RV32-NEXT:    vmv1r.v v20, v15
-; RV32-NEXT:    vsseg5e32.v v17, (a1)
-; RV32-NEXT:    vl1re32.v v16, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v17, (a6)
-; RV32-NEXT:    add a6, a3, a2
-; RV32-NEXT:    vl1re32.v v10, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v11, (a6)
-; RV32-NEXT:    vl1re32.v v8, (a0)
-; RV32-NEXT:    vl1re32.v v9, (a3)
-; RV32-NEXT:    vl1re32.v v14, (a4)
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a3, 10
-; RV32-NEXT:    mul a0, a0, a3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 64
+; RV32-NEXT:    vl1re64.v v12, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v15, (a5)
-; RV32-NEXT:    vl1re32.v v12, (a6)
-; RV32-NEXT:    vl1re32.v v13, (a1)
+; RV32-NEXT:    vl1re64.v v13, (a6)
+; RV32-NEXT:    csrr a6, vlenb
+; RV32-NEXT:    li a7, 12
+; RV32-NEXT:    mul a6, a6, a7
+; RV32-NEXT:    add a6, sp, a6
+; RV32-NEXT:    addi a6, a6, 64
+; RV32-NEXT:    vl1re64.v v17, (a1)
+; RV32-NEXT:    vl1re64.v v10, (a4)
+; RV32-NEXT:    vl1re64.v v11, (a5)
+; RV32-NEXT:    vl1re64.v v8, (a0)
+; RV32-NEXT:    vl1re64.v v9, (a3)
 ; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, a0, a2
-; RV32-NEXT:    vs2r.v v16, (a2)
-; RV32-NEXT:    vs8r.v v8, (a0)
-; RV32-NEXT:    vl8re32.v v16, (a2)
-; RV32-NEXT:    vl8re32.v v8, (a0)
+; RV32-NEXT:    add a2, a6, a2
+; RV32-NEXT:    vs4r.v v16, (a2)
+; RV32-NEXT:    vs8r.v v8, (a6)
+; RV32-NEXT:    vl8re64.v v16, (a2)
+; RV32-NEXT:    vl8re64.v v8, (a6)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; RV64-LABEL: vector_interleave_nxv12f64_nxv2f64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -80
 ; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -6099,62 +11467,69 @@ define <vscale x 20 x float> @vector_interleave_nxv20f32_nxv4f32(<vscale x 4 x f
 ; RV64-NEXT:    mul a0, a0, a1
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; RV64-NEXT:    vmv2r.v v20, v16
-; RV64-NEXT:    addi a0, sp, 64
-; RV64-NEXT:    vmv2r.v v18, v12
+; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; RV64-NEXT:    vmv2r.v v20, v14
+; RV64-NEXT:    vmv2r.v v22, v12
+; RV64-NEXT:    vmv2r.v v24, v10
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 2
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    li a0, 6
+; RV64-NEXT:    mul a1, a1, a0
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 64
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    vmv2r.v v16, v8
-; RV64-NEXT:    vmv2r.v v22, v16
-; RV64-NEXT:    vmv2r.v v24, v18
-; RV64-NEXT:    vmv1r.v v26, v20
-; RV64-NEXT:    add a3, a0, a2
-; RV64-NEXT:    vmv1r.v v23, v10
-; RV64-NEXT:    add a4, a1, a2
-; RV64-NEXT:    add a5, a4, a2
-; RV64-NEXT:    vmv1r.v v25, v14
-; RV64-NEXT:    add a6, a5, a2
-; RV64-NEXT:    vmv1r.v v18, v11
-; RV64-NEXT:    vsseg5e32.v v22, (a0)
-; RV64-NEXT:    vmv1r.v v20, v15
-; RV64-NEXT:    vsseg5e32.v v17, (a1)
-; RV64-NEXT:    vl1re32.v v16, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v17, (a6)
-; RV64-NEXT:    add a6, a3, a2
-; RV64-NEXT:    vl1re32.v v10, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v11, (a6)
-; RV64-NEXT:    vl1re32.v v8, (a0)
-; RV64-NEXT:    vl1re32.v v9, (a3)
-; RV64-NEXT:    vl1re32.v v14, (a4)
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a3, 10
-; RV64-NEXT:    mul a0, a0, a3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 64
+; RV64-NEXT:    vmv1r.v v10, v25
+; RV64-NEXT:    vmv1r.v v11, v23
+; RV64-NEXT:    vmv1r.v v12, v21
+; RV64-NEXT:    addi a0, sp, 64
+; RV64-NEXT:    vmv1r.v v13, v17
+; RV64-NEXT:    csrr a2, vlenb
+; RV64-NEXT:    vmv1r.v v14, v19
+; RV64-NEXT:    vsseg6e64.v v9, (a1)
+; RV64-NEXT:    vmv1r.v v9, v24
+; RV64-NEXT:    add a5, a1, a2
+; RV64-NEXT:    vmv1r.v v10, v22
+; RV64-NEXT:    add a3, a0, a2
+; RV64-NEXT:    vmv1r.v v11, v20
+; RV64-NEXT:    add a4, a3, a2
+; RV64-NEXT:    vmv1r.v v12, v16
+; RV64-NEXT:    add a6, a5, a2
+; RV64-NEXT:    vmv1r.v v13, v18
+; RV64-NEXT:    vsseg6e64.v v8, (a0)
+; RV64-NEXT:    vl1re64.v v14, (a1)
+; RV64-NEXT:    add a1, a6, a2
+; RV64-NEXT:    vl1re64.v v15, (a5)
+; RV64-NEXT:    add a5, a1, a2
+; RV64-NEXT:    vl1re64.v v18, (a5)
+; RV64-NEXT:    add a5, a5, a2
+; RV64-NEXT:    vl1re64.v v19, (a5)
+; RV64-NEXT:    add a5, a4, a2
+; RV64-NEXT:    vl1re64.v v16, (a6)
+; RV64-NEXT:    add a6, a5, a2
+; RV64-NEXT:    vl1re64.v v12, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v15, (a5)
-; RV64-NEXT:    vl1re32.v v12, (a6)
-; RV64-NEXT:    vl1re32.v v13, (a1)
+; RV64-NEXT:    vl1re64.v v13, (a6)
+; RV64-NEXT:    csrr a6, vlenb
+; RV64-NEXT:    li a7, 12
+; RV64-NEXT:    mul a6, a6, a7
+; RV64-NEXT:    add a6, sp, a6
+; RV64-NEXT:    addi a6, a6, 64
+; RV64-NEXT:    vl1re64.v v17, (a1)
+; RV64-NEXT:    vl1re64.v v10, (a4)
+; RV64-NEXT:    vl1re64.v v11, (a5)
+; RV64-NEXT:    vl1re64.v v8, (a0)
+; RV64-NEXT:    vl1re64.v v9, (a3)
 ; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, a0, a2
-; RV64-NEXT:    vs2r.v v16, (a2)
-; RV64-NEXT:    vs8r.v v8, (a0)
-; RV64-NEXT:    vl8re32.v v16, (a2)
-; RV64-NEXT:    vl8re32.v v8, (a0)
+; RV64-NEXT:    add a2, a6, a2
+; RV64-NEXT:    vs4r.v v16, (a2)
+; RV64-NEXT:    vs8r.v v8, (a6)
+; RV64-NEXT:    vl8re64.v v16, (a2)
+; RV64-NEXT:    vl8re64.v v8, (a6)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 80
 ; RV64-NEXT:    ret
 ;
-; ZVBB-RV32-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; ZVBB-RV32-LABEL: vector_interleave_nxv12f64_nxv2f64:
 ; ZVBB-RV32:       # %bb.0:
 ; ZVBB-RV32-NEXT:    addi sp, sp, -80
 ; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
@@ -6165,62 +11540,69 @@ define <vscale x 20 x float> @vector_interleave_nxv20f32_nxv4f32(<vscale x 4 x f
 ; ZVBB-RV32-NEXT:    mul a0, a0, a1
 ; ZVBB-RV32-NEXT:    sub sp, sp, a0
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
-; ZVBB-RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; ZVBB-RV32-NEXT:    vmv2r.v v20, v16
-; ZVBB-RV32-NEXT:    addi a0, sp, 64
-; ZVBB-RV32-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV32-NEXT:    vmv2r.v v20, v14
+; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV32-NEXT:    vmv2r.v v24, v10
 ; ZVBB-RV32-NEXT:    csrr a1, vlenb
-; ZVBB-RV32-NEXT:    slli a2, a1, 2
-; ZVBB-RV32-NEXT:    add a1, a2, a1
+; ZVBB-RV32-NEXT:    li a0, 6
+; ZVBB-RV32-NEXT:    mul a1, a1, a0
 ; ZVBB-RV32-NEXT:    add a1, sp, a1
 ; ZVBB-RV32-NEXT:    addi a1, a1, 64
+; ZVBB-RV32-NEXT:    vmv1r.v v10, v25
+; ZVBB-RV32-NEXT:    vmv1r.v v11, v23
+; ZVBB-RV32-NEXT:    vmv1r.v v12, v21
+; ZVBB-RV32-NEXT:    addi a0, sp, 64
+; ZVBB-RV32-NEXT:    vmv1r.v v13, v17
 ; ZVBB-RV32-NEXT:    csrr a2, vlenb
-; ZVBB-RV32-NEXT:    vmv2r.v v16, v8
-; ZVBB-RV32-NEXT:    vmv2r.v v22, v16
-; ZVBB-RV32-NEXT:    vmv2r.v v24, v18
-; ZVBB-RV32-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV32-NEXT:    vmv1r.v v14, v19
+; ZVBB-RV32-NEXT:    vsseg6e64.v v9, (a1)
+; ZVBB-RV32-NEXT:    vmv1r.v v9, v24
+; ZVBB-RV32-NEXT:    add a5, a1, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v10, v22
 ; ZVBB-RV32-NEXT:    add a3, a0, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v23, v10
-; ZVBB-RV32-NEXT:    add a4, a1, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v11, v20
+; ZVBB-RV32-NEXT:    add a4, a3, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v12, v16
+; ZVBB-RV32-NEXT:    add a6, a5, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v13, v18
+; ZVBB-RV32-NEXT:    vsseg6e64.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re64.v v14, (a1)
+; ZVBB-RV32-NEXT:    add a1, a6, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v15, (a5)
+; ZVBB-RV32-NEXT:    add a5, a1, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v18, (a5)
+; ZVBB-RV32-NEXT:    add a5, a5, a2
+; ZVBB-RV32-NEXT:    vl1re64.v v19, (a5)
 ; ZVBB-RV32-NEXT:    add a5, a4, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV32-NEXT:    vl1re64.v v16, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a5, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v18, v11
-; ZVBB-RV32-NEXT:    vsseg5e32.v v22, (a0)
-; ZVBB-RV32-NEXT:    vmv1r.v v20, v15
-; ZVBB-RV32-NEXT:    vsseg5e32.v v17, (a1)
-; ZVBB-RV32-NEXT:    vl1re32.v v16, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v17, (a6)
-; ZVBB-RV32-NEXT:    add a6, a3, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v10, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v11, (a6)
-; ZVBB-RV32-NEXT:    vl1re32.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl1re32.v v9, (a3)
-; ZVBB-RV32-NEXT:    vl1re32.v v14, (a4)
-; ZVBB-RV32-NEXT:    csrr a0, vlenb
-; ZVBB-RV32-NEXT:    li a3, 10
-; ZVBB-RV32-NEXT:    mul a0, a0, a3
-; ZVBB-RV32-NEXT:    add a0, sp, a0
-; ZVBB-RV32-NEXT:    addi a0, a0, 64
+; ZVBB-RV32-NEXT:    vl1re64.v v12, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v15, (a5)
-; ZVBB-RV32-NEXT:    vl1re32.v v12, (a6)
-; ZVBB-RV32-NEXT:    vl1re32.v v13, (a1)
+; ZVBB-RV32-NEXT:    vl1re64.v v13, (a6)
+; ZVBB-RV32-NEXT:    csrr a6, vlenb
+; ZVBB-RV32-NEXT:    li a7, 12
+; ZVBB-RV32-NEXT:    mul a6, a6, a7
+; ZVBB-RV32-NEXT:    add a6, sp, a6
+; ZVBB-RV32-NEXT:    addi a6, a6, 64
+; ZVBB-RV32-NEXT:    vl1re64.v v17, (a1)
+; ZVBB-RV32-NEXT:    vl1re64.v v10, (a4)
+; ZVBB-RV32-NEXT:    vl1re64.v v11, (a5)
+; ZVBB-RV32-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re64.v v9, (a3)
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
-; ZVBB-RV32-NEXT:    add a2, a0, a2
-; ZVBB-RV32-NEXT:    vs2r.v v16, (a2)
-; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl8re32.v v16, (a2)
-; ZVBB-RV32-NEXT:    vl8re32.v v8, (a0)
+; ZVBB-RV32-NEXT:    add a2, a6, a2
+; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vs8r.v v8, (a6)
+; ZVBB-RV32-NEXT:    vl8re64.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8re64.v v8, (a6)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
 ; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    addi sp, sp, 80
 ; ZVBB-RV32-NEXT:    ret
 ;
-; ZVBB-RV64-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; ZVBB-RV64-LABEL: vector_interleave_nxv12f64_nxv2f64:
 ; ZVBB-RV64:       # %bb.0:
 ; ZVBB-RV64-NEXT:    addi sp, sp, -80
 ; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -6231,62 +11613,69 @@ define <vscale x 20 x float> @vector_interleave_nxv20f32_nxv4f32(<vscale x 4 x f
 ; ZVBB-RV64-NEXT:    mul a0, a0, a1
 ; ZVBB-RV64-NEXT:    sub sp, sp, a0
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
-; ZVBB-RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; ZVBB-RV64-NEXT:    vmv2r.v v20, v16
-; ZVBB-RV64-NEXT:    addi a0, sp, 64
-; ZVBB-RV64-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-RV64-NEXT:    vmv2r.v v20, v14
+; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
+; ZVBB-RV64-NEXT:    vmv2r.v v24, v10
 ; ZVBB-RV64-NEXT:    csrr a1, vlenb
-; ZVBB-RV64-NEXT:    slli a2, a1, 2
-; ZVBB-RV64-NEXT:    add a1, a2, a1
+; ZVBB-RV64-NEXT:    li a0, 6
+; ZVBB-RV64-NEXT:    mul a1, a1, a0
 ; ZVBB-RV64-NEXT:    add a1, sp, a1
 ; ZVBB-RV64-NEXT:    addi a1, a1, 64
+; ZVBB-RV64-NEXT:    vmv1r.v v10, v25
+; ZVBB-RV64-NEXT:    vmv1r.v v11, v23
+; ZVBB-RV64-NEXT:    vmv1r.v v12, v21
+; ZVBB-RV64-NEXT:    addi a0, sp, 64
+; ZVBB-RV64-NEXT:    vmv1r.v v13, v17
 ; ZVBB-RV64-NEXT:    csrr a2, vlenb
-; ZVBB-RV64-NEXT:    vmv2r.v v16, v8
-; ZVBB-RV64-NEXT:    vmv2r.v v22, v16
-; ZVBB-RV64-NEXT:    vmv2r.v v24, v18
-; ZVBB-RV64-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV64-NEXT:    vmv1r.v v14, v19
+; ZVBB-RV64-NEXT:    vsseg6e64.v v9, (a1)
+; ZVBB-RV64-NEXT:    vmv1r.v v9, v24
+; ZVBB-RV64-NEXT:    add a5, a1, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v10, v22
 ; ZVBB-RV64-NEXT:    add a3, a0, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v23, v10
-; ZVBB-RV64-NEXT:    add a4, a1, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v11, v20
+; ZVBB-RV64-NEXT:    add a4, a3, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v12, v16
+; ZVBB-RV64-NEXT:    add a6, a5, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v13, v18
+; ZVBB-RV64-NEXT:    vsseg6e64.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re64.v v14, (a1)
+; ZVBB-RV64-NEXT:    add a1, a6, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v15, (a5)
+; ZVBB-RV64-NEXT:    add a5, a1, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v18, (a5)
+; ZVBB-RV64-NEXT:    add a5, a5, a2
+; ZVBB-RV64-NEXT:    vl1re64.v v19, (a5)
 ; ZVBB-RV64-NEXT:    add a5, a4, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v25, v14
+; ZVBB-RV64-NEXT:    vl1re64.v v16, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a5, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v18, v11
-; ZVBB-RV64-NEXT:    vsseg5e32.v v22, (a0)
-; ZVBB-RV64-NEXT:    vmv1r.v v20, v15
-; ZVBB-RV64-NEXT:    vsseg5e32.v v17, (a1)
-; ZVBB-RV64-NEXT:    vl1re32.v v16, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v17, (a6)
-; ZVBB-RV64-NEXT:    add a6, a3, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v10, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v11, (a6)
-; ZVBB-RV64-NEXT:    vl1re32.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl1re32.v v9, (a3)
-; ZVBB-RV64-NEXT:    vl1re32.v v14, (a4)
-; ZVBB-RV64-NEXT:    csrr a0, vlenb
-; ZVBB-RV64-NEXT:    li a3, 10
-; ZVBB-RV64-NEXT:    mul a0, a0, a3
-; ZVBB-RV64-NEXT:    add a0, sp, a0
-; ZVBB-RV64-NEXT:    addi a0, a0, 64
+; ZVBB-RV64-NEXT:    vl1re64.v v12, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v15, (a5)
-; ZVBB-RV64-NEXT:    vl1re32.v v12, (a6)
-; ZVBB-RV64-NEXT:    vl1re32.v v13, (a1)
+; ZVBB-RV64-NEXT:    vl1re64.v v13, (a6)
+; ZVBB-RV64-NEXT:    csrr a6, vlenb
+; ZVBB-RV64-NEXT:    li a7, 12
+; ZVBB-RV64-NEXT:    mul a6, a6, a7
+; ZVBB-RV64-NEXT:    add a6, sp, a6
+; ZVBB-RV64-NEXT:    addi a6, a6, 64
+; ZVBB-RV64-NEXT:    vl1re64.v v17, (a1)
+; ZVBB-RV64-NEXT:    vl1re64.v v10, (a4)
+; ZVBB-RV64-NEXT:    vl1re64.v v11, (a5)
+; ZVBB-RV64-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re64.v v9, (a3)
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
-; ZVBB-RV64-NEXT:    add a2, a0, a2
-; ZVBB-RV64-NEXT:    vs2r.v v16, (a2)
-; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl8re32.v v16, (a2)
-; ZVBB-RV64-NEXT:    vl8re32.v v8, (a0)
+; ZVBB-RV64-NEXT:    add a2, a6, a2
+; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vs8r.v v8, (a6)
+; ZVBB-RV64-NEXT:    vl8re64.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8re64.v v8, (a6)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
 ; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    addi sp, sp, 80
 ; ZVBB-RV64-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv20f32_nxv4f32:
+; ZIP-LABEL: vector_interleave_nxv12f64_nxv2f64:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -80
 ; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -6297,458 +11686,637 @@ define <vscale x 20 x float> @vector_interleave_nxv20f32_nxv4f32(<vscale x 4 x f
 ; ZIP-NEXT:    mul a0, a0, a1
 ; ZIP-NEXT:    sub sp, sp, a0
 ; ZIP-NEXT:    andi sp, sp, -64
-; ZIP-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
-; ZIP-NEXT:    vmv2r.v v20, v16
-; ZIP-NEXT:    addi a0, sp, 64
-; ZIP-NEXT:    vmv2r.v v18, v12
+; ZIP-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZIP-NEXT:    vmv2r.v v20, v14
+; ZIP-NEXT:    vmv2r.v v22, v12
+; ZIP-NEXT:    vmv2r.v v24, v10
 ; ZIP-NEXT:    csrr a1, vlenb
-; ZIP-NEXT:    slli a2, a1, 2
-; ZIP-NEXT:    add a1, a2, a1
+; ZIP-NEXT:    li a0, 6
+; ZIP-NEXT:    mul a1, a1, a0
 ; ZIP-NEXT:    add a1, sp, a1
 ; ZIP-NEXT:    addi a1, a1, 64
+; ZIP-NEXT:    vmv1r.v v10, v25
+; ZIP-NEXT:    vmv1r.v v11, v23
+; ZIP-NEXT:    vmv1r.v v12, v21
+; ZIP-NEXT:    addi a0, sp, 64
+; ZIP-NEXT:    vmv1r.v v13, v17
 ; ZIP-NEXT:    csrr a2, vlenb
-; ZIP-NEXT:    vmv2r.v v16, v8
-; ZIP-NEXT:    vmv2r.v v22, v16
-; ZIP-NEXT:    vmv2r.v v24, v18
-; ZIP-NEXT:    vmv1r.v v26, v20
+; ZIP-NEXT:    vmv1r.v v14, v19
+; ZIP-NEXT:    vsseg6e64.v v9, (a1)
+; ZIP-NEXT:    vmv1r.v v9, v24
+; ZIP-NEXT:    add a5, a1, a2
+; ZIP-NEXT:    vmv1r.v v10, v22
 ; ZIP-NEXT:    add a3, a0, a2
-; ZIP-NEXT:    vmv1r.v v23, v10
-; ZIP-NEXT:    add a4, a1, a2
+; ZIP-NEXT:    vmv1r.v v11, v20
+; ZIP-NEXT:    add a4, a3, a2
+; ZIP-NEXT:    vmv1r.v v12, v16
+; ZIP-NEXT:    add a6, a5, a2
+; ZIP-NEXT:    vmv1r.v v13, v18
+; ZIP-NEXT:    vsseg6e64.v v8, (a0)
+; ZIP-NEXT:    vl1re64.v v14, (a1)
+; ZIP-NEXT:    add a1, a6, a2
+; ZIP-NEXT:    vl1re64.v v15, (a5)
+; ZIP-NEXT:    add a5, a1, a2
+; ZIP-NEXT:    vl1re64.v v18, (a5)
+; ZIP-NEXT:    add a5, a5, a2
+; ZIP-NEXT:    vl1re64.v v19, (a5)
 ; ZIP-NEXT:    add a5, a4, a2
-; ZIP-NEXT:    vmv1r.v v25, v14
+; ZIP-NEXT:    vl1re64.v v16, (a6)
 ; ZIP-NEXT:    add a6, a5, a2
-; ZIP-NEXT:    vmv1r.v v18, v11
-; ZIP-NEXT:    vsseg5e32.v v22, (a0)
-; ZIP-NEXT:    vmv1r.v v20, v15
-; ZIP-NEXT:    vsseg5e32.v v17, (a1)
-; ZIP-NEXT:    vl1re32.v v16, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re32.v v17, (a6)
-; ZIP-NEXT:    add a6, a3, a2
-; ZIP-NEXT:    vl1re32.v v10, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re32.v v11, (a6)
-; ZIP-NEXT:    vl1re32.v v8, (a0)
-; ZIP-NEXT:    vl1re32.v v9, (a3)
-; ZIP-NEXT:    vl1re32.v v14, (a4)
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    li a3, 10
-; ZIP-NEXT:    mul a0, a0, a3
-; ZIP-NEXT:    add a0, sp, a0
-; ZIP-NEXT:    addi a0, a0, 64
+; ZIP-NEXT:    vl1re64.v v12, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re32.v v15, (a5)
-; ZIP-NEXT:    vl1re32.v v12, (a6)
-; ZIP-NEXT:    vl1re32.v v13, (a1)
+; ZIP-NEXT:    vl1re64.v v13, (a6)
+; ZIP-NEXT:    csrr a6, vlenb
+; ZIP-NEXT:    li a7, 12
+; ZIP-NEXT:    mul a6, a6, a7
+; ZIP-NEXT:    add a6, sp, a6
+; ZIP-NEXT:    addi a6, a6, 64
+; ZIP-NEXT:    vl1re64.v v17, (a1)
+; ZIP-NEXT:    vl1re64.v v10, (a4)
+; ZIP-NEXT:    vl1re64.v v11, (a5)
+; ZIP-NEXT:    vl1re64.v v8, (a0)
+; ZIP-NEXT:    vl1re64.v v9, (a3)
 ; ZIP-NEXT:    slli a2, a2, 3
-; ZIP-NEXT:    add a2, a0, a2
-; ZIP-NEXT:    vs2r.v v16, (a2)
-; ZIP-NEXT:    vs8r.v v8, (a0)
-; ZIP-NEXT:    vl8re32.v v16, (a2)
-; ZIP-NEXT:    vl8re32.v v8, (a0)
+; ZIP-NEXT:    add a2, a6, a2
+; ZIP-NEXT:    vs4r.v v16, (a2)
+; ZIP-NEXT:    vs8r.v v8, (a6)
+; ZIP-NEXT:    vl8re64.v v16, (a2)
+; ZIP-NEXT:    vl8re64.v v8, (a6)
 ; ZIP-NEXT:    addi sp, s0, -80
 ; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    addi sp, sp, 80
 ; ZIP-NEXT:    ret
-  %res = call <vscale x 20 x float> @llvm.vector.interleave5.nxv20f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4)
-  ret <vscale x 20 x float> %res
+  %res = call <vscale x 12 x double> @llvm.vector.interleave6.nxv12f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5)
+  ret <vscale x 12 x double> %res
+}
+
+define <vscale x 14 x half> @vector_interleave_nxv14f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5, <vscale x 2 x half> %v6) nounwind {
+; CHECK-LABEL: vector_interleave_nxv14f16_nxv2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    add a4, a3, a2
+; CHECK-NEXT:    add a5, a4, a2
+; CHECK-NEXT:    add a6, a5, a2
+; CHECK-NEXT:    vsetvli a7, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsseg7e16.v v8, (a0)
+; CHECK-NEXT:    add a7, a6, a2
+; CHECK-NEXT:    vle16.v v8, (a7)
+; CHECK-NEXT:    vle16.v v10, (a6)
+; CHECK-NEXT:    add a6, a1, a1
+; CHECK-NEXT:    add a2, a7, a2
+; CHECK-NEXT:    vle16.v v12, (a5)
+; CHECK-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v8, a1
+; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v11, (a2)
+; CHECK-NEXT:    vle16.v v9, (a4)
+; CHECK-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v12, a1
+; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v12, (a3)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v12, a1
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv14f16_nxv2f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a2, a1, 1
+; ZVBB-NEXT:    srli a1, a1, 2
+; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    add a4, a3, a2
+; ZVBB-NEXT:    add a5, a4, a2
+; ZVBB-NEXT:    add a6, a5, a2
+; ZVBB-NEXT:    vsetvli a7, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsseg7e16.v v8, (a0)
+; ZVBB-NEXT:    add a7, a6, a2
+; ZVBB-NEXT:    vle16.v v8, (a7)
+; ZVBB-NEXT:    vle16.v v10, (a6)
+; ZVBB-NEXT:    add a6, a1, a1
+; ZVBB-NEXT:    add a2, a7, a2
+; ZVBB-NEXT:    vle16.v v12, (a5)
+; ZVBB-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v10, v8, a1
+; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v11, (a2)
+; ZVBB-NEXT:    vle16.v v9, (a4)
+; ZVBB-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v9, v12, a1
+; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v12, (a3)
+; ZVBB-NEXT:    vle16.v v8, (a0)
+; ZVBB-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v12, a1
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 14 x half> @llvm.vector.interleave7.nxv14f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5, <vscale x 2 x half> %v6)
+  ret <vscale x 14 x half> %res
 }
 
-define <vscale x 5 x double> @vector_interleave_nxv5f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4) nounwind {
-; CHECK-LABEL: vector_interleave_nxv5f64_nxv1f64:
+define <vscale x 28 x half> @vector_interleave_nxv28f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5, <vscale x 4 x half> %v6) nounwind {
+; CHECK-LABEL: vector_interleave_nxv28f16_nxv4f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 2
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    sub a0, a1, a0
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    add a3, a2, a1
-; CHECK-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vsseg5e64.v v8, (a0)
-; CHECK-NEXT:    vl1re64.v v10, (a3)
+; CHECK-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsseg7e16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v10, (a3)
 ; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    vl1re64.v v11, (a3)
-; CHECK-NEXT:    vl1re64.v v8, (a0)
-; CHECK-NEXT:    vl1re64.v v9, (a2)
-; CHECK-NEXT:    add a1, a3, a1
-; CHECK-NEXT:    vl1re64.v v12, (a1)
+; CHECK-NEXT:    vl1re16.v v11, (a3)
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    vl1re16.v v8, (a0)
+; CHECK-NEXT:    add a0, a3, a1
+; CHECK-NEXT:    vl1re16.v v9, (a2)
+; CHECK-NEXT:    vl1re16.v v12, (a3)
+; CHECK-NEXT:    vl1re16.v v13, (a0)
+; CHECK-NEXT:    add a0, a0, a1
+; CHECK-NEXT:    vl1re16.v v14, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 2
-; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    sub a0, a1, a0
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv5f64_nxv1f64:
+; ZVBB-LABEL: vector_interleave_nxv28f16_nxv4f16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 2
-; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    slli a1, a0, 3
+; ZVBB-NEXT:    sub a0, a1, a0
 ; ZVBB-NEXT:    sub sp, sp, a0
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    add a2, a0, a1
 ; ZVBB-NEXT:    add a3, a2, a1
-; ZVBB-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
-; ZVBB-NEXT:    vsseg5e64.v v8, (a0)
-; ZVBB-NEXT:    vl1re64.v v10, (a3)
+; ZVBB-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vsseg7e16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v10, (a3)
 ; ZVBB-NEXT:    add a3, a3, a1
-; ZVBB-NEXT:    vl1re64.v v11, (a3)
-; ZVBB-NEXT:    vl1re64.v v8, (a0)
-; ZVBB-NEXT:    vl1re64.v v9, (a2)
-; ZVBB-NEXT:    add a1, a3, a1
-; ZVBB-NEXT:    vl1re64.v v12, (a1)
+; ZVBB-NEXT:    vl1re16.v v11, (a3)
+; ZVBB-NEXT:    add a3, a3, a1
+; ZVBB-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-NEXT:    add a0, a3, a1
+; ZVBB-NEXT:    vl1re16.v v9, (a2)
+; ZVBB-NEXT:    vl1re16.v v12, (a3)
+; ZVBB-NEXT:    vl1re16.v v13, (a0)
+; ZVBB-NEXT:    add a0, a0, a1
+; ZVBB-NEXT:    vl1re16.v v14, (a0)
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 2
-; ZVBB-NEXT:    add a0, a1, a0
+; ZVBB-NEXT:    slli a1, a0, 3
+; ZVBB-NEXT:    sub a0, a1, a0
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 5 x double> @llvm.vector.interleave5.nxv5f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4)
-  ret <vscale x 5 x double> %res
+  %res = call <vscale x 28 x half> @llvm.vector.interleave7.nxv28f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5, <vscale x 4 x half> %v6)
+  ret <vscale x 28 x half> %res
 }
 
-define <vscale x 10 x double> @vector_interleave_nxv10f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4) nounwind {
-; RV32-LABEL: vector_interleave_nxv10f64_nxv2f64:
+define <vscale x 56 x half> @vector_interleave_nxv56f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5, <vscale x 8 x half> %v6) nounwind {
+; RV32-LABEL: vector_interleave_nxv56f16_nxv8f16:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -80
 ; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    addi s0, sp, 80
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a1, 28
-; RV32-NEXT:    mul a0, a0, a1
+; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vmv2r.v v20, v16
+; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT:    vmv2r.v v26, v20
 ; RV32-NEXT:    addi a0, sp, 64
-; RV32-NEXT:    vmv2r.v v18, v12
+; RV32-NEXT:    vmv2r.v v24, v16
 ; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 2
-; RV32-NEXT:    add a1, a2, a1
+; RV32-NEXT:    slli a2, a1, 3
+; RV32-NEXT:    sub a1, a2, a1
 ; RV32-NEXT:    add a1, sp, a1
 ; RV32-NEXT:    addi a1, a1, 64
+; RV32-NEXT:    vmv2r.v v22, v12
 ; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    vmv2r.v v16, v8
-; RV32-NEXT:    vmv2r.v v22, v16
-; RV32-NEXT:    vmv2r.v v24, v18
-; RV32-NEXT:    vmv1r.v v26, v20
+; RV32-NEXT:    vmv2r.v v20, v8
+; RV32-NEXT:    vmv1r.v v1, v20
+; RV32-NEXT:    vmv1r.v v3, v22
+; RV32-NEXT:    vmv1r.v v5, v24
+; RV32-NEXT:    vmv1r.v v7, v26
 ; RV32-NEXT:    add a3, a0, a2
-; RV32-NEXT:    vmv1r.v v23, v10
+; RV32-NEXT:    vmv1r.v v2, v10
 ; RV32-NEXT:    add a4, a1, a2
-; RV32-NEXT:    add a5, a4, a2
-; RV32-NEXT:    vmv1r.v v25, v14
-; RV32-NEXT:    add a6, a5, a2
-; RV32-NEXT:    vmv1r.v v18, v11
-; RV32-NEXT:    vsseg5e64.v v22, (a0)
-; RV32-NEXT:    vmv1r.v v20, v15
-; RV32-NEXT:    vsseg5e64.v v17, (a1)
-; RV32-NEXT:    vl1re64.v v16, (a6)
+; RV32-NEXT:    slli a5, a2, 2
+; RV32-NEXT:    vmv1r.v v4, v14
+; RV32-NEXT:    slli a6, a2, 4
+; RV32-NEXT:    add a7, a4, a2
+; RV32-NEXT:    vmv1r.v v6, v18
+; RV32-NEXT:    sub a5, a6, a5
+; RV32-NEXT:    vmv1r.v v22, v11
+; RV32-NEXT:    add a6, a7, a2
+; RV32-NEXT:    vmv1r.v v24, v15
+; RV32-NEXT:    vsseg7e16.v v1, (a0)
+; RV32-NEXT:    vmv1r.v v26, v19
+; RV32-NEXT:    vsseg7e16.v v21, (a1)
+; RV32-NEXT:    vl1re16.v v18, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v17, (a6)
+; RV32-NEXT:    vl1re16.v v19, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v20, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v21, (a6)
 ; RV32-NEXT:    add a6, a3, a2
-; RV32-NEXT:    vl1re64.v v10, (a6)
+; RV32-NEXT:    vl1re16.v v10, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v11, (a6)
-; RV32-NEXT:    vl1re64.v v8, (a0)
-; RV32-NEXT:    vl1re64.v v9, (a3)
-; RV32-NEXT:    vl1re64.v v14, (a4)
+; RV32-NEXT:    vl1re16.v v11, (a6)
+; RV32-NEXT:    vl1re16.v v8, (a0)
+; RV32-NEXT:    vl1re16.v v16, (a4)
+; RV32-NEXT:    vl1re16.v v9, (a3)
+; RV32-NEXT:    vl1re16.v v17, (a7)
 ; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a3, 10
+; RV32-NEXT:    li a3, 14
 ; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 64
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v15, (a5)
-; RV32-NEXT:    vl1re64.v v12, (a6)
-; RV32-NEXT:    vl1re64.v v13, (a1)
+; RV32-NEXT:    vl1re16.v v12, (a6)
+; RV32-NEXT:    add a6, a6, a2
+; RV32-NEXT:    vl1re16.v v13, (a6)
+; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, a0, a2
-; RV32-NEXT:    vs2r.v v16, (a2)
+; RV32-NEXT:    vl1re16.v v14, (a6)
+; RV32-NEXT:    vl1re16.v v15, (a1)
+; RV32-NEXT:    add a5, a0, a5
+; RV32-NEXT:    vs2r.v v20, (a5)
+; RV32-NEXT:    vs4r.v v16, (a2)
 ; RV32-NEXT:    vs8r.v v8, (a0)
-; RV32-NEXT:    vl8re64.v v16, (a2)
-; RV32-NEXT:    vl8re64.v v8, (a0)
+; RV32-NEXT:    vl8re16.v v16, (a2)
+; RV32-NEXT:    vl8re16.v v8, (a0)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; RV64-LABEL: vector_interleave_nxv56f16_nxv8f16:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -80
 ; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    addi s0, sp, 80
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a1, 28
-; RV64-NEXT:    mul a0, a0, a1
+; RV64-NEXT:    slli a0, a0, 5
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT:    vmv2r.v v20, v16
+; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT:    vmv2r.v v26, v20
 ; RV64-NEXT:    addi a0, sp, 64
-; RV64-NEXT:    vmv2r.v v18, v12
+; RV64-NEXT:    vmv2r.v v24, v16
 ; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 2
-; RV64-NEXT:    add a1, a2, a1
+; RV64-NEXT:    slli a2, a1, 3
+; RV64-NEXT:    sub a1, a2, a1
 ; RV64-NEXT:    add a1, sp, a1
 ; RV64-NEXT:    addi a1, a1, 64
+; RV64-NEXT:    vmv2r.v v22, v12
 ; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    vmv2r.v v16, v8
-; RV64-NEXT:    vmv2r.v v22, v16
-; RV64-NEXT:    vmv2r.v v24, v18
-; RV64-NEXT:    vmv1r.v v26, v20
+; RV64-NEXT:    vmv2r.v v20, v8
+; RV64-NEXT:    vmv1r.v v1, v20
+; RV64-NEXT:    vmv1r.v v3, v22
+; RV64-NEXT:    vmv1r.v v5, v24
+; RV64-NEXT:    vmv1r.v v7, v26
 ; RV64-NEXT:    add a3, a0, a2
-; RV64-NEXT:    vmv1r.v v23, v10
+; RV64-NEXT:    vmv1r.v v2, v10
 ; RV64-NEXT:    add a4, a1, a2
-; RV64-NEXT:    add a5, a4, a2
-; RV64-NEXT:    vmv1r.v v25, v14
-; RV64-NEXT:    add a6, a5, a2
-; RV64-NEXT:    vmv1r.v v18, v11
-; RV64-NEXT:    vsseg5e64.v v22, (a0)
-; RV64-NEXT:    vmv1r.v v20, v15
-; RV64-NEXT:    vsseg5e64.v v17, (a1)
-; RV64-NEXT:    vl1re64.v v16, (a6)
+; RV64-NEXT:    slli a5, a2, 2
+; RV64-NEXT:    vmv1r.v v4, v14
+; RV64-NEXT:    slli a6, a2, 4
+; RV64-NEXT:    add a7, a4, a2
+; RV64-NEXT:    vmv1r.v v6, v18
+; RV64-NEXT:    sub a5, a6, a5
+; RV64-NEXT:    vmv1r.v v22, v11
+; RV64-NEXT:    add a6, a7, a2
+; RV64-NEXT:    vmv1r.v v24, v15
+; RV64-NEXT:    vsseg7e16.v v1, (a0)
+; RV64-NEXT:    vmv1r.v v26, v19
+; RV64-NEXT:    vsseg7e16.v v21, (a1)
+; RV64-NEXT:    vl1re16.v v18, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v17, (a6)
+; RV64-NEXT:    vl1re16.v v19, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v20, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v21, (a6)
 ; RV64-NEXT:    add a6, a3, a2
-; RV64-NEXT:    vl1re64.v v10, (a6)
+; RV64-NEXT:    vl1re16.v v10, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v11, (a6)
-; RV64-NEXT:    vl1re64.v v8, (a0)
-; RV64-NEXT:    vl1re64.v v9, (a3)
-; RV64-NEXT:    vl1re64.v v14, (a4)
+; RV64-NEXT:    vl1re16.v v11, (a6)
+; RV64-NEXT:    vl1re16.v v8, (a0)
+; RV64-NEXT:    vl1re16.v v16, (a4)
+; RV64-NEXT:    vl1re16.v v9, (a3)
+; RV64-NEXT:    vl1re16.v v17, (a7)
 ; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a3, 10
+; RV64-NEXT:    li a3, 14
 ; RV64-NEXT:    mul a0, a0, a3
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 64
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v15, (a5)
-; RV64-NEXT:    vl1re64.v v12, (a6)
-; RV64-NEXT:    vl1re64.v v13, (a1)
+; RV64-NEXT:    vl1re16.v v12, (a6)
+; RV64-NEXT:    add a6, a6, a2
+; RV64-NEXT:    vl1re16.v v13, (a6)
+; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, a0, a2
-; RV64-NEXT:    vs2r.v v16, (a2)
+; RV64-NEXT:    vl1re16.v v14, (a6)
+; RV64-NEXT:    vl1re16.v v15, (a1)
+; RV64-NEXT:    add a5, a0, a5
+; RV64-NEXT:    vs2r.v v20, (a5)
+; RV64-NEXT:    vs4r.v v16, (a2)
 ; RV64-NEXT:    vs8r.v v8, (a0)
-; RV64-NEXT:    vl8re64.v v16, (a2)
-; RV64-NEXT:    vl8re64.v v8, (a0)
+; RV64-NEXT:    vl8re16.v v16, (a2)
+; RV64-NEXT:    vl8re16.v v8, (a0)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 80
 ; RV64-NEXT:    ret
 ;
-; ZVBB-RV32-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; ZVBB-RV32-LABEL: vector_interleave_nxv56f16_nxv8f16:
 ; ZVBB-RV32:       # %bb.0:
 ; ZVBB-RV32-NEXT:    addi sp, sp, -80
 ; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
 ; ZVBB-RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
 ; ZVBB-RV32-NEXT:    addi s0, sp, 80
 ; ZVBB-RV32-NEXT:    csrr a0, vlenb
-; ZVBB-RV32-NEXT:    li a1, 28
-; ZVBB-RV32-NEXT:    mul a0, a0, a1
+; ZVBB-RV32-NEXT:    slli a0, a0, 5
 ; ZVBB-RV32-NEXT:    sub sp, sp, a0
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
-; ZVBB-RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; ZVBB-RV32-NEXT:    vmv2r.v v20, v16
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV32-NEXT:    vmv2r.v v26, v20
 ; ZVBB-RV32-NEXT:    addi a0, sp, 64
-; ZVBB-RV32-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV32-NEXT:    vmv2r.v v24, v16
 ; ZVBB-RV32-NEXT:    csrr a1, vlenb
-; ZVBB-RV32-NEXT:    slli a2, a1, 2
-; ZVBB-RV32-NEXT:    add a1, a2, a1
+; ZVBB-RV32-NEXT:    slli a2, a1, 3
+; ZVBB-RV32-NEXT:    sub a1, a2, a1
 ; ZVBB-RV32-NEXT:    add a1, sp, a1
 ; ZVBB-RV32-NEXT:    addi a1, a1, 64
+; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
 ; ZVBB-RV32-NEXT:    csrr a2, vlenb
-; ZVBB-RV32-NEXT:    vmv2r.v v16, v8
-; ZVBB-RV32-NEXT:    vmv2r.v v22, v16
-; ZVBB-RV32-NEXT:    vmv2r.v v24, v18
-; ZVBB-RV32-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV32-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV32-NEXT:    vmv1r.v v1, v20
+; ZVBB-RV32-NEXT:    vmv1r.v v3, v22
+; ZVBB-RV32-NEXT:    vmv1r.v v5, v24
+; ZVBB-RV32-NEXT:    vmv1r.v v7, v26
 ; ZVBB-RV32-NEXT:    add a3, a0, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v23, v10
+; ZVBB-RV32-NEXT:    vmv1r.v v2, v10
 ; ZVBB-RV32-NEXT:    add a4, a1, a2
-; ZVBB-RV32-NEXT:    add a5, a4, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v25, v14
-; ZVBB-RV32-NEXT:    add a6, a5, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v18, v11
-; ZVBB-RV32-NEXT:    vsseg5e64.v v22, (a0)
-; ZVBB-RV32-NEXT:    vmv1r.v v20, v15
-; ZVBB-RV32-NEXT:    vsseg5e64.v v17, (a1)
-; ZVBB-RV32-NEXT:    vl1re64.v v16, (a6)
+; ZVBB-RV32-NEXT:    slli a5, a2, 2
+; ZVBB-RV32-NEXT:    vmv1r.v v4, v14
+; ZVBB-RV32-NEXT:    slli a6, a2, 4
+; ZVBB-RV32-NEXT:    add a7, a4, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v6, v18
+; ZVBB-RV32-NEXT:    sub a5, a6, a5
+; ZVBB-RV32-NEXT:    vmv1r.v v22, v11
+; ZVBB-RV32-NEXT:    add a6, a7, a2
+; ZVBB-RV32-NEXT:    vmv1r.v v24, v15
+; ZVBB-RV32-NEXT:    vsseg7e16.v v1, (a0)
+; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
+; ZVBB-RV32-NEXT:    vsseg7e16.v v21, (a1)
+; ZVBB-RV32-NEXT:    vl1re16.v v18, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v17, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v19, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v20, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v21, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a3, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v10, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v10, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v11, (a6)
-; ZVBB-RV32-NEXT:    vl1re64.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl1re64.v v9, (a3)
-; ZVBB-RV32-NEXT:    vl1re64.v v14, (a4)
+; ZVBB-RV32-NEXT:    vl1re16.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re16.v v16, (a4)
+; ZVBB-RV32-NEXT:    vl1re16.v v9, (a3)
+; ZVBB-RV32-NEXT:    vl1re16.v v17, (a7)
 ; ZVBB-RV32-NEXT:    csrr a0, vlenb
-; ZVBB-RV32-NEXT:    li a3, 10
+; ZVBB-RV32-NEXT:    li a3, 14
 ; ZVBB-RV32-NEXT:    mul a0, a0, a3
 ; ZVBB-RV32-NEXT:    add a0, sp, a0
 ; ZVBB-RV32-NEXT:    addi a0, a0, 64
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v15, (a5)
-; ZVBB-RV32-NEXT:    vl1re64.v v12, (a6)
-; ZVBB-RV32-NEXT:    vl1re64.v v13, (a1)
+; ZVBB-RV32-NEXT:    vl1re16.v v12, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
+; ZVBB-RV32-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
 ; ZVBB-RV32-NEXT:    add a2, a0, a2
-; ZVBB-RV32-NEXT:    vs2r.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl1re16.v v14, (a6)
+; ZVBB-RV32-NEXT:    vl1re16.v v15, (a1)
+; ZVBB-RV32-NEXT:    add a5, a0, a5
+; ZVBB-RV32-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
 ; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl8re64.v v16, (a2)
-; ZVBB-RV32-NEXT:    vl8re64.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8re16.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8re16.v v8, (a0)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
 ; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    addi sp, sp, 80
 ; ZVBB-RV32-NEXT:    ret
 ;
-; ZVBB-RV64-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; ZVBB-RV64-LABEL: vector_interleave_nxv56f16_nxv8f16:
 ; ZVBB-RV64:       # %bb.0:
 ; ZVBB-RV64-NEXT:    addi sp, sp, -80
 ; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; ZVBB-RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; ZVBB-RV64-NEXT:    addi s0, sp, 80
 ; ZVBB-RV64-NEXT:    csrr a0, vlenb
-; ZVBB-RV64-NEXT:    li a1, 28
-; ZVBB-RV64-NEXT:    mul a0, a0, a1
+; ZVBB-RV64-NEXT:    slli a0, a0, 5
 ; ZVBB-RV64-NEXT:    sub sp, sp, a0
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
-; ZVBB-RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; ZVBB-RV64-NEXT:    vmv2r.v v20, v16
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV64-NEXT:    vmv2r.v v26, v20
 ; ZVBB-RV64-NEXT:    addi a0, sp, 64
-; ZVBB-RV64-NEXT:    vmv2r.v v18, v12
+; ZVBB-RV64-NEXT:    vmv2r.v v24, v16
 ; ZVBB-RV64-NEXT:    csrr a1, vlenb
-; ZVBB-RV64-NEXT:    slli a2, a1, 2
-; ZVBB-RV64-NEXT:    add a1, a2, a1
+; ZVBB-RV64-NEXT:    slli a2, a1, 3
+; ZVBB-RV64-NEXT:    sub a1, a2, a1
 ; ZVBB-RV64-NEXT:    add a1, sp, a1
 ; ZVBB-RV64-NEXT:    addi a1, a1, 64
+; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
 ; ZVBB-RV64-NEXT:    csrr a2, vlenb
-; ZVBB-RV64-NEXT:    vmv2r.v v16, v8
-; ZVBB-RV64-NEXT:    vmv2r.v v22, v16
-; ZVBB-RV64-NEXT:    vmv2r.v v24, v18
-; ZVBB-RV64-NEXT:    vmv1r.v v26, v20
+; ZVBB-RV64-NEXT:    vmv2r.v v20, v8
+; ZVBB-RV64-NEXT:    vmv1r.v v1, v20
+; ZVBB-RV64-NEXT:    vmv1r.v v3, v22
+; ZVBB-RV64-NEXT:    vmv1r.v v5, v24
+; ZVBB-RV64-NEXT:    vmv1r.v v7, v26
 ; ZVBB-RV64-NEXT:    add a3, a0, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v23, v10
+; ZVBB-RV64-NEXT:    vmv1r.v v2, v10
 ; ZVBB-RV64-NEXT:    add a4, a1, a2
-; ZVBB-RV64-NEXT:    add a5, a4, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v25, v14
-; ZVBB-RV64-NEXT:    add a6, a5, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v18, v11
-; ZVBB-RV64-NEXT:    vsseg5e64.v v22, (a0)
-; ZVBB-RV64-NEXT:    vmv1r.v v20, v15
-; ZVBB-RV64-NEXT:    vsseg5e64.v v17, (a1)
-; ZVBB-RV64-NEXT:    vl1re64.v v16, (a6)
+; ZVBB-RV64-NEXT:    slli a5, a2, 2
+; ZVBB-RV64-NEXT:    vmv1r.v v4, v14
+; ZVBB-RV64-NEXT:    slli a6, a2, 4
+; ZVBB-RV64-NEXT:    add a7, a4, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v6, v18
+; ZVBB-RV64-NEXT:    sub a5, a6, a5
+; ZVBB-RV64-NEXT:    vmv1r.v v22, v11
+; ZVBB-RV64-NEXT:    add a6, a7, a2
+; ZVBB-RV64-NEXT:    vmv1r.v v24, v15
+; ZVBB-RV64-NEXT:    vsseg7e16.v v1, (a0)
+; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
+; ZVBB-RV64-NEXT:    vsseg7e16.v v21, (a1)
+; ZVBB-RV64-NEXT:    vl1re16.v v18, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v17, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v19, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v20, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v21, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a3, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v10, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v10, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v11, (a6)
-; ZVBB-RV64-NEXT:    vl1re64.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl1re64.v v9, (a3)
-; ZVBB-RV64-NEXT:    vl1re64.v v14, (a4)
+; ZVBB-RV64-NEXT:    vl1re16.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re16.v v16, (a4)
+; ZVBB-RV64-NEXT:    vl1re16.v v9, (a3)
+; ZVBB-RV64-NEXT:    vl1re16.v v17, (a7)
 ; ZVBB-RV64-NEXT:    csrr a0, vlenb
-; ZVBB-RV64-NEXT:    li a3, 10
+; ZVBB-RV64-NEXT:    li a3, 14
 ; ZVBB-RV64-NEXT:    mul a0, a0, a3
 ; ZVBB-RV64-NEXT:    add a0, sp, a0
 ; ZVBB-RV64-NEXT:    addi a0, a0, 64
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v15, (a5)
-; ZVBB-RV64-NEXT:    vl1re64.v v12, (a6)
-; ZVBB-RV64-NEXT:    vl1re64.v v13, (a1)
+; ZVBB-RV64-NEXT:    vl1re16.v v12, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
+; ZVBB-RV64-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
 ; ZVBB-RV64-NEXT:    add a2, a0, a2
-; ZVBB-RV64-NEXT:    vs2r.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl1re16.v v14, (a6)
+; ZVBB-RV64-NEXT:    vl1re16.v v15, (a1)
+; ZVBB-RV64-NEXT:    add a5, a0, a5
+; ZVBB-RV64-NEXT:    vs2r.v v20, (a5)
+; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
 ; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl8re64.v v16, (a2)
-; ZVBB-RV64-NEXT:    vl8re64.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8re16.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8re16.v v8, (a0)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
 ; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    addi sp, sp, 80
 ; ZVBB-RV64-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv10f64_nxv2f64:
+; ZIP-LABEL: vector_interleave_nxv56f16_nxv8f16:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -80
 ; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
 ; ZIP-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
 ; ZIP-NEXT:    addi s0, sp, 80
 ; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    li a1, 28
-; ZIP-NEXT:    mul a0, a0, a1
+; ZIP-NEXT:    slli a0, a0, 5
 ; ZIP-NEXT:    sub sp, sp, a0
 ; ZIP-NEXT:    andi sp, sp, -64
-; ZIP-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; ZIP-NEXT:    vmv2r.v v20, v16
+; ZIP-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT:    vmv2r.v v26, v20
 ; ZIP-NEXT:    addi a0, sp, 64
-; ZIP-NEXT:    vmv2r.v v18, v12
+; ZIP-NEXT:    vmv2r.v v24, v16
 ; ZIP-NEXT:    csrr a1, vlenb
-; ZIP-NEXT:    slli a2, a1, 2
-; ZIP-NEXT:    add a1, a2, a1
+; ZIP-NEXT:    slli a2, a1, 3
+; ZIP-NEXT:    sub a1, a2, a1
 ; ZIP-NEXT:    add a1, sp, a1
 ; ZIP-NEXT:    addi a1, a1, 64
+; ZIP-NEXT:    vmv2r.v v22, v12
 ; ZIP-NEXT:    csrr a2, vlenb
-; ZIP-NEXT:    vmv2r.v v16, v8
-; ZIP-NEXT:    vmv2r.v v22, v16
-; ZIP-NEXT:    vmv2r.v v24, v18
-; ZIP-NEXT:    vmv1r.v v26, v20
+; ZIP-NEXT:    vmv2r.v v20, v8
+; ZIP-NEXT:    vmv1r.v v1, v20
+; ZIP-NEXT:    vmv1r.v v3, v22
+; ZIP-NEXT:    vmv1r.v v5, v24
+; ZIP-NEXT:    vmv1r.v v7, v26
 ; ZIP-NEXT:    add a3, a0, a2
-; ZIP-NEXT:    vmv1r.v v23, v10
+; ZIP-NEXT:    vmv1r.v v2, v10
 ; ZIP-NEXT:    add a4, a1, a2
-; ZIP-NEXT:    add a5, a4, a2
-; ZIP-NEXT:    vmv1r.v v25, v14
-; ZIP-NEXT:    add a6, a5, a2
-; ZIP-NEXT:    vmv1r.v v18, v11
-; ZIP-NEXT:    vsseg5e64.v v22, (a0)
-; ZIP-NEXT:    vmv1r.v v20, v15
-; ZIP-NEXT:    vsseg5e64.v v17, (a1)
-; ZIP-NEXT:    vl1re64.v v16, (a6)
+; ZIP-NEXT:    slli a5, a2, 2
+; ZIP-NEXT:    vmv1r.v v4, v14
+; ZIP-NEXT:    slli a6, a2, 4
+; ZIP-NEXT:    add a7, a4, a2
+; ZIP-NEXT:    vmv1r.v v6, v18
+; ZIP-NEXT:    sub a5, a6, a5
+; ZIP-NEXT:    vmv1r.v v22, v11
+; ZIP-NEXT:    add a6, a7, a2
+; ZIP-NEXT:    vmv1r.v v24, v15
+; ZIP-NEXT:    vsseg7e16.v v1, (a0)
+; ZIP-NEXT:    vmv1r.v v26, v19
+; ZIP-NEXT:    vsseg7e16.v v21, (a1)
+; ZIP-NEXT:    vl1re16.v v18, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re64.v v17, (a6)
+; ZIP-NEXT:    vl1re16.v v19, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v20, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v21, (a6)
 ; ZIP-NEXT:    add a6, a3, a2
-; ZIP-NEXT:    vl1re64.v v10, (a6)
+; ZIP-NEXT:    vl1re16.v v10, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re64.v v11, (a6)
-; ZIP-NEXT:    vl1re64.v v8, (a0)
-; ZIP-NEXT:    vl1re64.v v9, (a3)
-; ZIP-NEXT:    vl1re64.v v14, (a4)
+; ZIP-NEXT:    vl1re16.v v11, (a6)
+; ZIP-NEXT:    vl1re16.v v8, (a0)
+; ZIP-NEXT:    vl1re16.v v16, (a4)
+; ZIP-NEXT:    vl1re16.v v9, (a3)
+; ZIP-NEXT:    vl1re16.v v17, (a7)
 ; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    li a3, 10
+; ZIP-NEXT:    li a3, 14
 ; ZIP-NEXT:    mul a0, a0, a3
 ; ZIP-NEXT:    add a0, sp, a0
 ; ZIP-NEXT:    addi a0, a0, 64
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re64.v v15, (a5)
-; ZIP-NEXT:    vl1re64.v v12, (a6)
-; ZIP-NEXT:    vl1re64.v v13, (a1)
+; ZIP-NEXT:    vl1re16.v v12, (a6)
+; ZIP-NEXT:    add a6, a6, a2
+; ZIP-NEXT:    vl1re16.v v13, (a6)
+; ZIP-NEXT:    add a6, a6, a2
 ; ZIP-NEXT:    slli a2, a2, 3
 ; ZIP-NEXT:    add a2, a0, a2
-; ZIP-NEXT:    vs2r.v v16, (a2)
+; ZIP-NEXT:    vl1re16.v v14, (a6)
+; ZIP-NEXT:    vl1re16.v v15, (a1)
+; ZIP-NEXT:    add a5, a0, a5
+; ZIP-NEXT:    vs2r.v v20, (a5)
+; ZIP-NEXT:    vs4r.v v16, (a2)
 ; ZIP-NEXT:    vs8r.v v8, (a0)
-; ZIP-NEXT:    vl8re64.v v16, (a2)
-; ZIP-NEXT:    vl8re64.v v8, (a0)
+; ZIP-NEXT:    vl8re16.v v16, (a2)
+; ZIP-NEXT:    vl8re16.v v8, (a0)
 ; ZIP-NEXT:    addi sp, s0, -80
 ; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    addi sp, sp, 80
 ; ZIP-NEXT:    ret
-  %res = call <vscale x 10 x double> @llvm.vector.interleave5.nxv10f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4)
-  ret <vscale x 10 x double> %res
+  %res = call <vscale x 56 x half> @llvm.vector.interleave7.nxv56f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5, <vscale x 8 x half> %v6)
+  ret <vscale x 56 x half> %res
 }
 
-define <vscale x 14 x half> @vector_interleave_nxv14f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5, <vscale x 2 x half> %v6) nounwind {
-; CHECK-LABEL: vector_interleave_nxv14f16_nxv2f16:
+define <vscale x 14 x bfloat> @vector_interleave_nxv14bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5, <vscale x 2 x bfloat> %v6) nounwind {
+; CHECK-LABEL: vector_interleave_nxv14bf16_nxv2bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -6788,7 +12356,7 @@ define <vscale x 14 x half> @vector_interleave_nxv14f16_nxv2f16(<vscale x 2 x ha
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv14f16_nxv2f16:
+; ZVBB-LABEL: vector_interleave_nxv14bf16_nxv2bf16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
@@ -6826,13 +12394,13 @@ define <vscale x 14 x half> @vector_interleave_nxv14f16_nxv2f16(<vscale x 2 x ha
 ; ZVBB-NEXT:    slli a0, a0, 2
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    ret
-  %res = call <vscale x 14 x half> @llvm.vector.interleave7.nxv14f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5, <vscale x 2 x half> %v6)
-  ret <vscale x 14 x half> %res
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 14 x bfloat> @llvm.vector.interleave7.nxv14bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5, <vscale x 2 x bfloat> %v6)
+  ret <vscale x 14 x bfloat> %res
 }
 
-define <vscale x 28 x half> @vector_interleave_nxv28f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5, <vscale x 4 x half> %v6) nounwind {
-; CHECK-LABEL: vector_interleave_nxv28f16_nxv4f16:
+define <vscale x 28 x bfloat> @vector_interleave_nxv28bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5, <vscale x 4 x bfloat> %v6) nounwind {
+; CHECK-LABEL: vector_interleave_nxv28bf16_nxv4bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -6863,7 +12431,7 @@ define <vscale x 28 x half> @vector_interleave_nxv28f16_nxv4f16(<vscale x 4 x ha
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv28f16_nxv4f16:
+; ZVBB-LABEL: vector_interleave_nxv28bf16_nxv4bf16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
@@ -6893,12 +12461,12 @@ define <vscale x 28 x half> @vector_interleave_nxv28f16_nxv4f16(<vscale x 4 x ha
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 28 x half> @llvm.vector.interleave7.nxv28f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5, <vscale x 4 x half> %v6)
-  ret <vscale x 28 x half> %res
+  %res = call <vscale x 28 x bfloat> @llvm.vector.interleave7.nxv28bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5, <vscale x 4 x bfloat> %v6)
+  ret <vscale x 28 x bfloat> %res
 }
 
-define <vscale x 56 x half> @vector_interleave_nxv56f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5, <vscale x 8 x half> %v6) nounwind {
-; RV32-LABEL: vector_interleave_nxv56f16_nxv8f16:
+define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5, <vscale x 8 x bfloat> %v6) nounwind {
+; RV32-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -80
 ; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
@@ -6980,7 +12548,7 @@ define <vscale x 56 x half> @vector_interleave_nxv56f16_nxv8f16(<vscale x 8 x ha
 ; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: vector_interleave_nxv56f16_nxv8f16:
+; RV64-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -80
 ; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -7062,7 +12630,7 @@ define <vscale x 56 x half> @vector_interleave_nxv56f16_nxv8f16(<vscale x 8 x ha
 ; RV64-NEXT:    addi sp, sp, 80
 ; RV64-NEXT:    ret
 ;
-; ZVBB-RV32-LABEL: vector_interleave_nxv56f16_nxv8f16:
+; ZVBB-RV32-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
 ; ZVBB-RV32:       # %bb.0:
 ; ZVBB-RV32-NEXT:    addi sp, sp, -80
 ; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
@@ -7144,7 +12712,7 @@ define <vscale x 56 x half> @vector_interleave_nxv56f16_nxv8f16(<vscale x 8 x ha
 ; ZVBB-RV32-NEXT:    addi sp, sp, 80
 ; ZVBB-RV32-NEXT:    ret
 ;
-; ZVBB-RV64-LABEL: vector_interleave_nxv56f16_nxv8f16:
+; ZVBB-RV64-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
 ; ZVBB-RV64:       # %bb.0:
 ; ZVBB-RV64-NEXT:    addi sp, sp, -80
 ; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -7226,7 +12794,7 @@ define <vscale x 56 x half> @vector_interleave_nxv56f16_nxv8f16(<vscale x 8 x ha
 ; ZVBB-RV64-NEXT:    addi sp, sp, 80
 ; ZVBB-RV64-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv56f16_nxv8f16:
+; ZIP-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -80
 ; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -7307,12 +12875,12 @@ define <vscale x 56 x half> @vector_interleave_nxv56f16_nxv8f16(<vscale x 8 x ha
 ; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    addi sp, sp, 80
 ; ZIP-NEXT:    ret
-  %res = call <vscale x 56 x half> @llvm.vector.interleave7.nxv56f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5, <vscale x 8 x half> %v6)
-  ret <vscale x 56 x half> %res
+  %res = call <vscale x 56 x bfloat> @llvm.vector.interleave7.nxv56bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5, <vscale x 8 x bfloat> %v6)
+  ret <vscale x 56 x bfloat> %res
 }
 
-define <vscale x 14 x bfloat> @vector_interleave_nxv14bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5, <vscale x 2 x bfloat> %v6) nounwind {
-; CHECK-LABEL: vector_interleave_nxv14bf16_nxv2bf16:
+define <vscale x 7 x float> @vector_interleave_nxv7f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5, <vscale x 1 x float> %v6) nounwind {
+; CHECK-LABEL: vector_interleave_nxv7f32_nxv1f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -7321,30 +12889,30 @@ define <vscale x 14 x bfloat> @vector_interleave_nxv14bf16_nxv2bf16(<vscale x 2
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a2, a1, 1
-; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    srli a1, a1, 3
 ; CHECK-NEXT:    add a3, a0, a2
 ; CHECK-NEXT:    add a4, a3, a2
 ; CHECK-NEXT:    add a5, a4, a2
 ; CHECK-NEXT:    add a6, a5, a2
-; CHECK-NEXT:    vsetvli a7, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vsseg7e16.v v8, (a0)
+; CHECK-NEXT:    vsetvli a7, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsseg7e32.v v8, (a0)
 ; CHECK-NEXT:    add a7, a6, a2
-; CHECK-NEXT:    vle16.v v8, (a7)
-; CHECK-NEXT:    vle16.v v10, (a6)
+; CHECK-NEXT:    vle32.v v8, (a7)
+; CHECK-NEXT:    vle32.v v10, (a6)
 ; CHECK-NEXT:    add a6, a1, a1
 ; CHECK-NEXT:    add a2, a7, a2
-; CHECK-NEXT:    vle16.v v12, (a5)
-; CHECK-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT:    vle32.v v12, (a5)
+; CHECK-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v10, v8, a1
-; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v11, (a2)
-; CHECK-NEXT:    vle16.v v9, (a4)
-; CHECK-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v11, (a2)
+; CHECK-NEXT:    vle32.v v9, (a4)
+; CHECK-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v9, v12, a1
-; CHECK-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; CHECK-NEXT:    vle16.v v12, (a3)
-; CHECK-NEXT:    vle16.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; CHECK-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v12, (a3)
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
 ; CHECK-NEXT:    vslideup.vx v8, v12, a1
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 2
@@ -7352,7 +12920,7 @@ define <vscale x 14 x bfloat> @vector_interleave_nxv14bf16_nxv2bf16(<vscale x 2
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv14bf16_nxv2bf16:
+; ZVBB-LABEL: vector_interleave_nxv7f32_nxv1f32:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
@@ -7361,42 +12929,42 @@ define <vscale x 14 x bfloat> @vector_interleave_nxv14bf16_nxv2bf16(<vscale x 2
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a2, a1, 1
-; ZVBB-NEXT:    srli a1, a1, 2
+; ZVBB-NEXT:    srli a1, a1, 3
 ; ZVBB-NEXT:    add a3, a0, a2
 ; ZVBB-NEXT:    add a4, a3, a2
 ; ZVBB-NEXT:    add a5, a4, a2
 ; ZVBB-NEXT:    add a6, a5, a2
-; ZVBB-NEXT:    vsetvli a7, zero, e16, mf2, ta, ma
-; ZVBB-NEXT:    vsseg7e16.v v8, (a0)
+; ZVBB-NEXT:    vsetvli a7, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vsseg7e32.v v8, (a0)
 ; ZVBB-NEXT:    add a7, a6, a2
-; ZVBB-NEXT:    vle16.v v8, (a7)
-; ZVBB-NEXT:    vle16.v v10, (a6)
+; ZVBB-NEXT:    vle32.v v8, (a7)
+; ZVBB-NEXT:    vle32.v v10, (a6)
 ; ZVBB-NEXT:    add a6, a1, a1
 ; ZVBB-NEXT:    add a2, a7, a2
-; ZVBB-NEXT:    vle16.v v12, (a5)
-; ZVBB-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT:    vle32.v v12, (a5)
+; ZVBB-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v10, v8, a1
-; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
-; ZVBB-NEXT:    vle16.v v11, (a2)
-; ZVBB-NEXT:    vle16.v v9, (a4)
-; ZVBB-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vle32.v v11, (a2)
+; ZVBB-NEXT:    vle32.v v9, (a4)
+; ZVBB-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v9, v12, a1
-; ZVBB-NEXT:    vsetvli a2, zero, e16, mf2, ta, ma
-; ZVBB-NEXT:    vle16.v v12, (a3)
-; ZVBB-NEXT:    vle16.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a6, e16, m1, ta, ma
+; ZVBB-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vle32.v v12, (a3)
+; ZVBB-NEXT:    vle32.v v8, (a0)
+; ZVBB-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
 ; ZVBB-NEXT:    vslideup.vx v8, v12, a1
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 2
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 14 x bfloat> @llvm.vector.interleave7.nxv14bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5, <vscale x 2 x bfloat> %v6)
-  ret <vscale x 14 x bfloat> %res
+  %res = call <vscale x 7 x float> @llvm.vector.interleave7.nxv7f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5, <vscale x 1 x float> %v6)
+  ret <vscale x 7 x float> %res
 }
 
-define <vscale x 28 x bfloat> @vector_interleave_nxv28bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5, <vscale x 4 x bfloat> %v6) nounwind {
-; CHECK-LABEL: vector_interleave_nxv28bf16_nxv4bf16:
+define <vscale x 14 x float> @vector_interleave_nxv14f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5, <vscale x 2 x float> %v6) nounwind {
+; CHECK-LABEL: vector_interleave_nxv14f32_nxv2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -7407,19 +12975,19 @@ define <vscale x 28 x bfloat> @vector_interleave_nxv28bf16_nxv4bf16(<vscale x 4
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    add a3, a2, a1
-; CHECK-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
-; CHECK-NEXT:    vsseg7e16.v v8, (a0)
-; CHECK-NEXT:    vl1re16.v v10, (a3)
+; CHECK-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsseg7e32.v v8, (a0)
+; CHECK-NEXT:    vl1re32.v v10, (a3)
 ; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    vl1re16.v v11, (a3)
+; CHECK-NEXT:    vl1re32.v v11, (a3)
 ; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    vl1re16.v v8, (a0)
+; CHECK-NEXT:    vl1re32.v v8, (a0)
 ; CHECK-NEXT:    add a0, a3, a1
-; CHECK-NEXT:    vl1re16.v v9, (a2)
-; CHECK-NEXT:    vl1re16.v v12, (a3)
-; CHECK-NEXT:    vl1re16.v v13, (a0)
+; CHECK-NEXT:    vl1re32.v v9, (a2)
+; CHECK-NEXT:    vl1re32.v v12, (a3)
+; CHECK-NEXT:    vl1re32.v v13, (a0)
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl1re16.v v14, (a0)
+; CHECK-NEXT:    vl1re32.v v14, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a1, a0, 3
 ; CHECK-NEXT:    sub a0, a1, a0
@@ -7427,7 +12995,7 @@ define <vscale x 28 x bfloat> @vector_interleave_nxv28bf16_nxv4bf16(<vscale x 4
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv28bf16_nxv4bf16:
+; ZVBB-LABEL: vector_interleave_nxv14f32_nxv2f32:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
@@ -7438,31 +13006,31 @@ define <vscale x 28 x bfloat> @vector_interleave_nxv28bf16_nxv4bf16(<vscale x 4
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    add a2, a0, a1
 ; ZVBB-NEXT:    add a3, a2, a1
-; ZVBB-NEXT:    vsetvli a4, zero, e16, m1, ta, ma
-; ZVBB-NEXT:    vsseg7e16.v v8, (a0)
-; ZVBB-NEXT:    vl1re16.v v10, (a3)
+; ZVBB-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
+; ZVBB-NEXT:    vsseg7e32.v v8, (a0)
+; ZVBB-NEXT:    vl1re32.v v10, (a3)
 ; ZVBB-NEXT:    add a3, a3, a1
-; ZVBB-NEXT:    vl1re16.v v11, (a3)
+; ZVBB-NEXT:    vl1re32.v v11, (a3)
 ; ZVBB-NEXT:    add a3, a3, a1
-; ZVBB-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-NEXT:    vl1re32.v v8, (a0)
 ; ZVBB-NEXT:    add a0, a3, a1
-; ZVBB-NEXT:    vl1re16.v v9, (a2)
-; ZVBB-NEXT:    vl1re16.v v12, (a3)
-; ZVBB-NEXT:    vl1re16.v v13, (a0)
+; ZVBB-NEXT:    vl1re32.v v9, (a2)
+; ZVBB-NEXT:    vl1re32.v v12, (a3)
+; ZVBB-NEXT:    vl1re32.v v13, (a0)
 ; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl1re16.v v14, (a0)
+; ZVBB-NEXT:    vl1re32.v v14, (a0)
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a1, a0, 3
 ; ZVBB-NEXT:    sub a0, a1, a0
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 28 x bfloat> @llvm.vector.interleave7.nxv28bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5, <vscale x 4 x bfloat> %v6)
-  ret <vscale x 28 x bfloat> %res
+  %res = call <vscale x 14 x float> @llvm.vector.interleave7.nxv14f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5, <vscale x 2 x float> %v6)
+  ret <vscale x 14 x float> %res
 }
 
-define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5, <vscale x 8 x bfloat> %v6) nounwind {
-; RV32-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
+define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5, <vscale x 4 x float> %v6) nounwind {
+; RV32-LABEL: vector_interleave_nxv28f32_nxv4f32:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -80
 ; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
@@ -7472,7 +13040,7 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; RV32-NEXT:    vmv2r.v v26, v20
 ; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    vmv2r.v v24, v16
@@ -7500,51 +13068,51 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
 ; RV32-NEXT:    vmv1r.v v22, v11
 ; RV32-NEXT:    add a6, a7, a2
 ; RV32-NEXT:    vmv1r.v v24, v15
-; RV32-NEXT:    vsseg7e16.v v1, (a0)
+; RV32-NEXT:    vsseg7e32.v v1, (a0)
 ; RV32-NEXT:    vmv1r.v v26, v19
-; RV32-NEXT:    vsseg7e16.v v21, (a1)
-; RV32-NEXT:    vl1re16.v v18, (a6)
+; RV32-NEXT:    vsseg7e32.v v21, (a1)
+; RV32-NEXT:    vl1re32.v v18, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v19, (a6)
+; RV32-NEXT:    vl1re32.v v19, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v20, (a6)
+; RV32-NEXT:    vl1re32.v v20, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v21, (a6)
+; RV32-NEXT:    vl1re32.v v21, (a6)
 ; RV32-NEXT:    add a6, a3, a2
-; RV32-NEXT:    vl1re16.v v10, (a6)
+; RV32-NEXT:    vl1re32.v v10, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v11, (a6)
-; RV32-NEXT:    vl1re16.v v8, (a0)
-; RV32-NEXT:    vl1re16.v v16, (a4)
-; RV32-NEXT:    vl1re16.v v9, (a3)
-; RV32-NEXT:    vl1re16.v v17, (a7)
+; RV32-NEXT:    vl1re32.v v11, (a6)
+; RV32-NEXT:    vl1re32.v v8, (a0)
+; RV32-NEXT:    vl1re32.v v16, (a4)
+; RV32-NEXT:    vl1re32.v v9, (a3)
+; RV32-NEXT:    vl1re32.v v17, (a7)
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a3, 14
 ; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 64
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v12, (a6)
+; RV32-NEXT:    vl1re32.v v12, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re16.v v13, (a6)
+; RV32-NEXT:    vl1re32.v v13, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, a0, a2
-; RV32-NEXT:    vl1re16.v v14, (a6)
-; RV32-NEXT:    vl1re16.v v15, (a1)
+; RV32-NEXT:    vl1re32.v v14, (a6)
+; RV32-NEXT:    vl1re32.v v15, (a1)
 ; RV32-NEXT:    add a5, a0, a5
 ; RV32-NEXT:    vs2r.v v20, (a5)
 ; RV32-NEXT:    vs4r.v v16, (a2)
 ; RV32-NEXT:    vs8r.v v8, (a0)
-; RV32-NEXT:    vl8re16.v v16, (a2)
-; RV32-NEXT:    vl8re16.v v8, (a0)
+; RV32-NEXT:    vl8re32.v v16, (a2)
+; RV32-NEXT:    vl8re32.v v8, (a0)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
+; RV64-LABEL: vector_interleave_nxv28f32_nxv4f32:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -80
 ; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -7554,7 +13122,7 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
 ; RV64-NEXT:    slli a0, a0, 5
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; RV64-NEXT:    vmv2r.v v26, v20
 ; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    vmv2r.v v24, v16
@@ -7582,51 +13150,51 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
 ; RV64-NEXT:    vmv1r.v v22, v11
 ; RV64-NEXT:    add a6, a7, a2
 ; RV64-NEXT:    vmv1r.v v24, v15
-; RV64-NEXT:    vsseg7e16.v v1, (a0)
+; RV64-NEXT:    vsseg7e32.v v1, (a0)
 ; RV64-NEXT:    vmv1r.v v26, v19
-; RV64-NEXT:    vsseg7e16.v v21, (a1)
-; RV64-NEXT:    vl1re16.v v18, (a6)
+; RV64-NEXT:    vsseg7e32.v v21, (a1)
+; RV64-NEXT:    vl1re32.v v18, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v19, (a6)
+; RV64-NEXT:    vl1re32.v v19, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v20, (a6)
+; RV64-NEXT:    vl1re32.v v20, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v21, (a6)
+; RV64-NEXT:    vl1re32.v v21, (a6)
 ; RV64-NEXT:    add a6, a3, a2
-; RV64-NEXT:    vl1re16.v v10, (a6)
+; RV64-NEXT:    vl1re32.v v10, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v11, (a6)
-; RV64-NEXT:    vl1re16.v v8, (a0)
-; RV64-NEXT:    vl1re16.v v16, (a4)
-; RV64-NEXT:    vl1re16.v v9, (a3)
-; RV64-NEXT:    vl1re16.v v17, (a7)
+; RV64-NEXT:    vl1re32.v v11, (a6)
+; RV64-NEXT:    vl1re32.v v8, (a0)
+; RV64-NEXT:    vl1re32.v v16, (a4)
+; RV64-NEXT:    vl1re32.v v9, (a3)
+; RV64-NEXT:    vl1re32.v v17, (a7)
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    li a3, 14
 ; RV64-NEXT:    mul a0, a0, a3
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 64
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v12, (a6)
+; RV64-NEXT:    vl1re32.v v12, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re16.v v13, (a6)
+; RV64-NEXT:    vl1re32.v v13, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, a0, a2
-; RV64-NEXT:    vl1re16.v v14, (a6)
-; RV64-NEXT:    vl1re16.v v15, (a1)
+; RV64-NEXT:    vl1re32.v v14, (a6)
+; RV64-NEXT:    vl1re32.v v15, (a1)
 ; RV64-NEXT:    add a5, a0, a5
 ; RV64-NEXT:    vs2r.v v20, (a5)
 ; RV64-NEXT:    vs4r.v v16, (a2)
 ; RV64-NEXT:    vs8r.v v8, (a0)
-; RV64-NEXT:    vl8re16.v v16, (a2)
-; RV64-NEXT:    vl8re16.v v8, (a0)
+; RV64-NEXT:    vl8re32.v v16, (a2)
+; RV64-NEXT:    vl8re32.v v8, (a0)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 80
 ; RV64-NEXT:    ret
 ;
-; ZVBB-RV32-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
+; ZVBB-RV32-LABEL: vector_interleave_nxv28f32_nxv4f32:
 ; ZVBB-RV32:       # %bb.0:
 ; ZVBB-RV32-NEXT:    addi sp, sp, -80
 ; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
@@ -7636,7 +13204,7 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
 ; ZVBB-RV32-NEXT:    slli a0, a0, 5
 ; ZVBB-RV32-NEXT:    sub sp, sp, a0
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
-; ZVBB-RV32-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; ZVBB-RV32-NEXT:    vmv2r.v v26, v20
 ; ZVBB-RV32-NEXT:    addi a0, sp, 64
 ; ZVBB-RV32-NEXT:    vmv2r.v v24, v16
@@ -7664,51 +13232,51 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
 ; ZVBB-RV32-NEXT:    vmv1r.v v22, v11
 ; ZVBB-RV32-NEXT:    add a6, a7, a2
 ; ZVBB-RV32-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV32-NEXT:    vsseg7e16.v v1, (a0)
+; ZVBB-RV32-NEXT:    vsseg7e32.v v1, (a0)
 ; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV32-NEXT:    vsseg7e16.v v21, (a1)
-; ZVBB-RV32-NEXT:    vl1re16.v v18, (a6)
+; ZVBB-RV32-NEXT:    vsseg7e32.v v21, (a1)
+; ZVBB-RV32-NEXT:    vl1re32.v v18, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v19, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v19, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v20, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v20, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v21, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v21, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a3, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v10, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v10, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v11, (a6)
-; ZVBB-RV32-NEXT:    vl1re16.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl1re16.v v16, (a4)
-; ZVBB-RV32-NEXT:    vl1re16.v v9, (a3)
-; ZVBB-RV32-NEXT:    vl1re16.v v17, (a7)
+; ZVBB-RV32-NEXT:    vl1re32.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re32.v v16, (a4)
+; ZVBB-RV32-NEXT:    vl1re32.v v9, (a3)
+; ZVBB-RV32-NEXT:    vl1re32.v v17, (a7)
 ; ZVBB-RV32-NEXT:    csrr a0, vlenb
 ; ZVBB-RV32-NEXT:    li a3, 14
 ; ZVBB-RV32-NEXT:    mul a0, a0, a3
 ; ZVBB-RV32-NEXT:    add a0, sp, a0
 ; ZVBB-RV32-NEXT:    addi a0, a0, 64
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v12, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v12, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v13, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
 ; ZVBB-RV32-NEXT:    add a2, a0, a2
-; ZVBB-RV32-NEXT:    vl1re16.v v14, (a6)
-; ZVBB-RV32-NEXT:    vl1re16.v v15, (a1)
+; ZVBB-RV32-NEXT:    vl1re32.v v14, (a6)
+; ZVBB-RV32-NEXT:    vl1re32.v v15, (a1)
 ; ZVBB-RV32-NEXT:    add a5, a0, a5
 ; ZVBB-RV32-NEXT:    vs2r.v v20, (a5)
 ; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
 ; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl8re16.v v16, (a2)
-; ZVBB-RV32-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8re32.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8re32.v v8, (a0)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
 ; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    addi sp, sp, 80
 ; ZVBB-RV32-NEXT:    ret
 ;
-; ZVBB-RV64-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
+; ZVBB-RV64-LABEL: vector_interleave_nxv28f32_nxv4f32:
 ; ZVBB-RV64:       # %bb.0:
 ; ZVBB-RV64-NEXT:    addi sp, sp, -80
 ; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -7718,7 +13286,7 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
 ; ZVBB-RV64-NEXT:    slli a0, a0, 5
 ; ZVBB-RV64-NEXT:    sub sp, sp, a0
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
-; ZVBB-RV64-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; ZVBB-RV64-NEXT:    vmv2r.v v26, v20
 ; ZVBB-RV64-NEXT:    addi a0, sp, 64
 ; ZVBB-RV64-NEXT:    vmv2r.v v24, v16
@@ -7746,51 +13314,51 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
 ; ZVBB-RV64-NEXT:    vmv1r.v v22, v11
 ; ZVBB-RV64-NEXT:    add a6, a7, a2
 ; ZVBB-RV64-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV64-NEXT:    vsseg7e16.v v1, (a0)
+; ZVBB-RV64-NEXT:    vsseg7e32.v v1, (a0)
 ; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV64-NEXT:    vsseg7e16.v v21, (a1)
-; ZVBB-RV64-NEXT:    vl1re16.v v18, (a6)
+; ZVBB-RV64-NEXT:    vsseg7e32.v v21, (a1)
+; ZVBB-RV64-NEXT:    vl1re32.v v18, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v19, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v19, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v20, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v20, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v21, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v21, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a3, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v10, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v10, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v11, (a6)
-; ZVBB-RV64-NEXT:    vl1re16.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl1re16.v v16, (a4)
-; ZVBB-RV64-NEXT:    vl1re16.v v9, (a3)
-; ZVBB-RV64-NEXT:    vl1re16.v v17, (a7)
+; ZVBB-RV64-NEXT:    vl1re32.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re32.v v16, (a4)
+; ZVBB-RV64-NEXT:    vl1re32.v v9, (a3)
+; ZVBB-RV64-NEXT:    vl1re32.v v17, (a7)
 ; ZVBB-RV64-NEXT:    csrr a0, vlenb
 ; ZVBB-RV64-NEXT:    li a3, 14
 ; ZVBB-RV64-NEXT:    mul a0, a0, a3
 ; ZVBB-RV64-NEXT:    add a0, sp, a0
 ; ZVBB-RV64-NEXT:    addi a0, a0, 64
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v12, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v12, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v13, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
 ; ZVBB-RV64-NEXT:    add a2, a0, a2
-; ZVBB-RV64-NEXT:    vl1re16.v v14, (a6)
-; ZVBB-RV64-NEXT:    vl1re16.v v15, (a1)
+; ZVBB-RV64-NEXT:    vl1re32.v v14, (a6)
+; ZVBB-RV64-NEXT:    vl1re32.v v15, (a1)
 ; ZVBB-RV64-NEXT:    add a5, a0, a5
 ; ZVBB-RV64-NEXT:    vs2r.v v20, (a5)
 ; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
 ; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl8re16.v v16, (a2)
-; ZVBB-RV64-NEXT:    vl8re16.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8re32.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8re32.v v8, (a0)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
 ; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    addi sp, sp, 80
 ; ZVBB-RV64-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv56bf16_nxv8bf16:
+; ZIP-LABEL: vector_interleave_nxv28f32_nxv4f32:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -80
 ; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -7800,7 +13368,7 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
 ; ZIP-NEXT:    slli a0, a0, 5
 ; ZIP-NEXT:    sub sp, sp, a0
 ; ZIP-NEXT:    andi sp, sp, -64
-; ZIP-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZIP-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
 ; ZIP-NEXT:    vmv2r.v v26, v20
 ; ZIP-NEXT:    addi a0, sp, 64
 ; ZIP-NEXT:    vmv2r.v v24, v16
@@ -7828,139 +13396,55 @@ define <vscale x 56 x bfloat> @vector_interleave_nxv56bf16_nxv8bf16(<vscale x 8
 ; ZIP-NEXT:    vmv1r.v v22, v11
 ; ZIP-NEXT:    add a6, a7, a2
 ; ZIP-NEXT:    vmv1r.v v24, v15
-; ZIP-NEXT:    vsseg7e16.v v1, (a0)
+; ZIP-NEXT:    vsseg7e32.v v1, (a0)
 ; ZIP-NEXT:    vmv1r.v v26, v19
-; ZIP-NEXT:    vsseg7e16.v v21, (a1)
-; ZIP-NEXT:    vl1re16.v v18, (a6)
+; ZIP-NEXT:    vsseg7e32.v v21, (a1)
+; ZIP-NEXT:    vl1re32.v v18, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v19, (a6)
+; ZIP-NEXT:    vl1re32.v v19, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v20, (a6)
+; ZIP-NEXT:    vl1re32.v v20, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v21, (a6)
+; ZIP-NEXT:    vl1re32.v v21, (a6)
 ; ZIP-NEXT:    add a6, a3, a2
-; ZIP-NEXT:    vl1re16.v v10, (a6)
+; ZIP-NEXT:    vl1re32.v v10, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v11, (a6)
-; ZIP-NEXT:    vl1re16.v v8, (a0)
-; ZIP-NEXT:    vl1re16.v v16, (a4)
-; ZIP-NEXT:    vl1re16.v v9, (a3)
-; ZIP-NEXT:    vl1re16.v v17, (a7)
+; ZIP-NEXT:    vl1re32.v v11, (a6)
+; ZIP-NEXT:    vl1re32.v v8, (a0)
+; ZIP-NEXT:    vl1re32.v v16, (a4)
+; ZIP-NEXT:    vl1re32.v v9, (a3)
+; ZIP-NEXT:    vl1re32.v v17, (a7)
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    li a3, 14
 ; ZIP-NEXT:    mul a0, a0, a3
 ; ZIP-NEXT:    add a0, sp, a0
 ; ZIP-NEXT:    addi a0, a0, 64
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v12, (a6)
+; ZIP-NEXT:    vl1re32.v v12, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re16.v v13, (a6)
+; ZIP-NEXT:    vl1re32.v v13, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
 ; ZIP-NEXT:    slli a2, a2, 3
 ; ZIP-NEXT:    add a2, a0, a2
-; ZIP-NEXT:    vl1re16.v v14, (a6)
-; ZIP-NEXT:    vl1re16.v v15, (a1)
+; ZIP-NEXT:    vl1re32.v v14, (a6)
+; ZIP-NEXT:    vl1re32.v v15, (a1)
 ; ZIP-NEXT:    add a5, a0, a5
 ; ZIP-NEXT:    vs2r.v v20, (a5)
 ; ZIP-NEXT:    vs4r.v v16, (a2)
 ; ZIP-NEXT:    vs8r.v v8, (a0)
-; ZIP-NEXT:    vl8re16.v v16, (a2)
-; ZIP-NEXT:    vl8re16.v v8, (a0)
+; ZIP-NEXT:    vl8re32.v v16, (a2)
+; ZIP-NEXT:    vl8re32.v v8, (a0)
 ; ZIP-NEXT:    addi sp, s0, -80
 ; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    addi sp, sp, 80
 ; ZIP-NEXT:    ret
-  %res = call <vscale x 56 x bfloat> @llvm.vector.interleave7.nxv56bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5, <vscale x 8 x bfloat> %v6)
-  ret <vscale x 56 x bfloat> %res
-}
-
-define <vscale x 7 x float> @vector_interleave_nxv7f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5, <vscale x 1 x float> %v6) nounwind {
-; CHECK-LABEL: vector_interleave_nxv7f32_nxv1f32:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    addi a0, sp, 16
-; CHECK-NEXT:    csrr a1, vlenb
-; CHECK-NEXT:    srli a2, a1, 1
-; CHECK-NEXT:    srli a1, a1, 3
-; CHECK-NEXT:    add a3, a0, a2
-; CHECK-NEXT:    add a4, a3, a2
-; CHECK-NEXT:    add a5, a4, a2
-; CHECK-NEXT:    add a6, a5, a2
-; CHECK-NEXT:    vsetvli a7, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vsseg7e32.v v8, (a0)
-; CHECK-NEXT:    add a7, a6, a2
-; CHECK-NEXT:    vle32.v v8, (a7)
-; CHECK-NEXT:    vle32.v v10, (a6)
-; CHECK-NEXT:    add a6, a1, a1
-; CHECK-NEXT:    add a2, a7, a2
-; CHECK-NEXT:    vle32.v v12, (a5)
-; CHECK-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v10, v8, a1
-; CHECK-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v11, (a2)
-; CHECK-NEXT:    vle32.v v9, (a4)
-; CHECK-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v9, v12, a1
-; CHECK-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
-; CHECK-NEXT:    vle32.v v12, (a3)
-; CHECK-NEXT:    vle32.v v8, (a0)
-; CHECK-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vx v8, v12, a1
-; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a0, a0, 2
-; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    ret
-;
-; ZVBB-LABEL: vector_interleave_nxv7f32_nxv1f32:
-; ZVBB:       # %bb.0:
-; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a0, a0, 2
-; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    addi a0, sp, 16
-; ZVBB-NEXT:    csrr a1, vlenb
-; ZVBB-NEXT:    srli a2, a1, 1
-; ZVBB-NEXT:    srli a1, a1, 3
-; ZVBB-NEXT:    add a3, a0, a2
-; ZVBB-NEXT:    add a4, a3, a2
-; ZVBB-NEXT:    add a5, a4, a2
-; ZVBB-NEXT:    add a6, a5, a2
-; ZVBB-NEXT:    vsetvli a7, zero, e32, mf2, ta, ma
-; ZVBB-NEXT:    vsseg7e32.v v8, (a0)
-; ZVBB-NEXT:    add a7, a6, a2
-; ZVBB-NEXT:    vle32.v v8, (a7)
-; ZVBB-NEXT:    vle32.v v10, (a6)
-; ZVBB-NEXT:    add a6, a1, a1
-; ZVBB-NEXT:    add a2, a7, a2
-; ZVBB-NEXT:    vle32.v v12, (a5)
-; ZVBB-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v10, v8, a1
-; ZVBB-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
-; ZVBB-NEXT:    vle32.v v11, (a2)
-; ZVBB-NEXT:    vle32.v v9, (a4)
-; ZVBB-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v9, v12, a1
-; ZVBB-NEXT:    vsetvli a2, zero, e32, mf2, ta, ma
-; ZVBB-NEXT:    vle32.v v12, (a3)
-; ZVBB-NEXT:    vle32.v v8, (a0)
-; ZVBB-NEXT:    vsetvli zero, a6, e32, m1, ta, ma
-; ZVBB-NEXT:    vslideup.vx v8, v12, a1
-; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a0, a0, 2
-; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    ret
-  %res = call <vscale x 7 x float> @llvm.vector.interleave7.nxv7f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5, <vscale x 1 x float> %v6)
-  ret <vscale x 7 x float> %res
+  %res = call <vscale x 28 x float> @llvm.vector.interleave7.nxv28f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5, <vscale x 4 x float> %v6)
+  ret <vscale x 28 x float> %res
 }
 
-define <vscale x 14 x float> @vector_interleave_nxv14f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5, <vscale x 2 x float> %v6) nounwind {
-; CHECK-LABEL: vector_interleave_nxv14f32_nxv2f32:
+define <vscale x 7 x double> @vector_interleave_nxv7f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5, <vscale x 1 x double> %v6) nounwind {
+; CHECK-LABEL: vector_interleave_nxv7f64_nxv1f64:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
@@ -7971,19 +13455,19 @@ define <vscale x 14 x float> @vector_interleave_nxv14f32_nxv2f32(<vscale x 2 x f
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    add a3, a2, a1
-; CHECK-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
-; CHECK-NEXT:    vsseg7e32.v v8, (a0)
-; CHECK-NEXT:    vl1re32.v v10, (a3)
+; CHECK-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsseg7e64.v v8, (a0)
+; CHECK-NEXT:    vl1re64.v v10, (a3)
 ; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    vl1re32.v v11, (a3)
+; CHECK-NEXT:    vl1re64.v v11, (a3)
 ; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    vl1re32.v v8, (a0)
+; CHECK-NEXT:    vl1re64.v v8, (a0)
 ; CHECK-NEXT:    add a0, a3, a1
-; CHECK-NEXT:    vl1re32.v v9, (a2)
-; CHECK-NEXT:    vl1re32.v v12, (a3)
-; CHECK-NEXT:    vl1re32.v v13, (a0)
+; CHECK-NEXT:    vl1re64.v v9, (a2)
+; CHECK-NEXT:    vl1re64.v v12, (a3)
+; CHECK-NEXT:    vl1re64.v v13, (a0)
 ; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl1re32.v v14, (a0)
+; CHECK-NEXT:    vl1re64.v v14, (a0)
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a1, a0, 3
 ; CHECK-NEXT:    sub a0, a1, a0
@@ -7991,7 +13475,7 @@ define <vscale x 14 x float> @vector_interleave_nxv14f32_nxv2f32(<vscale x 2 x f
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv14f32_nxv2f32:
+; ZVBB-LABEL: vector_interleave_nxv7f64_nxv1f64:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
@@ -8002,31 +13486,31 @@ define <vscale x 14 x float> @vector_interleave_nxv14f32_nxv2f32(<vscale x 2 x f
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    add a2, a0, a1
 ; ZVBB-NEXT:    add a3, a2, a1
-; ZVBB-NEXT:    vsetvli a4, zero, e32, m1, ta, ma
-; ZVBB-NEXT:    vsseg7e32.v v8, (a0)
-; ZVBB-NEXT:    vl1re32.v v10, (a3)
+; ZVBB-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
+; ZVBB-NEXT:    vsseg7e64.v v8, (a0)
+; ZVBB-NEXT:    vl1re64.v v10, (a3)
 ; ZVBB-NEXT:    add a3, a3, a1
-; ZVBB-NEXT:    vl1re32.v v11, (a3)
+; ZVBB-NEXT:    vl1re64.v v11, (a3)
 ; ZVBB-NEXT:    add a3, a3, a1
-; ZVBB-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-NEXT:    vl1re64.v v8, (a0)
 ; ZVBB-NEXT:    add a0, a3, a1
-; ZVBB-NEXT:    vl1re32.v v9, (a2)
-; ZVBB-NEXT:    vl1re32.v v12, (a3)
-; ZVBB-NEXT:    vl1re32.v v13, (a0)
+; ZVBB-NEXT:    vl1re64.v v9, (a2)
+; ZVBB-NEXT:    vl1re64.v v12, (a3)
+; ZVBB-NEXT:    vl1re64.v v13, (a0)
 ; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl1re32.v v14, (a0)
+; ZVBB-NEXT:    vl1re64.v v14, (a0)
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a1, a0, 3
 ; ZVBB-NEXT:    sub a0, a1, a0
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 14 x float> @llvm.vector.interleave7.nxv14f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5, <vscale x 2 x float> %v6)
-  ret <vscale x 14 x float> %res
+  %res = call <vscale x 7 x double> @llvm.vector.interleave7.nxv7f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5, <vscale x 1 x double> %v6)
+  ret <vscale x 7 x double> %res
 }
 
-define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5, <vscale x 4 x float> %v6) nounwind {
-; RV32-LABEL: vector_interleave_nxv28f32_nxv4f32:
+define <vscale x 14 x double> @vector_interleave_nxv14f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5, <vscale x 2 x double> %v6) nounwind {
+; RV32-LABEL: vector_interleave_nxv14f64_nxv2f64:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -80
 ; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
@@ -8036,7 +13520,7 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
 ; RV32-NEXT:    slli a0, a0, 5
 ; RV32-NEXT:    sub sp, sp, a0
 ; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV32-NEXT:    vmv2r.v v26, v20
 ; RV32-NEXT:    addi a0, sp, 64
 ; RV32-NEXT:    vmv2r.v v24, v16
@@ -8064,51 +13548,51 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
 ; RV32-NEXT:    vmv1r.v v22, v11
 ; RV32-NEXT:    add a6, a7, a2
 ; RV32-NEXT:    vmv1r.v v24, v15
-; RV32-NEXT:    vsseg7e32.v v1, (a0)
+; RV32-NEXT:    vsseg7e64.v v1, (a0)
 ; RV32-NEXT:    vmv1r.v v26, v19
-; RV32-NEXT:    vsseg7e32.v v21, (a1)
-; RV32-NEXT:    vl1re32.v v18, (a6)
+; RV32-NEXT:    vsseg7e64.v v21, (a1)
+; RV32-NEXT:    vl1re64.v v18, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v19, (a6)
+; RV32-NEXT:    vl1re64.v v19, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v20, (a6)
+; RV32-NEXT:    vl1re64.v v20, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v21, (a6)
+; RV32-NEXT:    vl1re64.v v21, (a6)
 ; RV32-NEXT:    add a6, a3, a2
-; RV32-NEXT:    vl1re32.v v10, (a6)
+; RV32-NEXT:    vl1re64.v v10, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v11, (a6)
-; RV32-NEXT:    vl1re32.v v8, (a0)
-; RV32-NEXT:    vl1re32.v v16, (a4)
-; RV32-NEXT:    vl1re32.v v9, (a3)
-; RV32-NEXT:    vl1re32.v v17, (a7)
+; RV32-NEXT:    vl1re64.v v11, (a6)
+; RV32-NEXT:    vl1re64.v v8, (a0)
+; RV32-NEXT:    vl1re64.v v16, (a4)
+; RV32-NEXT:    vl1re64.v v9, (a3)
+; RV32-NEXT:    vl1re64.v v17, (a7)
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    li a3, 14
 ; RV32-NEXT:    mul a0, a0, a3
 ; RV32-NEXT:    add a0, sp, a0
 ; RV32-NEXT:    addi a0, a0, 64
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v12, (a6)
+; RV32-NEXT:    vl1re64.v v12, (a6)
 ; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re32.v v13, (a6)
+; RV32-NEXT:    vl1re64.v v13, (a6)
 ; RV32-NEXT:    add a6, a6, a2
 ; RV32-NEXT:    slli a2, a2, 3
 ; RV32-NEXT:    add a2, a0, a2
-; RV32-NEXT:    vl1re32.v v14, (a6)
-; RV32-NEXT:    vl1re32.v v15, (a1)
+; RV32-NEXT:    vl1re64.v v14, (a6)
+; RV32-NEXT:    vl1re64.v v15, (a1)
 ; RV32-NEXT:    add a5, a0, a5
 ; RV32-NEXT:    vs2r.v v20, (a5)
 ; RV32-NEXT:    vs4r.v v16, (a2)
 ; RV32-NEXT:    vs8r.v v8, (a0)
-; RV32-NEXT:    vl8re32.v v16, (a2)
-; RV32-NEXT:    vl8re32.v v8, (a0)
+; RV32-NEXT:    vl8re64.v v16, (a2)
+; RV32-NEXT:    vl8re64.v v8, (a0)
 ; RV32-NEXT:    addi sp, s0, -80
 ; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    addi sp, sp, 80
 ; RV32-NEXT:    ret
 ;
-; RV64-LABEL: vector_interleave_nxv28f32_nxv4f32:
+; RV64-LABEL: vector_interleave_nxv14f64_nxv2f64:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -80
 ; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -8118,7 +13602,7 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
 ; RV64-NEXT:    slli a0, a0, 5
 ; RV64-NEXT:    sub sp, sp, a0
 ; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; RV64-NEXT:    vmv2r.v v26, v20
 ; RV64-NEXT:    addi a0, sp, 64
 ; RV64-NEXT:    vmv2r.v v24, v16
@@ -8146,51 +13630,51 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
 ; RV64-NEXT:    vmv1r.v v22, v11
 ; RV64-NEXT:    add a6, a7, a2
 ; RV64-NEXT:    vmv1r.v v24, v15
-; RV64-NEXT:    vsseg7e32.v v1, (a0)
+; RV64-NEXT:    vsseg7e64.v v1, (a0)
 ; RV64-NEXT:    vmv1r.v v26, v19
-; RV64-NEXT:    vsseg7e32.v v21, (a1)
-; RV64-NEXT:    vl1re32.v v18, (a6)
+; RV64-NEXT:    vsseg7e64.v v21, (a1)
+; RV64-NEXT:    vl1re64.v v18, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v19, (a6)
+; RV64-NEXT:    vl1re64.v v19, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v20, (a6)
+; RV64-NEXT:    vl1re64.v v20, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v21, (a6)
+; RV64-NEXT:    vl1re64.v v21, (a6)
 ; RV64-NEXT:    add a6, a3, a2
-; RV64-NEXT:    vl1re32.v v10, (a6)
+; RV64-NEXT:    vl1re64.v v10, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v11, (a6)
-; RV64-NEXT:    vl1re32.v v8, (a0)
-; RV64-NEXT:    vl1re32.v v16, (a4)
-; RV64-NEXT:    vl1re32.v v9, (a3)
-; RV64-NEXT:    vl1re32.v v17, (a7)
+; RV64-NEXT:    vl1re64.v v11, (a6)
+; RV64-NEXT:    vl1re64.v v8, (a0)
+; RV64-NEXT:    vl1re64.v v16, (a4)
+; RV64-NEXT:    vl1re64.v v9, (a3)
+; RV64-NEXT:    vl1re64.v v17, (a7)
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    li a3, 14
 ; RV64-NEXT:    mul a0, a0, a3
 ; RV64-NEXT:    add a0, sp, a0
 ; RV64-NEXT:    addi a0, a0, 64
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v12, (a6)
+; RV64-NEXT:    vl1re64.v v12, (a6)
 ; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re32.v v13, (a6)
+; RV64-NEXT:    vl1re64.v v13, (a6)
 ; RV64-NEXT:    add a6, a6, a2
 ; RV64-NEXT:    slli a2, a2, 3
 ; RV64-NEXT:    add a2, a0, a2
-; RV64-NEXT:    vl1re32.v v14, (a6)
-; RV64-NEXT:    vl1re32.v v15, (a1)
+; RV64-NEXT:    vl1re64.v v14, (a6)
+; RV64-NEXT:    vl1re64.v v15, (a1)
 ; RV64-NEXT:    add a5, a0, a5
 ; RV64-NEXT:    vs2r.v v20, (a5)
 ; RV64-NEXT:    vs4r.v v16, (a2)
 ; RV64-NEXT:    vs8r.v v8, (a0)
-; RV64-NEXT:    vl8re32.v v16, (a2)
-; RV64-NEXT:    vl8re32.v v8, (a0)
+; RV64-NEXT:    vl8re64.v v16, (a2)
+; RV64-NEXT:    vl8re64.v v8, (a0)
 ; RV64-NEXT:    addi sp, s0, -80
 ; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    addi sp, sp, 80
 ; RV64-NEXT:    ret
 ;
-; ZVBB-RV32-LABEL: vector_interleave_nxv28f32_nxv4f32:
+; ZVBB-RV32-LABEL: vector_interleave_nxv14f64_nxv2f64:
 ; ZVBB-RV32:       # %bb.0:
 ; ZVBB-RV32-NEXT:    addi sp, sp, -80
 ; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
@@ -8200,7 +13684,7 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
 ; ZVBB-RV32-NEXT:    slli a0, a0, 5
 ; ZVBB-RV32-NEXT:    sub sp, sp, a0
 ; ZVBB-RV32-NEXT:    andi sp, sp, -64
-; ZVBB-RV32-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; ZVBB-RV32-NEXT:    vmv2r.v v26, v20
 ; ZVBB-RV32-NEXT:    addi a0, sp, 64
 ; ZVBB-RV32-NEXT:    vmv2r.v v24, v16
@@ -8228,51 +13712,51 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
 ; ZVBB-RV32-NEXT:    vmv1r.v v22, v11
 ; ZVBB-RV32-NEXT:    add a6, a7, a2
 ; ZVBB-RV32-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV32-NEXT:    vsseg7e32.v v1, (a0)
+; ZVBB-RV32-NEXT:    vsseg7e64.v v1, (a0)
 ; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV32-NEXT:    vsseg7e32.v v21, (a1)
-; ZVBB-RV32-NEXT:    vl1re32.v v18, (a6)
+; ZVBB-RV32-NEXT:    vsseg7e64.v v21, (a1)
+; ZVBB-RV32-NEXT:    vl1re64.v v18, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v19, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v19, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v20, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v20, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v21, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v21, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a3, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v10, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v10, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v11, (a6)
-; ZVBB-RV32-NEXT:    vl1re32.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl1re32.v v16, (a4)
-; ZVBB-RV32-NEXT:    vl1re32.v v9, (a3)
-; ZVBB-RV32-NEXT:    vl1re32.v v17, (a7)
+; ZVBB-RV32-NEXT:    vl1re64.v v11, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl1re64.v v16, (a4)
+; ZVBB-RV32-NEXT:    vl1re64.v v9, (a3)
+; ZVBB-RV32-NEXT:    vl1re64.v v17, (a7)
 ; ZVBB-RV32-NEXT:    csrr a0, vlenb
 ; ZVBB-RV32-NEXT:    li a3, 14
 ; ZVBB-RV32-NEXT:    mul a0, a0, a3
 ; ZVBB-RV32-NEXT:    add a0, sp, a0
 ; ZVBB-RV32-NEXT:    addi a0, a0, 64
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v12, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v12, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v13, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v13, (a6)
 ; ZVBB-RV32-NEXT:    add a6, a6, a2
 ; ZVBB-RV32-NEXT:    slli a2, a2, 3
 ; ZVBB-RV32-NEXT:    add a2, a0, a2
-; ZVBB-RV32-NEXT:    vl1re32.v v14, (a6)
-; ZVBB-RV32-NEXT:    vl1re32.v v15, (a1)
+; ZVBB-RV32-NEXT:    vl1re64.v v14, (a6)
+; ZVBB-RV32-NEXT:    vl1re64.v v15, (a1)
 ; ZVBB-RV32-NEXT:    add a5, a0, a5
 ; ZVBB-RV32-NEXT:    vs2r.v v20, (a5)
 ; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
 ; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl8re32.v v16, (a2)
-; ZVBB-RV32-NEXT:    vl8re32.v v8, (a0)
+; ZVBB-RV32-NEXT:    vl8re64.v v16, (a2)
+; ZVBB-RV32-NEXT:    vl8re64.v v8, (a0)
 ; ZVBB-RV32-NEXT:    addi sp, s0, -80
 ; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
 ; ZVBB-RV32-NEXT:    addi sp, sp, 80
 ; ZVBB-RV32-NEXT:    ret
 ;
-; ZVBB-RV64-LABEL: vector_interleave_nxv28f32_nxv4f32:
+; ZVBB-RV64-LABEL: vector_interleave_nxv14f64_nxv2f64:
 ; ZVBB-RV64:       # %bb.0:
 ; ZVBB-RV64-NEXT:    addi sp, sp, -80
 ; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -8282,7 +13766,7 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
 ; ZVBB-RV64-NEXT:    slli a0, a0, 5
 ; ZVBB-RV64-NEXT:    sub sp, sp, a0
 ; ZVBB-RV64-NEXT:    andi sp, sp, -64
-; ZVBB-RV64-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; ZVBB-RV64-NEXT:    vmv2r.v v26, v20
 ; ZVBB-RV64-NEXT:    addi a0, sp, 64
 ; ZVBB-RV64-NEXT:    vmv2r.v v24, v16
@@ -8310,51 +13794,51 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
 ; ZVBB-RV64-NEXT:    vmv1r.v v22, v11
 ; ZVBB-RV64-NEXT:    add a6, a7, a2
 ; ZVBB-RV64-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV64-NEXT:    vsseg7e32.v v1, (a0)
+; ZVBB-RV64-NEXT:    vsseg7e64.v v1, (a0)
 ; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV64-NEXT:    vsseg7e32.v v21, (a1)
-; ZVBB-RV64-NEXT:    vl1re32.v v18, (a6)
+; ZVBB-RV64-NEXT:    vsseg7e64.v v21, (a1)
+; ZVBB-RV64-NEXT:    vl1re64.v v18, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v19, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v19, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v20, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v20, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v21, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v21, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a3, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v10, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v10, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v11, (a6)
-; ZVBB-RV64-NEXT:    vl1re32.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl1re32.v v16, (a4)
-; ZVBB-RV64-NEXT:    vl1re32.v v9, (a3)
-; ZVBB-RV64-NEXT:    vl1re32.v v17, (a7)
+; ZVBB-RV64-NEXT:    vl1re64.v v11, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl1re64.v v16, (a4)
+; ZVBB-RV64-NEXT:    vl1re64.v v9, (a3)
+; ZVBB-RV64-NEXT:    vl1re64.v v17, (a7)
 ; ZVBB-RV64-NEXT:    csrr a0, vlenb
 ; ZVBB-RV64-NEXT:    li a3, 14
 ; ZVBB-RV64-NEXT:    mul a0, a0, a3
 ; ZVBB-RV64-NEXT:    add a0, sp, a0
 ; ZVBB-RV64-NEXT:    addi a0, a0, 64
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v12, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v12, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v13, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v13, (a6)
 ; ZVBB-RV64-NEXT:    add a6, a6, a2
 ; ZVBB-RV64-NEXT:    slli a2, a2, 3
 ; ZVBB-RV64-NEXT:    add a2, a0, a2
-; ZVBB-RV64-NEXT:    vl1re32.v v14, (a6)
-; ZVBB-RV64-NEXT:    vl1re32.v v15, (a1)
+; ZVBB-RV64-NEXT:    vl1re64.v v14, (a6)
+; ZVBB-RV64-NEXT:    vl1re64.v v15, (a1)
 ; ZVBB-RV64-NEXT:    add a5, a0, a5
 ; ZVBB-RV64-NEXT:    vs2r.v v20, (a5)
 ; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
 ; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl8re32.v v16, (a2)
-; ZVBB-RV64-NEXT:    vl8re32.v v8, (a0)
+; ZVBB-RV64-NEXT:    vl8re64.v v16, (a2)
+; ZVBB-RV64-NEXT:    vl8re64.v v8, (a0)
 ; ZVBB-RV64-NEXT:    addi sp, s0, -80
 ; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZVBB-RV64-NEXT:    addi sp, sp, 80
 ; ZVBB-RV64-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv28f32_nxv4f32:
+; ZIP-LABEL: vector_interleave_nxv14f64_nxv2f64:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -80
 ; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
@@ -8364,7 +13848,7 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
 ; ZIP-NEXT:    slli a0, a0, 5
 ; ZIP-NEXT:    sub sp, sp, a0
 ; ZIP-NEXT:    andi sp, sp, -64
-; ZIP-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZIP-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
 ; ZIP-NEXT:    vmv2r.v v26, v20
 ; ZIP-NEXT:    addi a0, sp, 64
 ; ZIP-NEXT:    vmv2r.v v24, v16
@@ -8392,529 +13876,1147 @@ define <vscale x 28 x float> @vector_interleave_nxv28f32_nxv4f32(<vscale x 4 x f
 ; ZIP-NEXT:    vmv1r.v v22, v11
 ; ZIP-NEXT:    add a6, a7, a2
 ; ZIP-NEXT:    vmv1r.v v24, v15
-; ZIP-NEXT:    vsseg7e32.v v1, (a0)
+; ZIP-NEXT:    vsseg7e64.v v1, (a0)
 ; ZIP-NEXT:    vmv1r.v v26, v19
-; ZIP-NEXT:    vsseg7e32.v v21, (a1)
-; ZIP-NEXT:    vl1re32.v v18, (a6)
+; ZIP-NEXT:    vsseg7e64.v v21, (a1)
+; ZIP-NEXT:    vl1re64.v v18, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re32.v v19, (a6)
+; ZIP-NEXT:    vl1re64.v v19, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re32.v v20, (a6)
+; ZIP-NEXT:    vl1re64.v v20, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re32.v v21, (a6)
+; ZIP-NEXT:    vl1re64.v v21, (a6)
 ; ZIP-NEXT:    add a6, a3, a2
-; ZIP-NEXT:    vl1re32.v v10, (a6)
+; ZIP-NEXT:    vl1re64.v v10, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re32.v v11, (a6)
-; ZIP-NEXT:    vl1re32.v v8, (a0)
-; ZIP-NEXT:    vl1re32.v v16, (a4)
-; ZIP-NEXT:    vl1re32.v v9, (a3)
-; ZIP-NEXT:    vl1re32.v v17, (a7)
+; ZIP-NEXT:    vl1re64.v v11, (a6)
+; ZIP-NEXT:    vl1re64.v v8, (a0)
+; ZIP-NEXT:    vl1re64.v v16, (a4)
+; ZIP-NEXT:    vl1re64.v v9, (a3)
+; ZIP-NEXT:    vl1re64.v v17, (a7)
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    li a3, 14
 ; ZIP-NEXT:    mul a0, a0, a3
 ; ZIP-NEXT:    add a0, sp, a0
 ; ZIP-NEXT:    addi a0, a0, 64
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re32.v v12, (a6)
+; ZIP-NEXT:    vl1re64.v v12, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re32.v v13, (a6)
+; ZIP-NEXT:    vl1re64.v v13, (a6)
 ; ZIP-NEXT:    add a6, a6, a2
 ; ZIP-NEXT:    slli a2, a2, 3
 ; ZIP-NEXT:    add a2, a0, a2
-; ZIP-NEXT:    vl1re32.v v14, (a6)
-; ZIP-NEXT:    vl1re32.v v15, (a1)
+; ZIP-NEXT:    vl1re64.v v14, (a6)
+; ZIP-NEXT:    vl1re64.v v15, (a1)
 ; ZIP-NEXT:    add a5, a0, a5
 ; ZIP-NEXT:    vs2r.v v20, (a5)
 ; ZIP-NEXT:    vs4r.v v16, (a2)
 ; ZIP-NEXT:    vs8r.v v8, (a0)
-; ZIP-NEXT:    vl8re32.v v16, (a2)
-; ZIP-NEXT:    vl8re32.v v8, (a0)
+; ZIP-NEXT:    vl8re64.v v16, (a2)
+; ZIP-NEXT:    vl8re64.v v8, (a0)
 ; ZIP-NEXT:    addi sp, s0, -80
 ; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    addi sp, sp, 80
 ; ZIP-NEXT:    ret
-  %res = call <vscale x 28 x float> @llvm.vector.interleave7.nxv28f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5, <vscale x 4 x float> %v6)
-  ret <vscale x 28 x float> %res
+  %res = call <vscale x 14 x double> @llvm.vector.interleave7.nxv14f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5, <vscale x 2 x double> %v6)
+  ret <vscale x 14 x double> %res
+}
+
+define <vscale x 16 x half> @vector_interleave_nxv16f16_nxv2f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5, <vscale x 2 x half> %v6, <vscale x 2 x half> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv16f16_nxv2f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    add a4, a3, a2
+; CHECK-NEXT:    add a5, a4, a2
+; CHECK-NEXT:    add a6, a5, a2
+; CHECK-NEXT:    add a7, a6, a2
+; CHECK-NEXT:    vsetvli t0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsseg8e16.v v8, (a0)
+; CHECK-NEXT:    add t0, a7, a2
+; CHECK-NEXT:    add a2, t0, a2
+; CHECK-NEXT:    vle16.v v11, (t0)
+; CHECK-NEXT:    vle16.v v8, (a2)
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a2, a1, a1
+; CHECK-NEXT:    vle16.v v9, (a7)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v11, v8, a1
+; CHECK-NEXT:    vsetvli a7, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v10, (a6)
+; CHECK-NEXT:    vle16.v v8, (a5)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v9, a1
+; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v9, (a4)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v8, a1
+; CHECK-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v12, (a3)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v12, a1
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16f16_nxv2f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a2, a1, 1
+; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    add a4, a3, a2
+; ZVBB-NEXT:    add a5, a4, a2
+; ZVBB-NEXT:    add a6, a5, a2
+; ZVBB-NEXT:    add a7, a6, a2
+; ZVBB-NEXT:    vsetvli t0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsseg8e16.v v8, (a0)
+; ZVBB-NEXT:    add t0, a7, a2
+; ZVBB-NEXT:    add a2, t0, a2
+; ZVBB-NEXT:    vle16.v v11, (t0)
+; ZVBB-NEXT:    vle16.v v8, (a2)
+; ZVBB-NEXT:    srli a1, a1, 2
+; ZVBB-NEXT:    add a2, a1, a1
+; ZVBB-NEXT:    vle16.v v9, (a7)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v11, v8, a1
+; ZVBB-NEXT:    vsetvli a7, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v10, (a6)
+; ZVBB-NEXT:    vle16.v v8, (a5)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v10, v9, a1
+; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v9, (a4)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v9, v8, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v12, (a3)
+; ZVBB-NEXT:    vle16.v v8, (a0)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v12, a1
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 16 x half> @llvm.vector.interleave8.nxv16f16(<vscale x 2 x half> %v0, <vscale x 2 x half> %v1, <vscale x 2 x half> %v2, <vscale x 2 x half> %v3, <vscale x 2 x half> %v4, <vscale x 2 x half> %v5, <vscale x 2 x half> %v6, <vscale x 2 x half> %v7)
+  ret <vscale x 16 x half> %res
+}
+
+define <vscale x 32 x half> @vector_interleave_nxv32f16_nxv4f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5, <vscale x 4 x half> %v6, <vscale x 4 x half> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv32f16_nxv4f16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    add a4, a3, a1
+; CHECK-NEXT:    add a5, a4, a1
+; CHECK-NEXT:    add a6, a5, a1
+; CHECK-NEXT:    add a7, a6, a1
+; CHECK-NEXT:    vsetvli t0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsseg8e16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v14, (a7)
+; CHECK-NEXT:    add a1, a7, a1
+; CHECK-NEXT:    vl1re16.v v15, (a1)
+; CHECK-NEXT:    vl1re16.v v12, (a5)
+; CHECK-NEXT:    vl1re16.v v13, (a6)
+; CHECK-NEXT:    vl1re16.v v10, (a3)
+; CHECK-NEXT:    vl1re16.v v11, (a4)
+; CHECK-NEXT:    vl1re16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v9, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv32f16_nxv4f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    add a4, a3, a1
+; ZVBB-NEXT:    add a5, a4, a1
+; ZVBB-NEXT:    add a6, a5, a1
+; ZVBB-NEXT:    add a7, a6, a1
+; ZVBB-NEXT:    vsetvli t0, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vsseg8e16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v14, (a7)
+; ZVBB-NEXT:    add a1, a7, a1
+; ZVBB-NEXT:    vl1re16.v v15, (a1)
+; ZVBB-NEXT:    vl1re16.v v12, (a5)
+; ZVBB-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-NEXT:    vl1re16.v v10, (a3)
+; ZVBB-NEXT:    vl1re16.v v11, (a4)
+; ZVBB-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v9, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 32 x half> @llvm.vector.interleave8.nxv32f16(<vscale x 4 x half> %v0, <vscale x 4 x half> %v1, <vscale x 4 x half> %v2, <vscale x 4 x half> %v3, <vscale x 4 x half> %v4, <vscale x 4 x half> %v5, <vscale x 4 x half> %v6, <vscale x 4 x half> %v7)
+  ret <vscale x 32 x half> %res
 }
 
-define <vscale x 7 x double> @vector_interleave_nxv7f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5, <vscale x 1 x double> %v6) nounwind {
-; CHECK-LABEL: vector_interleave_nxv7f64_nxv1f64:
+define <vscale x 64 x half> @vector_interleave_nxv64f16_nxv8f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5, <vscale x 8 x half> %v6, <vscale x 8 x half> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv64f16_nxv8f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 3
-; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v22
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vmv2r.v v26, v18
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    add a2, a0, a3
+; CHECK-NEXT:    add a4, a1, a3
+; CHECK-NEXT:    add a5, a2, a3
+; CHECK-NEXT:    vmv1r.v v1, v8
+; CHECK-NEXT:    vmv2r.v v24, v14
+; CHECK-NEXT:    add a6, a4, a3
+; CHECK-NEXT:    vmv2r.v v22, v10
+; CHECK-NEXT:    vmv1r.v v2, v22
+; CHECK-NEXT:    add a7, a5, a3
+; CHECK-NEXT:    vmv1r.v v3, v12
+; CHECK-NEXT:    add t0, a6, a3
+; CHECK-NEXT:    vmv1r.v v4, v24
+; CHECK-NEXT:    add t1, a7, a3
+; CHECK-NEXT:    vmv1r.v v5, v16
+; CHECK-NEXT:    add t2, t0, a3
+; CHECK-NEXT:    vmv1r.v v6, v26
+; CHECK-NEXT:    add t3, t1, a3
+; CHECK-NEXT:    vmv1r.v v7, v20
+; CHECK-NEXT:    add t4, t2, a3
+; CHECK-NEXT:    vmv1r.v v8, v28
+; CHECK-NEXT:    vmv1r.v v22, v9
+; CHECK-NEXT:    add t5, t3, a3
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    add t6, t4, a3
+; CHECK-NEXT:    vmv1r.v v26, v17
+; CHECK-NEXT:    vsseg8e16.v v1, (a0)
+; CHECK-NEXT:    vmv1r.v v28, v21
+; CHECK-NEXT:    vsseg8e16.v v22, (a1)
+; CHECK-NEXT:    vl1re16.v v14, (t5)
+; CHECK-NEXT:    add t5, t5, a3
+; CHECK-NEXT:    add a3, t6, a3
+; CHECK-NEXT:    vl1re16.v v22, (t6)
+; CHECK-NEXT:    vl1re16.v v15, (t5)
+; CHECK-NEXT:    vl1re16.v v23, (a3)
+; CHECK-NEXT:    vl1re16.v v12, (t1)
+; CHECK-NEXT:    vl1re16.v v20, (t2)
+; CHECK-NEXT:    vl1re16.v v13, (t3)
+; CHECK-NEXT:    vl1re16.v v21, (t4)
+; CHECK-NEXT:    vl1re16.v v10, (a5)
+; CHECK-NEXT:    vl1re16.v v18, (a6)
+; CHECK-NEXT:    vl1re16.v v11, (a7)
+; CHECK-NEXT:    vl1re16.v v19, (t0)
+; CHECK-NEXT:    vl1re16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v16, (a1)
+; CHECK-NEXT:    vl1re16.v v9, (a2)
+; CHECK-NEXT:    vl1re16.v v17, (a4)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64f16_nxv8f16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vmv2r.v v28, v22
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    vmv2r.v v26, v18
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 3
+; ZVBB-NEXT:    add a1, sp, a1
+; ZVBB-NEXT:    addi a1, a1, 16
+; ZVBB-NEXT:    csrr a3, vlenb
+; ZVBB-NEXT:    add a2, a0, a3
+; ZVBB-NEXT:    add a4, a1, a3
+; ZVBB-NEXT:    add a5, a2, a3
+; ZVBB-NEXT:    vmv1r.v v1, v8
+; ZVBB-NEXT:    vmv2r.v v24, v14
+; ZVBB-NEXT:    add a6, a4, a3
+; ZVBB-NEXT:    vmv2r.v v22, v10
+; ZVBB-NEXT:    vmv1r.v v2, v22
+; ZVBB-NEXT:    add a7, a5, a3
+; ZVBB-NEXT:    vmv1r.v v3, v12
+; ZVBB-NEXT:    add t0, a6, a3
+; ZVBB-NEXT:    vmv1r.v v4, v24
+; ZVBB-NEXT:    add t1, a7, a3
+; ZVBB-NEXT:    vmv1r.v v5, v16
+; ZVBB-NEXT:    add t2, t0, a3
+; ZVBB-NEXT:    vmv1r.v v6, v26
+; ZVBB-NEXT:    add t3, t1, a3
+; ZVBB-NEXT:    vmv1r.v v7, v20
+; ZVBB-NEXT:    add t4, t2, a3
+; ZVBB-NEXT:    vmv1r.v v8, v28
+; ZVBB-NEXT:    vmv1r.v v22, v9
+; ZVBB-NEXT:    add t5, t3, a3
+; ZVBB-NEXT:    vmv1r.v v24, v13
+; ZVBB-NEXT:    add t6, t4, a3
+; ZVBB-NEXT:    vmv1r.v v26, v17
+; ZVBB-NEXT:    vsseg8e16.v v1, (a0)
+; ZVBB-NEXT:    vmv1r.v v28, v21
+; ZVBB-NEXT:    vsseg8e16.v v22, (a1)
+; ZVBB-NEXT:    vl1re16.v v14, (t5)
+; ZVBB-NEXT:    add t5, t5, a3
+; ZVBB-NEXT:    add a3, t6, a3
+; ZVBB-NEXT:    vl1re16.v v22, (t6)
+; ZVBB-NEXT:    vl1re16.v v15, (t5)
+; ZVBB-NEXT:    vl1re16.v v23, (a3)
+; ZVBB-NEXT:    vl1re16.v v12, (t1)
+; ZVBB-NEXT:    vl1re16.v v20, (t2)
+; ZVBB-NEXT:    vl1re16.v v13, (t3)
+; ZVBB-NEXT:    vl1re16.v v21, (t4)
+; ZVBB-NEXT:    vl1re16.v v10, (a5)
+; ZVBB-NEXT:    vl1re16.v v18, (a6)
+; ZVBB-NEXT:    vl1re16.v v11, (a7)
+; ZVBB-NEXT:    vl1re16.v v19, (t0)
+; ZVBB-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v16, (a1)
+; ZVBB-NEXT:    vl1re16.v v9, (a2)
+; ZVBB-NEXT:    vl1re16.v v17, (a4)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 64 x half> @llvm.vector.interleave8.nxv64f16(<vscale x 8 x half> %v0, <vscale x 8 x half> %v1, <vscale x 8 x half> %v2, <vscale x 8 x half> %v3, <vscale x 8 x half> %v4, <vscale x 8 x half> %v5, <vscale x 8 x half> %v6, <vscale x 8 x half> %v7)
+  ret <vscale x 64 x half> %res
+}
+
+define <vscale x 16 x bfloat> @vector_interleave_nxv16bf16_nxv2bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5, <vscale x 2 x bfloat> %v6, <vscale x 2 x bfloat> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv16bf16_nxv2bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    add a4, a3, a2
+; CHECK-NEXT:    add a5, a4, a2
+; CHECK-NEXT:    add a6, a5, a2
+; CHECK-NEXT:    add a7, a6, a2
+; CHECK-NEXT:    vsetvli t0, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vsseg8e16.v v8, (a0)
+; CHECK-NEXT:    add t0, a7, a2
+; CHECK-NEXT:    add a2, t0, a2
+; CHECK-NEXT:    vle16.v v11, (t0)
+; CHECK-NEXT:    vle16.v v8, (a2)
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    add a2, a1, a1
+; CHECK-NEXT:    vle16.v v9, (a7)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v11, v8, a1
+; CHECK-NEXT:    vsetvli a7, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v10, (a6)
+; CHECK-NEXT:    vle16.v v8, (a5)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v9, a1
+; CHECK-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v9, (a4)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v8, a1
+; CHECK-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; CHECK-NEXT:    vle16.v v12, (a3)
+; CHECK-NEXT:    vle16.v v8, (a0)
+; CHECK-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v12, a1
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16bf16_nxv2bf16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a2, a1, 1
+; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    add a4, a3, a2
+; ZVBB-NEXT:    add a5, a4, a2
+; ZVBB-NEXT:    add a6, a5, a2
+; ZVBB-NEXT:    add a7, a6, a2
+; ZVBB-NEXT:    vsetvli t0, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vsseg8e16.v v8, (a0)
+; ZVBB-NEXT:    add t0, a7, a2
+; ZVBB-NEXT:    add a2, t0, a2
+; ZVBB-NEXT:    vle16.v v11, (t0)
+; ZVBB-NEXT:    vle16.v v8, (a2)
+; ZVBB-NEXT:    srli a1, a1, 2
+; ZVBB-NEXT:    add a2, a1, a1
+; ZVBB-NEXT:    vle16.v v9, (a7)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v11, v8, a1
+; ZVBB-NEXT:    vsetvli a7, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v10, (a6)
+; ZVBB-NEXT:    vle16.v v8, (a5)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v10, v9, a1
+; ZVBB-NEXT:    vsetvli a5, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v9, (a4)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v9, v8, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e16, mf2, ta, ma
+; ZVBB-NEXT:    vle16.v v12, (a3)
+; ZVBB-NEXT:    vle16.v v8, (a0)
+; ZVBB-NEXT:    vsetvli zero, a2, e16, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v12, a1
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 16 x bfloat> @llvm.vector.interleave8.nxv16bf16(<vscale x 2 x bfloat> %v0, <vscale x 2 x bfloat> %v1, <vscale x 2 x bfloat> %v2, <vscale x 2 x bfloat> %v3, <vscale x 2 x bfloat> %v4, <vscale x 2 x bfloat> %v5, <vscale x 2 x bfloat> %v6, <vscale x 2 x bfloat> %v7)
+  ret <vscale x 16 x bfloat> %res
+}
+
+define <vscale x 32 x bfloat> @vector_interleave_nxv32bf16_nxv4bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5, <vscale x 4 x bfloat> %v6, <vscale x 4 x bfloat> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv32bf16_nxv4bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    sub sp, sp, a0
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    add a2, a0, a1
 ; CHECK-NEXT:    add a3, a2, a1
-; CHECK-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
-; CHECK-NEXT:    vsseg7e64.v v8, (a0)
-; CHECK-NEXT:    vl1re64.v v10, (a3)
-; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    vl1re64.v v11, (a3)
-; CHECK-NEXT:    add a3, a3, a1
-; CHECK-NEXT:    vl1re64.v v8, (a0)
-; CHECK-NEXT:    add a0, a3, a1
-; CHECK-NEXT:    vl1re64.v v9, (a2)
-; CHECK-NEXT:    vl1re64.v v12, (a3)
-; CHECK-NEXT:    vl1re64.v v13, (a0)
-; CHECK-NEXT:    add a0, a0, a1
-; CHECK-NEXT:    vl1re64.v v14, (a0)
+; CHECK-NEXT:    add a4, a3, a1
+; CHECK-NEXT:    add a5, a4, a1
+; CHECK-NEXT:    add a6, a5, a1
+; CHECK-NEXT:    add a7, a6, a1
+; CHECK-NEXT:    vsetvli t0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vsseg8e16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v14, (a7)
+; CHECK-NEXT:    add a1, a7, a1
+; CHECK-NEXT:    vl1re16.v v15, (a1)
+; CHECK-NEXT:    vl1re16.v v12, (a5)
+; CHECK-NEXT:    vl1re16.v v13, (a6)
+; CHECK-NEXT:    vl1re16.v v10, (a3)
+; CHECK-NEXT:    vl1re16.v v11, (a4)
+; CHECK-NEXT:    vl1re16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v9, (a2)
 ; CHECK-NEXT:    csrr a0, vlenb
-; CHECK-NEXT:    slli a1, a0, 3
-; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    slli a0, a0, 3
 ; CHECK-NEXT:    add sp, sp, a0
 ; CHECK-NEXT:    addi sp, sp, 16
 ; CHECK-NEXT:    ret
 ;
-; ZVBB-LABEL: vector_interleave_nxv7f64_nxv1f64:
+; ZVBB-LABEL: vector_interleave_nxv32bf16_nxv4bf16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 3
-; ZVBB-NEXT:    sub a0, a1, a0
+; ZVBB-NEXT:    slli a0, a0, 3
 ; ZVBB-NEXT:    sub sp, sp, a0
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    add a2, a0, a1
 ; ZVBB-NEXT:    add a3, a2, a1
-; ZVBB-NEXT:    vsetvli a4, zero, e64, m1, ta, ma
-; ZVBB-NEXT:    vsseg7e64.v v8, (a0)
-; ZVBB-NEXT:    vl1re64.v v10, (a3)
-; ZVBB-NEXT:    add a3, a3, a1
-; ZVBB-NEXT:    vl1re64.v v11, (a3)
-; ZVBB-NEXT:    add a3, a3, a1
-; ZVBB-NEXT:    vl1re64.v v8, (a0)
-; ZVBB-NEXT:    add a0, a3, a1
-; ZVBB-NEXT:    vl1re64.v v9, (a2)
-; ZVBB-NEXT:    vl1re64.v v12, (a3)
-; ZVBB-NEXT:    vl1re64.v v13, (a0)
-; ZVBB-NEXT:    add a0, a0, a1
-; ZVBB-NEXT:    vl1re64.v v14, (a0)
+; ZVBB-NEXT:    add a4, a3, a1
+; ZVBB-NEXT:    add a5, a4, a1
+; ZVBB-NEXT:    add a6, a5, a1
+; ZVBB-NEXT:    add a7, a6, a1
+; ZVBB-NEXT:    vsetvli t0, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vsseg8e16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v14, (a7)
+; ZVBB-NEXT:    add a1, a7, a1
+; ZVBB-NEXT:    vl1re16.v v15, (a1)
+; ZVBB-NEXT:    vl1re16.v v12, (a5)
+; ZVBB-NEXT:    vl1re16.v v13, (a6)
+; ZVBB-NEXT:    vl1re16.v v10, (a3)
+; ZVBB-NEXT:    vl1re16.v v11, (a4)
+; ZVBB-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v9, (a2)
 ; ZVBB-NEXT:    csrr a0, vlenb
-; ZVBB-NEXT:    slli a1, a0, 3
-; ZVBB-NEXT:    sub a0, a1, a0
+; ZVBB-NEXT:    slli a0, a0, 3
 ; ZVBB-NEXT:    add sp, sp, a0
 ; ZVBB-NEXT:    addi sp, sp, 16
 ; ZVBB-NEXT:    ret
-  %res = call <vscale x 7 x double> @llvm.vector.interleave7.nxv7f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5, <vscale x 1 x double> %v6)
-  ret <vscale x 7 x double> %res
+  %res = call <vscale x 32 x bfloat> @llvm.vector.interleave8.nxv32bf16(<vscale x 4 x bfloat> %v0, <vscale x 4 x bfloat> %v1, <vscale x 4 x bfloat> %v2, <vscale x 4 x bfloat> %v3, <vscale x 4 x bfloat> %v4, <vscale x 4 x bfloat> %v5, <vscale x 4 x bfloat> %v6, <vscale x 4 x bfloat> %v7)
+  ret <vscale x 32 x bfloat> %res
 }
 
-define <vscale x 14 x double> @vector_interleave_nxv14f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5, <vscale x 2 x double> %v6) nounwind {
-; RV32-LABEL: vector_interleave_nxv14f64_nxv2f64:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -80
-; RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
-; RV32-NEXT:    addi s0, sp, 80
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 5
-; RV32-NEXT:    sub sp, sp, a0
-; RV32-NEXT:    andi sp, sp, -64
-; RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV32-NEXT:    vmv2r.v v26, v20
-; RV32-NEXT:    addi a0, sp, 64
-; RV32-NEXT:    vmv2r.v v24, v16
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a2, a1, 3
-; RV32-NEXT:    sub a1, a2, a1
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 64
-; RV32-NEXT:    vmv2r.v v22, v12
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    vmv2r.v v20, v8
-; RV32-NEXT:    vmv1r.v v1, v20
-; RV32-NEXT:    vmv1r.v v3, v22
-; RV32-NEXT:    vmv1r.v v5, v24
-; RV32-NEXT:    vmv1r.v v7, v26
-; RV32-NEXT:    add a3, a0, a2
-; RV32-NEXT:    vmv1r.v v2, v10
-; RV32-NEXT:    add a4, a1, a2
-; RV32-NEXT:    slli a5, a2, 2
-; RV32-NEXT:    vmv1r.v v4, v14
-; RV32-NEXT:    slli a6, a2, 4
-; RV32-NEXT:    add a7, a4, a2
-; RV32-NEXT:    vmv1r.v v6, v18
-; RV32-NEXT:    sub a5, a6, a5
-; RV32-NEXT:    vmv1r.v v22, v11
-; RV32-NEXT:    add a6, a7, a2
-; RV32-NEXT:    vmv1r.v v24, v15
-; RV32-NEXT:    vsseg7e64.v v1, (a0)
-; RV32-NEXT:    vmv1r.v v26, v19
-; RV32-NEXT:    vsseg7e64.v v21, (a1)
-; RV32-NEXT:    vl1re64.v v18, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v19, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v20, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v21, (a6)
-; RV32-NEXT:    add a6, a3, a2
-; RV32-NEXT:    vl1re64.v v10, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v11, (a6)
-; RV32-NEXT:    vl1re64.v v8, (a0)
-; RV32-NEXT:    vl1re64.v v16, (a4)
-; RV32-NEXT:    vl1re64.v v9, (a3)
-; RV32-NEXT:    vl1re64.v v17, (a7)
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    li a3, 14
-; RV32-NEXT:    mul a0, a0, a3
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 64
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v12, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    vl1re64.v v13, (a6)
-; RV32-NEXT:    add a6, a6, a2
-; RV32-NEXT:    slli a2, a2, 3
-; RV32-NEXT:    add a2, a0, a2
-; RV32-NEXT:    vl1re64.v v14, (a6)
-; RV32-NEXT:    vl1re64.v v15, (a1)
-; RV32-NEXT:    add a5, a0, a5
-; RV32-NEXT:    vs2r.v v20, (a5)
-; RV32-NEXT:    vs4r.v v16, (a2)
-; RV32-NEXT:    vs8r.v v8, (a0)
-; RV32-NEXT:    vl8re64.v v16, (a2)
-; RV32-NEXT:    vl8re64.v v8, (a0)
-; RV32-NEXT:    addi sp, s0, -80
-; RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 80
-; RV32-NEXT:    ret
+define <vscale x 64 x bfloat> @vector_interleave_nxv64bf16_nxv8bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5, <vscale x 8 x bfloat> %v6, <vscale x 8 x bfloat> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv64bf16_nxv8bf16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v22
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vmv2r.v v26, v18
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    add a2, a0, a3
+; CHECK-NEXT:    add a4, a1, a3
+; CHECK-NEXT:    add a5, a2, a3
+; CHECK-NEXT:    vmv1r.v v1, v8
+; CHECK-NEXT:    vmv2r.v v24, v14
+; CHECK-NEXT:    add a6, a4, a3
+; CHECK-NEXT:    vmv2r.v v22, v10
+; CHECK-NEXT:    vmv1r.v v2, v22
+; CHECK-NEXT:    add a7, a5, a3
+; CHECK-NEXT:    vmv1r.v v3, v12
+; CHECK-NEXT:    add t0, a6, a3
+; CHECK-NEXT:    vmv1r.v v4, v24
+; CHECK-NEXT:    add t1, a7, a3
+; CHECK-NEXT:    vmv1r.v v5, v16
+; CHECK-NEXT:    add t2, t0, a3
+; CHECK-NEXT:    vmv1r.v v6, v26
+; CHECK-NEXT:    add t3, t1, a3
+; CHECK-NEXT:    vmv1r.v v7, v20
+; CHECK-NEXT:    add t4, t2, a3
+; CHECK-NEXT:    vmv1r.v v8, v28
+; CHECK-NEXT:    vmv1r.v v22, v9
+; CHECK-NEXT:    add t5, t3, a3
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    add t6, t4, a3
+; CHECK-NEXT:    vmv1r.v v26, v17
+; CHECK-NEXT:    vsseg8e16.v v1, (a0)
+; CHECK-NEXT:    vmv1r.v v28, v21
+; CHECK-NEXT:    vsseg8e16.v v22, (a1)
+; CHECK-NEXT:    vl1re16.v v14, (t5)
+; CHECK-NEXT:    add t5, t5, a3
+; CHECK-NEXT:    add a3, t6, a3
+; CHECK-NEXT:    vl1re16.v v22, (t6)
+; CHECK-NEXT:    vl1re16.v v15, (t5)
+; CHECK-NEXT:    vl1re16.v v23, (a3)
+; CHECK-NEXT:    vl1re16.v v12, (t1)
+; CHECK-NEXT:    vl1re16.v v20, (t2)
+; CHECK-NEXT:    vl1re16.v v13, (t3)
+; CHECK-NEXT:    vl1re16.v v21, (t4)
+; CHECK-NEXT:    vl1re16.v v10, (a5)
+; CHECK-NEXT:    vl1re16.v v18, (a6)
+; CHECK-NEXT:    vl1re16.v v11, (a7)
+; CHECK-NEXT:    vl1re16.v v19, (t0)
+; CHECK-NEXT:    vl1re16.v v8, (a0)
+; CHECK-NEXT:    vl1re16.v v16, (a1)
+; CHECK-NEXT:    vl1re16.v v9, (a2)
+; CHECK-NEXT:    vl1re16.v v17, (a4)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv64bf16_nxv8bf16:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    vsetvli a0, zero, e16, m1, ta, ma
+; ZVBB-NEXT:    vmv2r.v v28, v22
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    vmv2r.v v26, v18
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 3
+; ZVBB-NEXT:    add a1, sp, a1
+; ZVBB-NEXT:    addi a1, a1, 16
+; ZVBB-NEXT:    csrr a3, vlenb
+; ZVBB-NEXT:    add a2, a0, a3
+; ZVBB-NEXT:    add a4, a1, a3
+; ZVBB-NEXT:    add a5, a2, a3
+; ZVBB-NEXT:    vmv1r.v v1, v8
+; ZVBB-NEXT:    vmv2r.v v24, v14
+; ZVBB-NEXT:    add a6, a4, a3
+; ZVBB-NEXT:    vmv2r.v v22, v10
+; ZVBB-NEXT:    vmv1r.v v2, v22
+; ZVBB-NEXT:    add a7, a5, a3
+; ZVBB-NEXT:    vmv1r.v v3, v12
+; ZVBB-NEXT:    add t0, a6, a3
+; ZVBB-NEXT:    vmv1r.v v4, v24
+; ZVBB-NEXT:    add t1, a7, a3
+; ZVBB-NEXT:    vmv1r.v v5, v16
+; ZVBB-NEXT:    add t2, t0, a3
+; ZVBB-NEXT:    vmv1r.v v6, v26
+; ZVBB-NEXT:    add t3, t1, a3
+; ZVBB-NEXT:    vmv1r.v v7, v20
+; ZVBB-NEXT:    add t4, t2, a3
+; ZVBB-NEXT:    vmv1r.v v8, v28
+; ZVBB-NEXT:    vmv1r.v v22, v9
+; ZVBB-NEXT:    add t5, t3, a3
+; ZVBB-NEXT:    vmv1r.v v24, v13
+; ZVBB-NEXT:    add t6, t4, a3
+; ZVBB-NEXT:    vmv1r.v v26, v17
+; ZVBB-NEXT:    vsseg8e16.v v1, (a0)
+; ZVBB-NEXT:    vmv1r.v v28, v21
+; ZVBB-NEXT:    vsseg8e16.v v22, (a1)
+; ZVBB-NEXT:    vl1re16.v v14, (t5)
+; ZVBB-NEXT:    add t5, t5, a3
+; ZVBB-NEXT:    add a3, t6, a3
+; ZVBB-NEXT:    vl1re16.v v22, (t6)
+; ZVBB-NEXT:    vl1re16.v v15, (t5)
+; ZVBB-NEXT:    vl1re16.v v23, (a3)
+; ZVBB-NEXT:    vl1re16.v v12, (t1)
+; ZVBB-NEXT:    vl1re16.v v20, (t2)
+; ZVBB-NEXT:    vl1re16.v v13, (t3)
+; ZVBB-NEXT:    vl1re16.v v21, (t4)
+; ZVBB-NEXT:    vl1re16.v v10, (a5)
+; ZVBB-NEXT:    vl1re16.v v18, (a6)
+; ZVBB-NEXT:    vl1re16.v v11, (a7)
+; ZVBB-NEXT:    vl1re16.v v19, (t0)
+; ZVBB-NEXT:    vl1re16.v v8, (a0)
+; ZVBB-NEXT:    vl1re16.v v16, (a1)
+; ZVBB-NEXT:    vl1re16.v v9, (a2)
+; ZVBB-NEXT:    vl1re16.v v17, (a4)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 64 x bfloat> @llvm.vector.interleave8.nxv64bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x bfloat> %v4, <vscale x 8 x bfloat> %v5, <vscale x 8 x bfloat> %v6, <vscale x 8 x bfloat> %v7)
+  ret <vscale x 64 x bfloat> %res
+}
+
+define <vscale x 8 x float> @vector_interleave_nxv8f32_nxv1f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5, <vscale x 1 x float> %v6, <vscale x 1 x float> %v8) nounwind {
+; CHECK-LABEL: vector_interleave_nxv8f32_nxv1f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a2, a1, 1
+; CHECK-NEXT:    add a3, a0, a2
+; CHECK-NEXT:    add a4, a3, a2
+; CHECK-NEXT:    add a5, a4, a2
+; CHECK-NEXT:    add a6, a5, a2
+; CHECK-NEXT:    add a7, a6, a2
+; CHECK-NEXT:    vsetvli t0, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vsseg8e32.v v8, (a0)
+; CHECK-NEXT:    add t0, a7, a2
+; CHECK-NEXT:    add a2, t0, a2
+; CHECK-NEXT:    vle32.v v11, (t0)
+; CHECK-NEXT:    vle32.v v8, (a2)
+; CHECK-NEXT:    srli a1, a1, 3
+; CHECK-NEXT:    add a2, a1, a1
+; CHECK-NEXT:    vle32.v v9, (a7)
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v11, v8, a1
+; CHECK-NEXT:    vsetvli a7, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v10, (a6)
+; CHECK-NEXT:    vle32.v v8, (a5)
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v9, a1
+; CHECK-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v9, (a4)
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v9, v8, a1
+; CHECK-NEXT:    vsetvli a4, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vle32.v v12, (a3)
+; CHECK-NEXT:    vle32.v v8, (a0)
+; CHECK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v12, a1
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 2
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; RV64-LABEL: vector_interleave_nxv14f64_nxv2f64:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -80
-; RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
-; RV64-NEXT:    addi s0, sp, 80
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 5
-; RV64-NEXT:    sub sp, sp, a0
-; RV64-NEXT:    andi sp, sp, -64
-; RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; RV64-NEXT:    vmv2r.v v26, v20
-; RV64-NEXT:    addi a0, sp, 64
-; RV64-NEXT:    vmv2r.v v24, v16
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a2, a1, 3
-; RV64-NEXT:    sub a1, a2, a1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 64
-; RV64-NEXT:    vmv2r.v v22, v12
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    vmv2r.v v20, v8
-; RV64-NEXT:    vmv1r.v v1, v20
-; RV64-NEXT:    vmv1r.v v3, v22
-; RV64-NEXT:    vmv1r.v v5, v24
-; RV64-NEXT:    vmv1r.v v7, v26
-; RV64-NEXT:    add a3, a0, a2
-; RV64-NEXT:    vmv1r.v v2, v10
-; RV64-NEXT:    add a4, a1, a2
-; RV64-NEXT:    slli a5, a2, 2
-; RV64-NEXT:    vmv1r.v v4, v14
-; RV64-NEXT:    slli a6, a2, 4
-; RV64-NEXT:    add a7, a4, a2
-; RV64-NEXT:    vmv1r.v v6, v18
-; RV64-NEXT:    sub a5, a6, a5
-; RV64-NEXT:    vmv1r.v v22, v11
-; RV64-NEXT:    add a6, a7, a2
-; RV64-NEXT:    vmv1r.v v24, v15
-; RV64-NEXT:    vsseg7e64.v v1, (a0)
-; RV64-NEXT:    vmv1r.v v26, v19
-; RV64-NEXT:    vsseg7e64.v v21, (a1)
-; RV64-NEXT:    vl1re64.v v18, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v19, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v20, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v21, (a6)
-; RV64-NEXT:    add a6, a3, a2
-; RV64-NEXT:    vl1re64.v v10, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v11, (a6)
-; RV64-NEXT:    vl1re64.v v8, (a0)
-; RV64-NEXT:    vl1re64.v v16, (a4)
-; RV64-NEXT:    vl1re64.v v9, (a3)
-; RV64-NEXT:    vl1re64.v v17, (a7)
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    li a3, 14
-; RV64-NEXT:    mul a0, a0, a3
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 64
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v12, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    vl1re64.v v13, (a6)
-; RV64-NEXT:    add a6, a6, a2
-; RV64-NEXT:    slli a2, a2, 3
-; RV64-NEXT:    add a2, a0, a2
-; RV64-NEXT:    vl1re64.v v14, (a6)
-; RV64-NEXT:    vl1re64.v v15, (a1)
-; RV64-NEXT:    add a5, a0, a5
-; RV64-NEXT:    vs2r.v v20, (a5)
-; RV64-NEXT:    vs4r.v v16, (a2)
-; RV64-NEXT:    vs8r.v v8, (a0)
-; RV64-NEXT:    vl8re64.v v16, (a2)
-; RV64-NEXT:    vl8re64.v v8, (a0)
-; RV64-NEXT:    addi sp, s0, -80
-; RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 80
-; RV64-NEXT:    ret
+; ZVBB-LABEL: vector_interleave_nxv8f32_nxv1f32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    srli a2, a1, 1
+; ZVBB-NEXT:    add a3, a0, a2
+; ZVBB-NEXT:    add a4, a3, a2
+; ZVBB-NEXT:    add a5, a4, a2
+; ZVBB-NEXT:    add a6, a5, a2
+; ZVBB-NEXT:    add a7, a6, a2
+; ZVBB-NEXT:    vsetvli t0, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vsseg8e32.v v8, (a0)
+; ZVBB-NEXT:    add t0, a7, a2
+; ZVBB-NEXT:    add a2, t0, a2
+; ZVBB-NEXT:    vle32.v v11, (t0)
+; ZVBB-NEXT:    vle32.v v8, (a2)
+; ZVBB-NEXT:    srli a1, a1, 3
+; ZVBB-NEXT:    add a2, a1, a1
+; ZVBB-NEXT:    vle32.v v9, (a7)
+; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v11, v8, a1
+; ZVBB-NEXT:    vsetvli a7, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vle32.v v10, (a6)
+; ZVBB-NEXT:    vle32.v v8, (a5)
+; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v10, v9, a1
+; ZVBB-NEXT:    vsetvli a5, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vle32.v v9, (a4)
+; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v9, v8, a1
+; ZVBB-NEXT:    vsetvli a4, zero, e32, mf2, ta, ma
+; ZVBB-NEXT:    vle32.v v12, (a3)
+; ZVBB-NEXT:    vle32.v v8, (a0)
+; ZVBB-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
+; ZVBB-NEXT:    vslideup.vx v8, v12, a1
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 2
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 8 x float> @llvm.vector.interleave8.nxv8f32(<vscale x 1 x float> %v0, <vscale x 1 x float> %v1, <vscale x 1 x float> %v2, <vscale x 1 x float> %v3, <vscale x 1 x float> %v4, <vscale x 1 x float> %v5, <vscale x 1 x float> %v6, <vscale x 1 x float> %v8)
+  ret <vscale x 8 x float> %res
+}
+
+define <vscale x 16 x float> @vector_interleave_nxv16f32_nxv2f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5, <vscale x 2 x float> %v6, <vscale x 2 x float> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv16f32_nxv2f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    add a4, a3, a1
+; CHECK-NEXT:    add a5, a4, a1
+; CHECK-NEXT:    add a6, a5, a1
+; CHECK-NEXT:    add a7, a6, a1
+; CHECK-NEXT:    vsetvli t0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vsseg8e32.v v8, (a0)
+; CHECK-NEXT:    vl1re32.v v14, (a7)
+; CHECK-NEXT:    add a1, a7, a1
+; CHECK-NEXT:    vl1re32.v v15, (a1)
+; CHECK-NEXT:    vl1re32.v v12, (a5)
+; CHECK-NEXT:    vl1re32.v v13, (a6)
+; CHECK-NEXT:    vl1re32.v v10, (a3)
+; CHECK-NEXT:    vl1re32.v v11, (a4)
+; CHECK-NEXT:    vl1re32.v v8, (a0)
+; CHECK-NEXT:    vl1re32.v v9, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; ZVBB-RV32-LABEL: vector_interleave_nxv14f64_nxv2f64:
-; ZVBB-RV32:       # %bb.0:
-; ZVBB-RV32-NEXT:    addi sp, sp, -80
-; ZVBB-RV32-NEXT:    sw ra, 76(sp) # 4-byte Folded Spill
-; ZVBB-RV32-NEXT:    sw s0, 72(sp) # 4-byte Folded Spill
-; ZVBB-RV32-NEXT:    addi s0, sp, 80
-; ZVBB-RV32-NEXT:    csrr a0, vlenb
-; ZVBB-RV32-NEXT:    slli a0, a0, 5
-; ZVBB-RV32-NEXT:    sub sp, sp, a0
-; ZVBB-RV32-NEXT:    andi sp, sp, -64
-; ZVBB-RV32-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; ZVBB-RV32-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV32-NEXT:    addi a0, sp, 64
-; ZVBB-RV32-NEXT:    vmv2r.v v24, v16
-; ZVBB-RV32-NEXT:    csrr a1, vlenb
-; ZVBB-RV32-NEXT:    slli a2, a1, 3
-; ZVBB-RV32-NEXT:    sub a1, a2, a1
-; ZVBB-RV32-NEXT:    add a1, sp, a1
-; ZVBB-RV32-NEXT:    addi a1, a1, 64
-; ZVBB-RV32-NEXT:    vmv2r.v v22, v12
-; ZVBB-RV32-NEXT:    csrr a2, vlenb
-; ZVBB-RV32-NEXT:    vmv2r.v v20, v8
-; ZVBB-RV32-NEXT:    vmv1r.v v1, v20
-; ZVBB-RV32-NEXT:    vmv1r.v v3, v22
-; ZVBB-RV32-NEXT:    vmv1r.v v5, v24
-; ZVBB-RV32-NEXT:    vmv1r.v v7, v26
-; ZVBB-RV32-NEXT:    add a3, a0, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV32-NEXT:    add a4, a1, a2
-; ZVBB-RV32-NEXT:    slli a5, a2, 2
-; ZVBB-RV32-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV32-NEXT:    slli a6, a2, 4
-; ZVBB-RV32-NEXT:    add a7, a4, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV32-NEXT:    sub a5, a6, a5
-; ZVBB-RV32-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV32-NEXT:    add a6, a7, a2
-; ZVBB-RV32-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV32-NEXT:    vsseg7e64.v v1, (a0)
-; ZVBB-RV32-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV32-NEXT:    vsseg7e64.v v21, (a1)
-; ZVBB-RV32-NEXT:    vl1re64.v v18, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v19, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v20, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v21, (a6)
-; ZVBB-RV32-NEXT:    add a6, a3, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v10, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v11, (a6)
-; ZVBB-RV32-NEXT:    vl1re64.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl1re64.v v16, (a4)
-; ZVBB-RV32-NEXT:    vl1re64.v v9, (a3)
-; ZVBB-RV32-NEXT:    vl1re64.v v17, (a7)
-; ZVBB-RV32-NEXT:    csrr a0, vlenb
-; ZVBB-RV32-NEXT:    li a3, 14
-; ZVBB-RV32-NEXT:    mul a0, a0, a3
-; ZVBB-RV32-NEXT:    add a0, sp, a0
-; ZVBB-RV32-NEXT:    addi a0, a0, 64
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v12, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v13, (a6)
-; ZVBB-RV32-NEXT:    add a6, a6, a2
-; ZVBB-RV32-NEXT:    slli a2, a2, 3
-; ZVBB-RV32-NEXT:    add a2, a0, a2
-; ZVBB-RV32-NEXT:    vl1re64.v v14, (a6)
-; ZVBB-RV32-NEXT:    vl1re64.v v15, (a1)
-; ZVBB-RV32-NEXT:    add a5, a0, a5
-; ZVBB-RV32-NEXT:    vs2r.v v20, (a5)
-; ZVBB-RV32-NEXT:    vs4r.v v16, (a2)
-; ZVBB-RV32-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV32-NEXT:    vl8re64.v v16, (a2)
-; ZVBB-RV32-NEXT:    vl8re64.v v8, (a0)
-; ZVBB-RV32-NEXT:    addi sp, s0, -80
-; ZVBB-RV32-NEXT:    lw ra, 76(sp) # 4-byte Folded Reload
-; ZVBB-RV32-NEXT:    lw s0, 72(sp) # 4-byte Folded Reload
-; ZVBB-RV32-NEXT:    addi sp, sp, 80
-; ZVBB-RV32-NEXT:    ret
+; ZVBB-LABEL: vector_interleave_nxv16f32_nxv2f32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    add a4, a3, a1
+; ZVBB-NEXT:    add a5, a4, a1
+; ZVBB-NEXT:    add a6, a5, a1
+; ZVBB-NEXT:    add a7, a6, a1
+; ZVBB-NEXT:    vsetvli t0, zero, e32, m1, ta, ma
+; ZVBB-NEXT:    vsseg8e32.v v8, (a0)
+; ZVBB-NEXT:    vl1re32.v v14, (a7)
+; ZVBB-NEXT:    add a1, a7, a1
+; ZVBB-NEXT:    vl1re32.v v15, (a1)
+; ZVBB-NEXT:    vl1re32.v v12, (a5)
+; ZVBB-NEXT:    vl1re32.v v13, (a6)
+; ZVBB-NEXT:    vl1re32.v v10, (a3)
+; ZVBB-NEXT:    vl1re32.v v11, (a4)
+; ZVBB-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-NEXT:    vl1re32.v v9, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 16 x float> @llvm.vector.interleave8.nxv16f32(<vscale x 2 x float> %v0, <vscale x 2 x float> %v1, <vscale x 2 x float> %v2, <vscale x 2 x float> %v3, <vscale x 2 x float> %v4, <vscale x 2 x float> %v5, <vscale x 2 x float> %v6, <vscale x 2 x float> %v7)
+  ret <vscale x 16 x float> %res
+}
+
+define <vscale x 32 x float> @vector_interleave_nxv32f32_nxv4f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5, <vscale x 4 x float> %v6, <vscale x 4 x float> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv32f32_nxv4f32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v22
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vmv2r.v v26, v18
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    add a2, a0, a3
+; CHECK-NEXT:    add a4, a1, a3
+; CHECK-NEXT:    add a5, a2, a3
+; CHECK-NEXT:    vmv1r.v v1, v8
+; CHECK-NEXT:    vmv2r.v v24, v14
+; CHECK-NEXT:    add a6, a4, a3
+; CHECK-NEXT:    vmv2r.v v22, v10
+; CHECK-NEXT:    vmv1r.v v2, v22
+; CHECK-NEXT:    add a7, a5, a3
+; CHECK-NEXT:    vmv1r.v v3, v12
+; CHECK-NEXT:    add t0, a6, a3
+; CHECK-NEXT:    vmv1r.v v4, v24
+; CHECK-NEXT:    add t1, a7, a3
+; CHECK-NEXT:    vmv1r.v v5, v16
+; CHECK-NEXT:    add t2, t0, a3
+; CHECK-NEXT:    vmv1r.v v6, v26
+; CHECK-NEXT:    add t3, t1, a3
+; CHECK-NEXT:    vmv1r.v v7, v20
+; CHECK-NEXT:    add t4, t2, a3
+; CHECK-NEXT:    vmv1r.v v8, v28
+; CHECK-NEXT:    vmv1r.v v22, v9
+; CHECK-NEXT:    add t5, t3, a3
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    add t6, t4, a3
+; CHECK-NEXT:    vmv1r.v v26, v17
+; CHECK-NEXT:    vsseg8e32.v v1, (a0)
+; CHECK-NEXT:    vmv1r.v v28, v21
+; CHECK-NEXT:    vsseg8e32.v v22, (a1)
+; CHECK-NEXT:    vl1re32.v v14, (t5)
+; CHECK-NEXT:    add t5, t5, a3
+; CHECK-NEXT:    add a3, t6, a3
+; CHECK-NEXT:    vl1re32.v v22, (t6)
+; CHECK-NEXT:    vl1re32.v v15, (t5)
+; CHECK-NEXT:    vl1re32.v v23, (a3)
+; CHECK-NEXT:    vl1re32.v v12, (t1)
+; CHECK-NEXT:    vl1re32.v v20, (t2)
+; CHECK-NEXT:    vl1re32.v v13, (t3)
+; CHECK-NEXT:    vl1re32.v v21, (t4)
+; CHECK-NEXT:    vl1re32.v v10, (a5)
+; CHECK-NEXT:    vl1re32.v v18, (a6)
+; CHECK-NEXT:    vl1re32.v v11, (a7)
+; CHECK-NEXT:    vl1re32.v v19, (t0)
+; CHECK-NEXT:    vl1re32.v v8, (a0)
+; CHECK-NEXT:    vl1re32.v v16, (a1)
+; CHECK-NEXT:    vl1re32.v v9, (a2)
+; CHECK-NEXT:    vl1re32.v v17, (a4)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; ZVBB-RV64-LABEL: vector_interleave_nxv14f64_nxv2f64:
-; ZVBB-RV64:       # %bb.0:
-; ZVBB-RV64-NEXT:    addi sp, sp, -80
-; ZVBB-RV64-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
-; ZVBB-RV64-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
-; ZVBB-RV64-NEXT:    addi s0, sp, 80
-; ZVBB-RV64-NEXT:    csrr a0, vlenb
-; ZVBB-RV64-NEXT:    slli a0, a0, 5
-; ZVBB-RV64-NEXT:    sub sp, sp, a0
-; ZVBB-RV64-NEXT:    andi sp, sp, -64
-; ZVBB-RV64-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; ZVBB-RV64-NEXT:    vmv2r.v v26, v20
-; ZVBB-RV64-NEXT:    addi a0, sp, 64
-; ZVBB-RV64-NEXT:    vmv2r.v v24, v16
-; ZVBB-RV64-NEXT:    csrr a1, vlenb
-; ZVBB-RV64-NEXT:    slli a2, a1, 3
-; ZVBB-RV64-NEXT:    sub a1, a2, a1
-; ZVBB-RV64-NEXT:    add a1, sp, a1
-; ZVBB-RV64-NEXT:    addi a1, a1, 64
-; ZVBB-RV64-NEXT:    vmv2r.v v22, v12
-; ZVBB-RV64-NEXT:    csrr a2, vlenb
-; ZVBB-RV64-NEXT:    vmv2r.v v20, v8
-; ZVBB-RV64-NEXT:    vmv1r.v v1, v20
-; ZVBB-RV64-NEXT:    vmv1r.v v3, v22
-; ZVBB-RV64-NEXT:    vmv1r.v v5, v24
-; ZVBB-RV64-NEXT:    vmv1r.v v7, v26
-; ZVBB-RV64-NEXT:    add a3, a0, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v2, v10
-; ZVBB-RV64-NEXT:    add a4, a1, a2
-; ZVBB-RV64-NEXT:    slli a5, a2, 2
-; ZVBB-RV64-NEXT:    vmv1r.v v4, v14
-; ZVBB-RV64-NEXT:    slli a6, a2, 4
-; ZVBB-RV64-NEXT:    add a7, a4, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v6, v18
-; ZVBB-RV64-NEXT:    sub a5, a6, a5
-; ZVBB-RV64-NEXT:    vmv1r.v v22, v11
-; ZVBB-RV64-NEXT:    add a6, a7, a2
-; ZVBB-RV64-NEXT:    vmv1r.v v24, v15
-; ZVBB-RV64-NEXT:    vsseg7e64.v v1, (a0)
-; ZVBB-RV64-NEXT:    vmv1r.v v26, v19
-; ZVBB-RV64-NEXT:    vsseg7e64.v v21, (a1)
-; ZVBB-RV64-NEXT:    vl1re64.v v18, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v19, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v20, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v21, (a6)
-; ZVBB-RV64-NEXT:    add a6, a3, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v10, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v11, (a6)
-; ZVBB-RV64-NEXT:    vl1re64.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl1re64.v v16, (a4)
-; ZVBB-RV64-NEXT:    vl1re64.v v9, (a3)
-; ZVBB-RV64-NEXT:    vl1re64.v v17, (a7)
-; ZVBB-RV64-NEXT:    csrr a0, vlenb
-; ZVBB-RV64-NEXT:    li a3, 14
-; ZVBB-RV64-NEXT:    mul a0, a0, a3
-; ZVBB-RV64-NEXT:    add a0, sp, a0
-; ZVBB-RV64-NEXT:    addi a0, a0, 64
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v12, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v13, (a6)
-; ZVBB-RV64-NEXT:    add a6, a6, a2
-; ZVBB-RV64-NEXT:    slli a2, a2, 3
-; ZVBB-RV64-NEXT:    add a2, a0, a2
-; ZVBB-RV64-NEXT:    vl1re64.v v14, (a6)
-; ZVBB-RV64-NEXT:    vl1re64.v v15, (a1)
-; ZVBB-RV64-NEXT:    add a5, a0, a5
-; ZVBB-RV64-NEXT:    vs2r.v v20, (a5)
-; ZVBB-RV64-NEXT:    vs4r.v v16, (a2)
-; ZVBB-RV64-NEXT:    vs8r.v v8, (a0)
-; ZVBB-RV64-NEXT:    vl8re64.v v16, (a2)
-; ZVBB-RV64-NEXT:    vl8re64.v v8, (a0)
-; ZVBB-RV64-NEXT:    addi sp, s0, -80
-; ZVBB-RV64-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
-; ZVBB-RV64-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
-; ZVBB-RV64-NEXT:    addi sp, sp, 80
-; ZVBB-RV64-NEXT:    ret
+; ZVBB-LABEL: vector_interleave_nxv32f32_nxv4f32:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    vsetvli a0, zero, e32, m1, ta, ma
+; ZVBB-NEXT:    vmv2r.v v28, v22
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    vmv2r.v v26, v18
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 3
+; ZVBB-NEXT:    add a1, sp, a1
+; ZVBB-NEXT:    addi a1, a1, 16
+; ZVBB-NEXT:    csrr a3, vlenb
+; ZVBB-NEXT:    add a2, a0, a3
+; ZVBB-NEXT:    add a4, a1, a3
+; ZVBB-NEXT:    add a5, a2, a3
+; ZVBB-NEXT:    vmv1r.v v1, v8
+; ZVBB-NEXT:    vmv2r.v v24, v14
+; ZVBB-NEXT:    add a6, a4, a3
+; ZVBB-NEXT:    vmv2r.v v22, v10
+; ZVBB-NEXT:    vmv1r.v v2, v22
+; ZVBB-NEXT:    add a7, a5, a3
+; ZVBB-NEXT:    vmv1r.v v3, v12
+; ZVBB-NEXT:    add t0, a6, a3
+; ZVBB-NEXT:    vmv1r.v v4, v24
+; ZVBB-NEXT:    add t1, a7, a3
+; ZVBB-NEXT:    vmv1r.v v5, v16
+; ZVBB-NEXT:    add t2, t0, a3
+; ZVBB-NEXT:    vmv1r.v v6, v26
+; ZVBB-NEXT:    add t3, t1, a3
+; ZVBB-NEXT:    vmv1r.v v7, v20
+; ZVBB-NEXT:    add t4, t2, a3
+; ZVBB-NEXT:    vmv1r.v v8, v28
+; ZVBB-NEXT:    vmv1r.v v22, v9
+; ZVBB-NEXT:    add t5, t3, a3
+; ZVBB-NEXT:    vmv1r.v v24, v13
+; ZVBB-NEXT:    add t6, t4, a3
+; ZVBB-NEXT:    vmv1r.v v26, v17
+; ZVBB-NEXT:    vsseg8e32.v v1, (a0)
+; ZVBB-NEXT:    vmv1r.v v28, v21
+; ZVBB-NEXT:    vsseg8e32.v v22, (a1)
+; ZVBB-NEXT:    vl1re32.v v14, (t5)
+; ZVBB-NEXT:    add t5, t5, a3
+; ZVBB-NEXT:    add a3, t6, a3
+; ZVBB-NEXT:    vl1re32.v v22, (t6)
+; ZVBB-NEXT:    vl1re32.v v15, (t5)
+; ZVBB-NEXT:    vl1re32.v v23, (a3)
+; ZVBB-NEXT:    vl1re32.v v12, (t1)
+; ZVBB-NEXT:    vl1re32.v v20, (t2)
+; ZVBB-NEXT:    vl1re32.v v13, (t3)
+; ZVBB-NEXT:    vl1re32.v v21, (t4)
+; ZVBB-NEXT:    vl1re32.v v10, (a5)
+; ZVBB-NEXT:    vl1re32.v v18, (a6)
+; ZVBB-NEXT:    vl1re32.v v11, (a7)
+; ZVBB-NEXT:    vl1re32.v v19, (t0)
+; ZVBB-NEXT:    vl1re32.v v8, (a0)
+; ZVBB-NEXT:    vl1re32.v v16, (a1)
+; ZVBB-NEXT:    vl1re32.v v9, (a2)
+; ZVBB-NEXT:    vl1re32.v v17, (a4)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 32 x float> @llvm.vector.interleave8.nxv32f32(<vscale x 4 x float> %v0, <vscale x 4 x float> %v1, <vscale x 4 x float> %v2, <vscale x 4 x float> %v3, <vscale x 4 x float> %v4, <vscale x 4 x float> %v5, <vscale x 4 x float> %v6, <vscale x 4 x float> %v7)
+  ret <vscale x 32 x float> %res
+}
+
+define <vscale x 8 x double> @vector_interleave_nxv8f64_nxv1f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5, <vscale x 1 x double> %v6, <vscale x 1 x double> %v8) nounwind {
+; CHECK-LABEL: vector_interleave_nxv8f64_nxv1f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    add a2, a0, a1
+; CHECK-NEXT:    add a3, a2, a1
+; CHECK-NEXT:    add a4, a3, a1
+; CHECK-NEXT:    add a5, a4, a1
+; CHECK-NEXT:    add a6, a5, a1
+; CHECK-NEXT:    add a7, a6, a1
+; CHECK-NEXT:    vsetvli t0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vsseg8e64.v v8, (a0)
+; CHECK-NEXT:    vl1re64.v v14, (a7)
+; CHECK-NEXT:    add a1, a7, a1
+; CHECK-NEXT:    vl1re64.v v15, (a1)
+; CHECK-NEXT:    vl1re64.v v12, (a5)
+; CHECK-NEXT:    vl1re64.v v13, (a6)
+; CHECK-NEXT:    vl1re64.v v10, (a3)
+; CHECK-NEXT:    vl1re64.v v11, (a4)
+; CHECK-NEXT:    vl1re64.v v8, (a0)
+; CHECK-NEXT:    vl1re64.v v9, (a2)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 3
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 ;
-; ZIP-LABEL: vector_interleave_nxv14f64_nxv2f64:
-; ZIP:       # %bb.0:
-; ZIP-NEXT:    addi sp, sp, -80
-; ZIP-NEXT:    sd ra, 72(sp) # 8-byte Folded Spill
-; ZIP-NEXT:    sd s0, 64(sp) # 8-byte Folded Spill
-; ZIP-NEXT:    addi s0, sp, 80
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    slli a0, a0, 5
-; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    andi sp, sp, -64
-; ZIP-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
-; ZIP-NEXT:    vmv2r.v v26, v20
-; ZIP-NEXT:    addi a0, sp, 64
-; ZIP-NEXT:    vmv2r.v v24, v16
-; ZIP-NEXT:    csrr a1, vlenb
-; ZIP-NEXT:    slli a2, a1, 3
-; ZIP-NEXT:    sub a1, a2, a1
-; ZIP-NEXT:    add a1, sp, a1
-; ZIP-NEXT:    addi a1, a1, 64
-; ZIP-NEXT:    vmv2r.v v22, v12
-; ZIP-NEXT:    csrr a2, vlenb
-; ZIP-NEXT:    vmv2r.v v20, v8
-; ZIP-NEXT:    vmv1r.v v1, v20
-; ZIP-NEXT:    vmv1r.v v3, v22
-; ZIP-NEXT:    vmv1r.v v5, v24
-; ZIP-NEXT:    vmv1r.v v7, v26
-; ZIP-NEXT:    add a3, a0, a2
-; ZIP-NEXT:    vmv1r.v v2, v10
-; ZIP-NEXT:    add a4, a1, a2
-; ZIP-NEXT:    slli a5, a2, 2
-; ZIP-NEXT:    vmv1r.v v4, v14
-; ZIP-NEXT:    slli a6, a2, 4
-; ZIP-NEXT:    add a7, a4, a2
-; ZIP-NEXT:    vmv1r.v v6, v18
-; ZIP-NEXT:    sub a5, a6, a5
-; ZIP-NEXT:    vmv1r.v v22, v11
-; ZIP-NEXT:    add a6, a7, a2
-; ZIP-NEXT:    vmv1r.v v24, v15
-; ZIP-NEXT:    vsseg7e64.v v1, (a0)
-; ZIP-NEXT:    vmv1r.v v26, v19
-; ZIP-NEXT:    vsseg7e64.v v21, (a1)
-; ZIP-NEXT:    vl1re64.v v18, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re64.v v19, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re64.v v20, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re64.v v21, (a6)
-; ZIP-NEXT:    add a6, a3, a2
-; ZIP-NEXT:    vl1re64.v v10, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re64.v v11, (a6)
-; ZIP-NEXT:    vl1re64.v v8, (a0)
-; ZIP-NEXT:    vl1re64.v v16, (a4)
-; ZIP-NEXT:    vl1re64.v v9, (a3)
-; ZIP-NEXT:    vl1re64.v v17, (a7)
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    li a3, 14
-; ZIP-NEXT:    mul a0, a0, a3
-; ZIP-NEXT:    add a0, sp, a0
-; ZIP-NEXT:    addi a0, a0, 64
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re64.v v12, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    vl1re64.v v13, (a6)
-; ZIP-NEXT:    add a6, a6, a2
-; ZIP-NEXT:    slli a2, a2, 3
-; ZIP-NEXT:    add a2, a0, a2
-; ZIP-NEXT:    vl1re64.v v14, (a6)
-; ZIP-NEXT:    vl1re64.v v15, (a1)
-; ZIP-NEXT:    add a5, a0, a5
-; ZIP-NEXT:    vs2r.v v20, (a5)
-; ZIP-NEXT:    vs4r.v v16, (a2)
-; ZIP-NEXT:    vs8r.v v8, (a0)
-; ZIP-NEXT:    vl8re64.v v16, (a2)
-; ZIP-NEXT:    vl8re64.v v8, (a0)
-; ZIP-NEXT:    addi sp, s0, -80
-; ZIP-NEXT:    ld ra, 72(sp) # 8-byte Folded Reload
-; ZIP-NEXT:    ld s0, 64(sp) # 8-byte Folded Reload
-; ZIP-NEXT:    addi sp, sp, 80
-; ZIP-NEXT:    ret
-  %res = call <vscale x 14 x double> @llvm.vector.interleave7.nxv14f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5, <vscale x 2 x double> %v6)
-  ret <vscale x 14 x double> %res
+; ZVBB-LABEL: vector_interleave_nxv8f64_nxv1f64:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    add a2, a0, a1
+; ZVBB-NEXT:    add a3, a2, a1
+; ZVBB-NEXT:    add a4, a3, a1
+; ZVBB-NEXT:    add a5, a4, a1
+; ZVBB-NEXT:    add a6, a5, a1
+; ZVBB-NEXT:    add a7, a6, a1
+; ZVBB-NEXT:    vsetvli t0, zero, e64, m1, ta, ma
+; ZVBB-NEXT:    vsseg8e64.v v8, (a0)
+; ZVBB-NEXT:    vl1re64.v v14, (a7)
+; ZVBB-NEXT:    add a1, a7, a1
+; ZVBB-NEXT:    vl1re64.v v15, (a1)
+; ZVBB-NEXT:    vl1re64.v v12, (a5)
+; ZVBB-NEXT:    vl1re64.v v13, (a6)
+; ZVBB-NEXT:    vl1re64.v v10, (a3)
+; ZVBB-NEXT:    vl1re64.v v11, (a4)
+; ZVBB-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-NEXT:    vl1re64.v v9, (a2)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 3
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 8 x double> @llvm.vector.interleave8.nxv8f64(<vscale x 1 x double> %v0, <vscale x 1 x double> %v1, <vscale x 1 x double> %v2, <vscale x 1 x double> %v3, <vscale x 1 x double> %v4, <vscale x 1 x double> %v5, <vscale x 1 x double> %v6, <vscale x 1 x double> %v8)
+  ret <vscale x 8 x double> %res
+}
+
+define <vscale x 16 x double> @vector_interleave_nxv16f64_nxv2f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5, <vscale x 2 x double> %v6, <vscale x 2 x double> %v7) nounwind {
+; CHECK-LABEL: vector_interleave_nxv16f64_nxv2f64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; CHECK-NEXT:    vmv2r.v v28, v22
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vmv2r.v v26, v18
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    slli a1, a1, 3
+; CHECK-NEXT:    add a1, sp, a1
+; CHECK-NEXT:    addi a1, a1, 16
+; CHECK-NEXT:    csrr a3, vlenb
+; CHECK-NEXT:    add a2, a0, a3
+; CHECK-NEXT:    add a4, a1, a3
+; CHECK-NEXT:    add a5, a2, a3
+; CHECK-NEXT:    vmv1r.v v1, v8
+; CHECK-NEXT:    vmv2r.v v24, v14
+; CHECK-NEXT:    add a6, a4, a3
+; CHECK-NEXT:    vmv2r.v v22, v10
+; CHECK-NEXT:    vmv1r.v v2, v22
+; CHECK-NEXT:    add a7, a5, a3
+; CHECK-NEXT:    vmv1r.v v3, v12
+; CHECK-NEXT:    add t0, a6, a3
+; CHECK-NEXT:    vmv1r.v v4, v24
+; CHECK-NEXT:    add t1, a7, a3
+; CHECK-NEXT:    vmv1r.v v5, v16
+; CHECK-NEXT:    add t2, t0, a3
+; CHECK-NEXT:    vmv1r.v v6, v26
+; CHECK-NEXT:    add t3, t1, a3
+; CHECK-NEXT:    vmv1r.v v7, v20
+; CHECK-NEXT:    add t4, t2, a3
+; CHECK-NEXT:    vmv1r.v v8, v28
+; CHECK-NEXT:    vmv1r.v v22, v9
+; CHECK-NEXT:    add t5, t3, a3
+; CHECK-NEXT:    vmv1r.v v24, v13
+; CHECK-NEXT:    add t6, t4, a3
+; CHECK-NEXT:    vmv1r.v v26, v17
+; CHECK-NEXT:    vsseg8e64.v v1, (a0)
+; CHECK-NEXT:    vmv1r.v v28, v21
+; CHECK-NEXT:    vsseg8e64.v v22, (a1)
+; CHECK-NEXT:    vl1re64.v v14, (t5)
+; CHECK-NEXT:    add t5, t5, a3
+; CHECK-NEXT:    add a3, t6, a3
+; CHECK-NEXT:    vl1re64.v v22, (t6)
+; CHECK-NEXT:    vl1re64.v v15, (t5)
+; CHECK-NEXT:    vl1re64.v v23, (a3)
+; CHECK-NEXT:    vl1re64.v v12, (t1)
+; CHECK-NEXT:    vl1re64.v v20, (t2)
+; CHECK-NEXT:    vl1re64.v v13, (t3)
+; CHECK-NEXT:    vl1re64.v v21, (t4)
+; CHECK-NEXT:    vl1re64.v v10, (a5)
+; CHECK-NEXT:    vl1re64.v v18, (a6)
+; CHECK-NEXT:    vl1re64.v v11, (a7)
+; CHECK-NEXT:    vl1re64.v v19, (t0)
+; CHECK-NEXT:    vl1re64.v v8, (a0)
+; CHECK-NEXT:    vl1re64.v v16, (a1)
+; CHECK-NEXT:    vl1re64.v v9, (a2)
+; CHECK-NEXT:    vl1re64.v v17, (a4)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    slli a0, a0, 4
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
+;
+; ZVBB-LABEL: vector_interleave_nxv16f64_nxv2f64:
+; ZVBB:       # %bb.0:
+; ZVBB-NEXT:    addi sp, sp, -16
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    sub sp, sp, a0
+; ZVBB-NEXT:    vsetvli a0, zero, e64, m1, ta, ma
+; ZVBB-NEXT:    vmv2r.v v28, v22
+; ZVBB-NEXT:    addi a0, sp, 16
+; ZVBB-NEXT:    vmv2r.v v26, v18
+; ZVBB-NEXT:    csrr a1, vlenb
+; ZVBB-NEXT:    slli a1, a1, 3
+; ZVBB-NEXT:    add a1, sp, a1
+; ZVBB-NEXT:    addi a1, a1, 16
+; ZVBB-NEXT:    csrr a3, vlenb
+; ZVBB-NEXT:    add a2, a0, a3
+; ZVBB-NEXT:    add a4, a1, a3
+; ZVBB-NEXT:    add a5, a2, a3
+; ZVBB-NEXT:    vmv1r.v v1, v8
+; ZVBB-NEXT:    vmv2r.v v24, v14
+; ZVBB-NEXT:    add a6, a4, a3
+; ZVBB-NEXT:    vmv2r.v v22, v10
+; ZVBB-NEXT:    vmv1r.v v2, v22
+; ZVBB-NEXT:    add a7, a5, a3
+; ZVBB-NEXT:    vmv1r.v v3, v12
+; ZVBB-NEXT:    add t0, a6, a3
+; ZVBB-NEXT:    vmv1r.v v4, v24
+; ZVBB-NEXT:    add t1, a7, a3
+; ZVBB-NEXT:    vmv1r.v v5, v16
+; ZVBB-NEXT:    add t2, t0, a3
+; ZVBB-NEXT:    vmv1r.v v6, v26
+; ZVBB-NEXT:    add t3, t1, a3
+; ZVBB-NEXT:    vmv1r.v v7, v20
+; ZVBB-NEXT:    add t4, t2, a3
+; ZVBB-NEXT:    vmv1r.v v8, v28
+; ZVBB-NEXT:    vmv1r.v v22, v9
+; ZVBB-NEXT:    add t5, t3, a3
+; ZVBB-NEXT:    vmv1r.v v24, v13
+; ZVBB-NEXT:    add t6, t4, a3
+; ZVBB-NEXT:    vmv1r.v v26, v17
+; ZVBB-NEXT:    vsseg8e64.v v1, (a0)
+; ZVBB-NEXT:    vmv1r.v v28, v21
+; ZVBB-NEXT:    vsseg8e64.v v22, (a1)
+; ZVBB-NEXT:    vl1re64.v v14, (t5)
+; ZVBB-NEXT:    add t5, t5, a3
+; ZVBB-NEXT:    add a3, t6, a3
+; ZVBB-NEXT:    vl1re64.v v22, (t6)
+; ZVBB-NEXT:    vl1re64.v v15, (t5)
+; ZVBB-NEXT:    vl1re64.v v23, (a3)
+; ZVBB-NEXT:    vl1re64.v v12, (t1)
+; ZVBB-NEXT:    vl1re64.v v20, (t2)
+; ZVBB-NEXT:    vl1re64.v v13, (t3)
+; ZVBB-NEXT:    vl1re64.v v21, (t4)
+; ZVBB-NEXT:    vl1re64.v v10, (a5)
+; ZVBB-NEXT:    vl1re64.v v18, (a6)
+; ZVBB-NEXT:    vl1re64.v v11, (a7)
+; ZVBB-NEXT:    vl1re64.v v19, (t0)
+; ZVBB-NEXT:    vl1re64.v v8, (a0)
+; ZVBB-NEXT:    vl1re64.v v16, (a1)
+; ZVBB-NEXT:    vl1re64.v v9, (a2)
+; ZVBB-NEXT:    vl1re64.v v17, (a4)
+; ZVBB-NEXT:    csrr a0, vlenb
+; ZVBB-NEXT:    slli a0, a0, 4
+; ZVBB-NEXT:    add sp, sp, a0
+; ZVBB-NEXT:    addi sp, sp, 16
+; ZVBB-NEXT:    ret
+  %res = call <vscale x 16 x double> @llvm.vector.interleave8.nxv16f64(<vscale x 2 x double> %v0, <vscale x 2 x double> %v1, <vscale x 2 x double> %v2, <vscale x 2 x double> %v3, <vscale x 2 x double> %v4, <vscale x 2 x double> %v5, <vscale x 2 x double> %v6, <vscale x 2 x double> %v7)
+  ret <vscale x 16 x double> %res
 }

>From 777ccf859ec87e5d2f5d3aad36cbcf920ea1b648 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 21 May 2025 12:38:13 +0100
Subject: [PATCH 2/6] Add nounwind to avoid cfi directives

---
 .../RISCV/rvv/vector-deinterleave-fixed.ll    | 106 +----------
 .../RISCV/rvv/vector-interleave-fixed.ll      | 168 ++----------------
 2 files changed, 18 insertions(+), 256 deletions(-)
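
The reason this helps: a function marked nounwind (and without an uwtable
attribute) needs no unwind information, so llc does not emit the .cfi_*
directives that otherwise pad every prologue/epilogue in the autogenerated
checks. A minimal sketch of the pattern, using a hypothetical function that is
not taken from these tests:

define <4 x i32> @sketch(<2 x i32> %a, <2 x i32> %b) nounwind {
  ; nounwind => no .cfi_def_cfa_offset/.cfi_offset/.cfi_restore in the output;
  ; only the actual stack adjustment and vector instructions remain in the
  ; CHECK lines.
  %res = call <4 x i32> @llvm.vector.interleave2.v4i32(<2 x i32> %a, <2 x i32> %b)
  ret <4 x i32> %res
}
declare <4 x i32> @llvm.vector.interleave2.v4i32(<2 x i32>, <2 x i32>)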

diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index a3ad0b26efd4d..b2bc01e3c5174 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -189,15 +189,13 @@ define {<8 x i64>, <8 x i64>} @vector_deinterleave_v8i64_v16i64(<16 x i64> %vec)
   ret {<8 x i64>, <8 x i64>} %retval
 }
 
-define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x i32> %v) {
+define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x i32> %v) nounwind {
 ; CHECK-LABEL: vector_deinterleave3_v2i32_v6i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v12, v8, 2
@@ -215,23 +213,19 @@ define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 	   %res = call {<2 x i32>, <2 x i32>, <2 x i32>} @llvm.vector.deinterleave3.v6i32(<6 x i32> %v)
 	   ret {<2 x i32>, <2 x i32>, <2 x i32>} %res
 }
 
-define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v8i32(<8 x i32> %v) {
+define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v8i32(<8 x i32> %v) nounwind {
 ; CHECK-LABEL: vector_deinterleave3_v2i32_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 2, e32, m2, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v10, v8, 6
@@ -251,23 +245,19 @@ define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 	   %res = call {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @llvm.vector.deinterleave4.v8i32(<8 x i32> %v)
 	   ret {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} %res
 }
 
-define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave5_v2i16_v10i16(<10 x i16> %v) {
+define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave5_v2i16_v10i16(<10 x i16> %v) nounwind {
 ; CHECK-LABEL: vector_deinterleave5_v2i16_v10i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v12, v8, 6
@@ -292,23 +282,19 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterle
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 	   %res = call {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @llvm.vector.deinterleave5.v10i16(<10 x i16> %v)
 	   ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res
 }
 
-define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave6_v2i16_v12i16(<12 x i16> %v) {
+define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vector_deinterleave6_v2i16_v12i16(<12 x i16> %v) nounwind {
 ; CHECK-LABEL: vector_deinterleave6_v2i16_v12i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    vsetivli zero, 2, e16, m1, ta, ma
 ; CHECK-NEXT:    vslidedown.vi v14, v8, 6
@@ -335,29 +321,22 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vecto
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 	   %res = call {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @llvm.vector.deinterleave6.v12i16(<12 x i16> %v)
 	   ret {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} %res
 }
 
-define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave7_v14i8_v2i8(<14 x i8> %v) {
+define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave7_v14i8_v2i8(<14 x i8> %v) nounwind {
 ; RV32-LABEL: vector_deinterleave7_v14i8_v2i8:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    .cfi_offset s1, -12
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 2
 ; RV32-NEXT:    sub sp, sp, a0
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 4 * vlenb
 ; RV32-NEXT:    addi a0, sp, 32
 ; RV32-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
 ; RV32-NEXT:    csrr s1, vlenb
@@ -424,31 +403,21 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @v
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a0, a0, 2
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    .cfi_restore ra
-; RV32-NEXT:    .cfi_restore s0
-; RV32-NEXT:    .cfi_restore s1
 ; RV32-NEXT:    addi sp, sp, 48
-; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vector_deinterleave7_v14i8_v2i8:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    .cfi_def_cfa_offset 64
 ; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    .cfi_offset s1, -24
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 2
 ; RV64-NEXT:    sub sp, sp, a0
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 4 * vlenb
 ; RV64-NEXT:    addi a0, sp, 32
 ; RV64-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
 ; RV64-NEXT:    csrr s1, vlenb
@@ -515,31 +484,21 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @v
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a0, a0, 2
 ; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    .cfi_def_cfa sp, 64
 ; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT:    .cfi_restore ra
-; RV64-NEXT:    .cfi_restore s0
-; RV64-NEXT:    .cfi_restore s1
 ; RV64-NEXT:    addi sp, sp, 64
-; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
 ;
 ; ZIP-LABEL: vector_deinterleave7_v14i8_v2i8:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -64
-; ZIP-NEXT:    .cfi_def_cfa_offset 64
 ; ZIP-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
 ; ZIP-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
 ; ZIP-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
-; ZIP-NEXT:    .cfi_offset ra, -8
-; ZIP-NEXT:    .cfi_offset s0, -16
-; ZIP-NEXT:    .cfi_offset s1, -24
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 2
 ; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 4 * vlenb
 ; ZIP-NEXT:    addi a0, sp, 32
 ; ZIP-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
 ; ZIP-NEXT:    csrr s1, vlenb
@@ -606,42 +565,29 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @v
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 2
 ; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    .cfi_def_cfa sp, 64
 ; ZIP-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
-; ZIP-NEXT:    .cfi_restore ra
-; ZIP-NEXT:    .cfi_restore s0
-; ZIP-NEXT:    .cfi_restore s1
 ; ZIP-NEXT:    addi sp, sp, 64
-; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
 	   %res = call {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @llvm.vector.deinterleave7.v14i8(<14 x i8> %v)
 	   ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res
 }
 
-define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave8_v16i8_v2i8(<16 x i8> %v) {
+define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave8_v16i8_v2i8(<16 x i8> %v) nounwind {
 ; RV32-LABEL: vector_deinterleave8_v16i8_v2i8:
 ; RV32:       # %bb.0:
 ; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    .cfi_def_cfa_offset 48
 ; RV32-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
 ; RV32-NEXT:    sw s4, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    .cfi_offset ra, -4
-; RV32-NEXT:    .cfi_offset s0, -8
-; RV32-NEXT:    .cfi_offset s1, -12
-; RV32-NEXT:    .cfi_offset s2, -16
-; RV32-NEXT:    .cfi_offset s3, -20
-; RV32-NEXT:    .cfi_offset s4, -24
 ; RV32-NEXT:    csrr a0, vlenb
 ; RV32-NEXT:    slli a1, a0, 1
 ; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    sub sp, sp, a0
-; RV32-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 3 * vlenb
 ; RV32-NEXT:    addi a0, sp, 16
 ; RV32-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
 ; RV32-NEXT:    csrr s1, vlenb
@@ -701,44 +647,28 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2
 ; RV32-NEXT:    slli a1, a0, 1
 ; RV32-NEXT:    add a0, a1, a0
 ; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    .cfi_def_cfa sp, 48
 ; RV32-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s2, 32(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s3, 28(sp) # 4-byte Folded Reload
 ; RV32-NEXT:    lw s4, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    .cfi_restore ra
-; RV32-NEXT:    .cfi_restore s0
-; RV32-NEXT:    .cfi_restore s1
-; RV32-NEXT:    .cfi_restore s2
-; RV32-NEXT:    .cfi_restore s3
-; RV32-NEXT:    .cfi_restore s4
 ; RV32-NEXT:    addi sp, sp, 48
-; RV32-NEXT:    .cfi_def_cfa_offset 0
 ; RV32-NEXT:    ret
 ;
 ; RV64-LABEL: vector_deinterleave8_v16i8_v2i8:
 ; RV64:       # %bb.0:
 ; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    .cfi_def_cfa_offset 64
 ; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s2, 32(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
 ; RV64-NEXT:    sd s4, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT:    .cfi_offset ra, -8
-; RV64-NEXT:    .cfi_offset s0, -16
-; RV64-NEXT:    .cfi_offset s1, -24
-; RV64-NEXT:    .cfi_offset s2, -32
-; RV64-NEXT:    .cfi_offset s3, -40
-; RV64-NEXT:    .cfi_offset s4, -48
 ; RV64-NEXT:    csrr a0, vlenb
 ; RV64-NEXT:    slli a1, a0, 1
 ; RV64-NEXT:    add a0, a1, a0
 ; RV64-NEXT:    sub sp, sp, a0
-; RV64-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 3 * vlenb
 ; RV64-NEXT:    addi a0, sp, 16
 ; RV64-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
 ; RV64-NEXT:    csrr s1, vlenb
@@ -798,44 +728,28 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2
 ; RV64-NEXT:    slli a1, a0, 1
 ; RV64-NEXT:    add a0, a1, a0
 ; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    .cfi_def_cfa sp, 64
 ; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s2, 32(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s3, 24(sp) # 8-byte Folded Reload
 ; RV64-NEXT:    ld s4, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT:    .cfi_restore ra
-; RV64-NEXT:    .cfi_restore s0
-; RV64-NEXT:    .cfi_restore s1
-; RV64-NEXT:    .cfi_restore s2
-; RV64-NEXT:    .cfi_restore s3
-; RV64-NEXT:    .cfi_restore s4
 ; RV64-NEXT:    addi sp, sp, 64
-; RV64-NEXT:    .cfi_def_cfa_offset 0
 ; RV64-NEXT:    ret
 ;
 ; ZIP-LABEL: vector_deinterleave8_v16i8_v2i8:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -64
-; ZIP-NEXT:    .cfi_def_cfa_offset 64
 ; ZIP-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
 ; ZIP-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
 ; ZIP-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
 ; ZIP-NEXT:    sd s2, 32(sp) # 8-byte Folded Spill
 ; ZIP-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
 ; ZIP-NEXT:    sd s4, 16(sp) # 8-byte Folded Spill
-; ZIP-NEXT:    .cfi_offset ra, -8
-; ZIP-NEXT:    .cfi_offset s0, -16
-; ZIP-NEXT:    .cfi_offset s1, -24
-; ZIP-NEXT:    .cfi_offset s2, -32
-; ZIP-NEXT:    .cfi_offset s3, -40
-; ZIP-NEXT:    .cfi_offset s4, -48
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a1, a0, 1
 ; ZIP-NEXT:    add a0, a1, a0
 ; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x03, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 3 * vlenb
 ; ZIP-NEXT:    addi a0, sp, 16
 ; ZIP-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
 ; ZIP-NEXT:    csrr s1, vlenb
@@ -895,21 +809,13 @@ define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2
 ; ZIP-NEXT:    slli a1, a0, 1
 ; ZIP-NEXT:    add a0, a1, a0
 ; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    .cfi_def_cfa sp, 64
 ; ZIP-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s2, 32(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s3, 24(sp) # 8-byte Folded Reload
 ; ZIP-NEXT:    ld s4, 16(sp) # 8-byte Folded Reload
-; ZIP-NEXT:    .cfi_restore ra
-; ZIP-NEXT:    .cfi_restore s0
-; ZIP-NEXT:    .cfi_restore s1
-; ZIP-NEXT:    .cfi_restore s2
-; ZIP-NEXT:    .cfi_restore s3
-; ZIP-NEXT:    .cfi_restore s4
 ; ZIP-NEXT:    addi sp, sp, 64
-; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
 	   %res = call {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @llvm.vector.deinterleave8.v16i8(<16 x i8> %v)
 	   ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
index faf7903c21614..3dc83d50ee3f3 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-interleave-fixed.ll
@@ -167,15 +167,13 @@ define <4 x i64> @vector_interleave_v4i64_v2i64(<2 x i64> %a, <2 x i64> %b) {
 	   ret <4 x i64> %res
 }
 
-define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) {
+define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind {
 ; CHECK-LABEL: vector_interleave3_v6i32_v2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 1
@@ -193,19 +191,15 @@ define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave3_v6i32_v2i32:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 16
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a1, a1, 1
@@ -223,19 +217,15 @@ define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    .cfi_def_cfa sp, 16
 ; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 0
 ; ZVBB-NEXT:    ret
 ;
 ; ZIP-LABEL: vector_interleave3_v6i32_v2i32:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -16
-; ZIP-NEXT:    .cfi_def_cfa_offset 16
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZIP-NEXT:    addi a0, sp, 16
 ; ZIP-NEXT:    csrr a1, vlenb
 ; ZIP-NEXT:    srli a1, a1, 1
@@ -253,23 +243,19 @@ define <6 x i32> @vector_interleave3_v6i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    .cfi_def_cfa sp, 16
 ; ZIP-NEXT:    addi sp, sp, 16
-; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
 	   %res = call <6 x i32> @llvm.vector.interleave3.v6i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c)
 	   ret <6 x i32> %res
 }
 
-define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
+define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) nounwind {
 ; CHECK-LABEL: vector_interleave4_v8i32_v2i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 1
@@ -290,19 +276,15 @@ define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave4_v8i32_v2i32:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 16
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a1, a1, 1
@@ -323,19 +305,15 @@ define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    .cfi_def_cfa sp, 16
 ; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 0
 ; ZVBB-NEXT:    ret
 ;
 ; ZIP-LABEL: vector_interleave4_v8i32_v2i32:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -16
-; ZIP-NEXT:    .cfi_def_cfa_offset 16
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZIP-NEXT:    addi a0, sp, 16
 ; ZIP-NEXT:    csrr a1, vlenb
 ; ZIP-NEXT:    srli a1, a1, 1
@@ -356,23 +334,19 @@ define <8 x i32> @vector_interleave4_v8i32_v2i32(<2 x i32> %a, <2 x i32> %b, <2
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    .cfi_def_cfa sp, 16
 ; ZIP-NEXT:    addi sp, sp, 16
-; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
 	   %res = call <8 x i32> @llvm.vector.interleave4.v8i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d)
 	   ret <8 x i32> %res
 }
 
-define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e) {
+define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e) nounwind {
 ; CHECK-LABEL: vector_interleave5_v10i16_v2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 2
@@ -397,19 +371,15 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave5_v10i16_v2i16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 16
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a1, a1, 2
@@ -434,19 +404,15 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    .cfi_def_cfa sp, 16
 ; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 0
 ; ZVBB-NEXT:    ret
 ;
 ; ZIP-LABEL: vector_interleave5_v10i16_v2i16:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -16
-; ZIP-NEXT:    .cfi_def_cfa_offset 16
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZIP-NEXT:    addi a0, sp, 16
 ; ZIP-NEXT:    csrr a1, vlenb
 ; ZIP-NEXT:    srli a1, a1, 2
@@ -471,23 +437,19 @@ define <10 x i16> @vector_interleave5_v10i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    .cfi_def_cfa sp, 16
 ; ZIP-NEXT:    addi sp, sp, 16
-; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
 	   %res = call <10 x i16> @llvm.vector.interleave5.v10i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e)
 	   ret <10 x i16> %res
 }
 
-define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e, <2 x i16> %f) {
+define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e, <2 x i16> %f) nounwind {
 ; CHECK-LABEL: vector_interleave6_v12i16_v2i16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 2
@@ -515,19 +477,15 @@ define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave6_v12i16_v2i16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 16
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a1, a1, 2
@@ -555,19 +513,15 @@ define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    .cfi_def_cfa sp, 16
 ; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 0
 ; ZVBB-NEXT:    ret
 ;
 ; ZIP-LABEL: vector_interleave6_v12i16_v2i16:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -16
-; ZIP-NEXT:    .cfi_def_cfa_offset 16
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZIP-NEXT:    addi a0, sp, 16
 ; ZIP-NEXT:    csrr a1, vlenb
 ; ZIP-NEXT:    srli a1, a1, 2
@@ -595,22 +549,18 @@ define <12 x i16> @vector_interleave6_v12i16_v2i16(<2 x i16> %a, <2 x i16> %b, <
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    .cfi_def_cfa sp, 16
 ; ZIP-NEXT:    addi sp, sp, 16
-; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
 	   %res = call <12 x i16> @llvm.vector.interleave6.v12i16(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c, <2 x i16> %d, <2 x i16> %e, <2 x i16> %f)
 	   ret <12 x i16> %res
 }
 
-define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g) {
+define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g) nounwind {
 ; CHECK-LABEL: vector_interleave7_v14i8_v2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 3
@@ -641,18 +591,14 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
 ; CHECK-NEXT:    vslideup.vi v8, v12, 8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave7_v14i8_v2i8:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 16
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a1, a1, 3
@@ -683,18 +629,14 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
 ; ZVBB-NEXT:    vslideup.vi v8, v12, 8
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    .cfi_def_cfa sp, 16
 ; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 0
 ; ZVBB-NEXT:    ret
 ;
 ; ZIP-LABEL: vector_interleave7_v14i8_v2i8:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -16
-; ZIP-NEXT:    .cfi_def_cfa_offset 16
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
 ; ZIP-NEXT:    addi a0, sp, 16
 ; ZIP-NEXT:    csrr a1, vlenb
 ; ZIP-NEXT:    srli a1, a1, 3
@@ -725,22 +667,18 @@ define <14 x i8> @vector_interleave7_v14i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
 ; ZIP-NEXT:    vslideup.vi v8, v12, 8
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    .cfi_def_cfa sp, 16
 ; ZIP-NEXT:    addi sp, sp, 16
-; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
 	   %res = call <14 x i8> @llvm.vector.interleave7.v14i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g)
 	   ret <14 x i8> %res
 }
 
-define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g, <2 x i8> %h) {
+define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g, <2 x i8> %h) nounwind {
 ; CHECK-LABEL: vector_interleave8_v16i8_v2i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 3
@@ -774,18 +712,14 @@ define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
 ; CHECK-NEXT:    vslideup.vi v8, v10, 8
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave8_v16i8_v2i8:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 16
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a1, a1, 3
@@ -819,18 +753,14 @@ define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
 ; ZVBB-NEXT:    vslideup.vi v8, v10, 8
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    .cfi_def_cfa sp, 16
 ; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 0
 ; ZVBB-NEXT:    ret
 ;
 ; ZIP-LABEL: vector_interleave8_v16i8_v2i8:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -16
-; ZIP-NEXT:    .cfi_def_cfa_offset 16
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 1 * vlenb
 ; ZIP-NEXT:    addi a0, sp, 16
 ; ZIP-NEXT:    csrr a1, vlenb
 ; ZIP-NEXT:    srli a1, a1, 3
@@ -864,9 +794,7 @@ define <16 x i8> @vector_interleave8_v16i8_v2i8(<2 x i8> %a, <2 x i8> %b, <2 x i
 ; ZIP-NEXT:    vslideup.vi v8, v10, 8
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    .cfi_def_cfa sp, 16
 ; ZIP-NEXT:    addi sp, sp, 16
-; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
 	   %res = call <16 x i8> @llvm.vector.interleave8.v16i8(<2 x i8> %a, <2 x i8> %b, <2 x i8> %c, <2 x i8> %d, <2 x i8> %e, <2 x i8> %f, <2 x i8> %g, <2 x i8> %h)
 	   ret <16 x i8> %res
@@ -1064,15 +992,13 @@ define <4 x double> @vector_interleave_v4f64_v2f64(<2 x double> %a, <2 x double>
 	   ret <4 x double> %res
 }
 
-define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
+define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c) nounwind {
 ; CHECK-LABEL: vector_interleave3_v6f32_v2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 1
@@ -1090,19 +1016,15 @@ define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> %
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave3_v6f32_v2f32:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 16
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a1, a1, 1
@@ -1120,19 +1042,15 @@ define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> %
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    .cfi_def_cfa sp, 16
 ; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 0
 ; ZVBB-NEXT:    ret
 ;
 ; ZIP-LABEL: vector_interleave3_v6f32_v2f32:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -16
-; ZIP-NEXT:    .cfi_def_cfa_offset 16
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZIP-NEXT:    addi a0, sp, 16
 ; ZIP-NEXT:    csrr a1, vlenb
 ; ZIP-NEXT:    srli a1, a1, 1
@@ -1150,23 +1068,19 @@ define <6 x float> @vector_interleave3_v6f32_v2f32(<2 x float> %a, <2 x float> %
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    .cfi_def_cfa sp, 16
 ; ZIP-NEXT:    addi sp, sp, 16
-; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
 	   %res = call <6 x float> @llvm.vector.interleave3.v6f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
 	   ret <6 x float> %res
 }
 
-define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) {
+define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d) nounwind {
 ; CHECK-LABEL: vector_interleave4_v8f32_v2f32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 1
@@ -1187,19 +1101,15 @@ define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> %
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave4_v8f32_v2f32:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 16
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a1, a1, 1
@@ -1220,19 +1130,15 @@ define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> %
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    .cfi_def_cfa sp, 16
 ; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 0
 ; ZVBB-NEXT:    ret
 ;
 ; ZIP-LABEL: vector_interleave4_v8f32_v2f32:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -16
-; ZIP-NEXT:    .cfi_def_cfa_offset 16
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZIP-NEXT:    addi a0, sp, 16
 ; ZIP-NEXT:    csrr a1, vlenb
 ; ZIP-NEXT:    srli a1, a1, 1
@@ -1253,23 +1159,19 @@ define <8 x float> @vector_interleave4_v8f32_v2f32(<2 x float> %a, <2 x float> %
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    .cfi_def_cfa sp, 16
 ; ZIP-NEXT:    addi sp, sp, 16
-; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
 	   %res = call <8 x float> @llvm.vector.interleave4.v8f32(<2 x float> %a, <2 x float> %b, <2 x float> %c, <2 x float> %d)
 	   ret <8 x float> %res
 }
 
-define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e) {
+define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e) nounwind {
 ; CHECK-LABEL: vector_interleave5_v10f16_v2f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 2
@@ -1294,19 +1196,15 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave5_v10f16_v2f16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 16
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a1, a1, 2
@@ -1331,19 +1229,15 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    .cfi_def_cfa sp, 16
 ; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 0
 ; ZVBB-NEXT:    ret
 ;
 ; ZIP-LABEL: vector_interleave5_v10f16_v2f16:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -16
-; ZIP-NEXT:    .cfi_def_cfa_offset 16
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZIP-NEXT:    addi a0, sp, 16
 ; ZIP-NEXT:    csrr a1, vlenb
 ; ZIP-NEXT:    srli a1, a1, 2
@@ -1368,23 +1262,19 @@ define <10 x half> @vector_interleave5_v10f16_v2f16(<2 x half> %a, <2 x half> %b
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    .cfi_def_cfa sp, 16
 ; ZIP-NEXT:    addi sp, sp, 16
-; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
 	   %res = call <10 x half> @llvm.vector.interleave5.v10f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e)
 	   ret <10 x half> %res
 }
 
-define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e, <2 x half> %f) {
+define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e, <2 x half> %f) nounwind {
 ; CHECK-LABEL: vector_interleave6_v12f16_v2f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 2
@@ -1412,19 +1302,15 @@ define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave6_v12f16_v2f16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 16
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a1, a1, 2
@@ -1452,19 +1338,15 @@ define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    .cfi_def_cfa sp, 16
 ; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 0
 ; ZVBB-NEXT:    ret
 ;
 ; ZIP-LABEL: vector_interleave6_v12f16_v2f16:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -16
-; ZIP-NEXT:    .cfi_def_cfa_offset 16
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZIP-NEXT:    addi a0, sp, 16
 ; ZIP-NEXT:    csrr a1, vlenb
 ; ZIP-NEXT:    srli a1, a1, 2
@@ -1492,23 +1374,19 @@ define <12 x half> @vector_interleave6_v12f16_v2f16(<2 x half> %a, <2 x half> %b
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    .cfi_def_cfa sp, 16
 ; ZIP-NEXT:    addi sp, sp, 16
-; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
 	   %res = call <12 x half> @llvm.vector.interleave6.v12f16(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d, <2 x half> %e, <2 x half> %f)
 	   ret <12 x half> %res
 }
 
-define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g) {
+define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g) nounwind {
 ; CHECK-LABEL: vector_interleave7_v7f16_v1f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 2
@@ -1540,19 +1418,15 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b,
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave7_v7f16_v1f16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 16
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a1, a1, 2
@@ -1584,19 +1458,15 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b,
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    .cfi_def_cfa sp, 16
 ; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 0
 ; ZVBB-NEXT:    ret
 ;
 ; ZIP-LABEL: vector_interleave7_v7f16_v1f16:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -16
-; ZIP-NEXT:    .cfi_def_cfa_offset 16
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZIP-NEXT:    addi a0, sp, 16
 ; ZIP-NEXT:    csrr a1, vlenb
 ; ZIP-NEXT:    srli a1, a1, 2
@@ -1628,23 +1498,19 @@ define <7 x half> @vector_interleave7_v7f16_v1f16(<1 x half> %a, <1 x half> %b,
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    .cfi_def_cfa sp, 16
 ; ZIP-NEXT:    addi sp, sp, 16
-; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
 	   %res = call <7 x half> @llvm.vector.interleave7.v7f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g)
 	   ret <7 x half> %res
 }
 
-define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g, <1 x half> %h) {
+define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g, <1 x half> %h) nounwind {
 ; CHECK-LABEL: vector_interleave8_v8f16_v1f16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
-; CHECK-NEXT:    .cfi_def_cfa_offset 16
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    sub sp, sp, a0
-; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; CHECK-NEXT:    addi a0, sp, 16
 ; CHECK-NEXT:    csrr a1, vlenb
 ; CHECK-NEXT:    srli a1, a1, 2
@@ -1679,19 +1545,15 @@ define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b,
 ; CHECK-NEXT:    csrr a0, vlenb
 ; CHECK-NEXT:    slli a0, a0, 1
 ; CHECK-NEXT:    add sp, sp, a0
-; CHECK-NEXT:    .cfi_def_cfa sp, 16
 ; CHECK-NEXT:    addi sp, sp, 16
-; CHECK-NEXT:    .cfi_def_cfa_offset 0
 ; CHECK-NEXT:    ret
 ;
 ; ZVBB-LABEL: vector_interleave8_v8f16_v1f16:
 ; ZVBB:       # %bb.0:
 ; ZVBB-NEXT:    addi sp, sp, -16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 16
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    sub sp, sp, a0
-; ZVBB-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZVBB-NEXT:    addi a0, sp, 16
 ; ZVBB-NEXT:    csrr a1, vlenb
 ; ZVBB-NEXT:    srli a1, a1, 2
@@ -1726,19 +1588,15 @@ define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b,
 ; ZVBB-NEXT:    csrr a0, vlenb
 ; ZVBB-NEXT:    slli a0, a0, 1
 ; ZVBB-NEXT:    add sp, sp, a0
-; ZVBB-NEXT:    .cfi_def_cfa sp, 16
 ; ZVBB-NEXT:    addi sp, sp, 16
-; ZVBB-NEXT:    .cfi_def_cfa_offset 0
 ; ZVBB-NEXT:    ret
 ;
 ; ZIP-LABEL: vector_interleave8_v8f16_v1f16:
 ; ZIP:       # %bb.0:
 ; ZIP-NEXT:    addi sp, sp, -16
-; ZIP-NEXT:    .cfi_def_cfa_offset 16
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
 ; ZIP-NEXT:    addi a0, sp, 16
 ; ZIP-NEXT:    csrr a1, vlenb
 ; ZIP-NEXT:    srli a1, a1, 2
@@ -1773,9 +1631,7 @@ define <8 x half> @vector_interleave8_v8f16_v1f16(<1 x half> %a, <1 x half> %b,
 ; ZIP-NEXT:    csrr a0, vlenb
 ; ZIP-NEXT:    slli a0, a0, 1
 ; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    .cfi_def_cfa sp, 16
 ; ZIP-NEXT:    addi sp, sp, 16
-; ZIP-NEXT:    .cfi_def_cfa_offset 0
 ; ZIP-NEXT:    ret
 	   %res = call <8 x half> @llvm.vector.interleave8.v8f16(<1 x half> %a, <1 x half> %b, <1 x half> %c, <1 x half> %d, <1 x half> %e, <1 x half> %f, <1 x half> %g, <1 x half> %h)
 	   ret <8 x half> %res

>From d0166d963c4f9b5361f6b5c8b714116ba0d80bbf Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 21 May 2025 12:39:10 +0100
Subject: [PATCH 3/6] Fix test name

---
 llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index b2bc01e3c5174..2bf96b4fdf39b 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -219,8 +219,8 @@ define {<2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v6i32(<6 x
 	   ret {<2 x i32>, <2 x i32>, <2 x i32>} %res
 }
 
-define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave3_v2i32_v8i32(<8 x i32> %v) nounwind {
-; CHECK-LABEL: vector_deinterleave3_v2i32_v8i32:
+define {<2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>} @vector_deinterleave4_v2i32_v8i32(<8 x i32> %v) nounwind {
+; CHECK-LABEL: vector_deinterleave4_v2i32_v8i32:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    addi sp, sp, -16
 ; CHECK-NEXT:    csrr a0, vlenb

>From fdbcca443c3c0732b5453c3ebc24a91e06afdb70 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 21 May 2025 12:43:20 +0100
Subject: [PATCH 4/6] Use +m

---
 .../RISCV/rvv/vector-deinterleave-fixed.ll    | 575 +++---------------
 1 file changed, 88 insertions(+), 487 deletions(-)
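
The +m attribute matters because these deinterleave tests scale a
vlenb-derived offset by a small constant, and without the M extension that
multiply is lowered to a compiler-rt libcall. A rough sketch of the difference
(illustrative, not autogenerated output):

; Without +m the scaled offset needs a libcall, forcing ra/s0/s1 spills:
;   li a1, 3
;   mv a0, s0
;   call __mulsi3          (RV32; __muldi3 on RV64)
; With +m the product should stay inline (a single mul or shift-and-add), so
; the call and the surrounding spills go away and the RV32/RV64 check blocks
; can collapse into a common prefix.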

diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
index 2bf96b4fdf39b..aab2f08277831 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh | FileCheck %s --check-prefixes=CHECK,V,RV32
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh | FileCheck %s --check-prefixes=CHECK,V,RV64
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh,+experimental-xrivosvizip | FileCheck %s --check-prefixes=CHECK,ZIP
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+m,+zvfh | FileCheck %s --check-prefixes=CHECK,V,RV32
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+m,+zvfh | FileCheck %s --check-prefixes=CHECK,V,RV64
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+m,+zvfh,+experimental-xrivosvizip | FileCheck %s --check-prefixes=CHECK,ZIP
 
 ; Integers
 
@@ -328,495 +328,93 @@ define {<2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>, <2 x i16>} @vecto
 }
 
 define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave7_v14i8_v2i8(<14 x i8> %v) nounwind {
-; RV32-LABEL: vector_deinterleave7_v14i8_v2i8:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 2
-; RV32-NEXT:    sub sp, sp, a0
-; RV32-NEXT:    addi a0, sp, 32
-; RV32-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT:    csrr s1, vlenb
-; RV32-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v11, v8, 10
-; RV32-NEXT:    vslidedown.vi v10, v8, 8
-; RV32-NEXT:    vslidedown.vi v9, v8, 2
-; RV32-NEXT:    srli s0, s1, 3
-; RV32-NEXT:    add a0, s0, s0
-; RV32-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
-; RV32-NEXT:    vslideup.vx v10, v11, s0
-; RV32-NEXT:    vmv1r.v v11, v8
-; RV32-NEXT:    vslideup.vx v11, v9, s0
-; RV32-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v8, 12
-; RV32-NEXT:    srli a0, s1, 2
-; RV32-NEXT:    add a1, a0, s0
-; RV32-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
-; RV32-NEXT:    vslideup.vx v10, v9, a0
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    slli a2, a2, 1
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 32
-; RV32-NEXT:    vs1r.v v10, (a2) # vscale x 8-byte Folded Spill
-; RV32-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v9, v8, 4
-; RV32-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
-; RV32-NEXT:    vslideup.vx v11, v9, a0
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 32
-; RV32-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT:    li a1, 3
-; RV32-NEXT:    mv a0, s0
-; RV32-NEXT:    call __mulsi3
-; RV32-NEXT:    add s0, a0, s0
-; RV32-NEXT:    addi a1, sp, 32
-; RV32-NEXT:    vl1r.v v8, (a1) # vscale x 8-byte Folded Reload
-; RV32-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v8, 6
-; RV32-NEXT:    srli s1, s1, 1
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 32
-; RV32-NEXT:    vl1r.v v9, (a1) # vscale x 8-byte Folded Reload
-; RV32-NEXT:    vsetvli zero, s0, e8, mf2, ta, ma
-; RV32-NEXT:    vslideup.vx v9, v8, a0
-; RV32-NEXT:    add a0, s1, s1
-; RV32-NEXT:    csrr a1, vlenb
-; RV32-NEXT:    slli a1, a1, 1
-; RV32-NEXT:    add a1, sp, a1
-; RV32-NEXT:    addi a1, a1, 32
-; RV32-NEXT:    vl1r.v v8, (a1) # vscale x 8-byte Folded Reload
-; RV32-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; RV32-NEXT:    vslideup.vx v9, v8, s1
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a1, a0, 1
-; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 32
-; RV32-NEXT:    vs1r.v v9, (a0)
-; RV32-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV32-NEXT:    vlseg7e8.v v8, (a0)
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 2
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 48
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vector_deinterleave7_v14i8_v2i8:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 2
-; RV64-NEXT:    sub sp, sp, a0
-; RV64-NEXT:    addi a0, sp, 32
-; RV64-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT:    csrr s1, vlenb
-; RV64-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v11, v8, 10
-; RV64-NEXT:    vslidedown.vi v10, v8, 8
-; RV64-NEXT:    vslidedown.vi v9, v8, 2
-; RV64-NEXT:    srli s0, s1, 3
-; RV64-NEXT:    add a0, s0, s0
-; RV64-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
-; RV64-NEXT:    vslideup.vx v10, v11, s0
-; RV64-NEXT:    vmv1r.v v11, v8
-; RV64-NEXT:    vslideup.vx v11, v9, s0
-; RV64-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 12
-; RV64-NEXT:    srli a0, s1, 2
-; RV64-NEXT:    add a1, a0, s0
-; RV64-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
-; RV64-NEXT:    vslideup.vx v10, v9, a0
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    slli a2, a2, 1
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 32
-; RV64-NEXT:    vs1r.v v10, (a2) # vscale x 8-byte Folded Spill
-; RV64-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v9, v8, 4
-; RV64-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
-; RV64-NEXT:    vslideup.vx v11, v9, a0
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 32
-; RV64-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT:    li a1, 3
-; RV64-NEXT:    mv a0, s0
-; RV64-NEXT:    call __muldi3
-; RV64-NEXT:    add s0, a0, s0
-; RV64-NEXT:    addi a1, sp, 32
-; RV64-NEXT:    vl1r.v v8, (a1) # vscale x 8-byte Folded Reload
-; RV64-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v8, 6
-; RV64-NEXT:    srli s1, s1, 1
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 32
-; RV64-NEXT:    vl1r.v v9, (a1) # vscale x 8-byte Folded Reload
-; RV64-NEXT:    vsetvli zero, s0, e8, mf2, ta, ma
-; RV64-NEXT:    vslideup.vx v9, v8, a0
-; RV64-NEXT:    add a0, s1, s1
-; RV64-NEXT:    csrr a1, vlenb
-; RV64-NEXT:    slli a1, a1, 1
-; RV64-NEXT:    add a1, sp, a1
-; RV64-NEXT:    addi a1, a1, 32
-; RV64-NEXT:    vl1r.v v8, (a1) # vscale x 8-byte Folded Reload
-; RV64-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; RV64-NEXT:    vslideup.vx v9, v8, s1
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a1, a0, 1
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 32
-; RV64-NEXT:    vs1r.v v9, (a0)
-; RV64-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV64-NEXT:    vlseg7e8.v v8, (a0)
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 2
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 64
-; RV64-NEXT:    ret
-;
-; ZIP-LABEL: vector_deinterleave7_v14i8_v2i8:
-; ZIP:       # %bb.0:
-; ZIP-NEXT:    addi sp, sp, -64
-; ZIP-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; ZIP-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
-; ZIP-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    slli a0, a0, 2
-; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    addi a0, sp, 32
-; ZIP-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; ZIP-NEXT:    csrr s1, vlenb
-; ZIP-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; ZIP-NEXT:    vslidedown.vi v11, v8, 10
-; ZIP-NEXT:    vslidedown.vi v10, v8, 8
-; ZIP-NEXT:    vslidedown.vi v9, v8, 2
-; ZIP-NEXT:    srli s0, s1, 3
-; ZIP-NEXT:    add a0, s0, s0
-; ZIP-NEXT:    vsetvli zero, a0, e8, mf2, tu, ma
-; ZIP-NEXT:    vslideup.vx v10, v11, s0
-; ZIP-NEXT:    vmv1r.v v11, v8
-; ZIP-NEXT:    vslideup.vx v11, v9, s0
-; ZIP-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; ZIP-NEXT:    vslidedown.vi v9, v8, 12
-; ZIP-NEXT:    srli a0, s1, 2
-; ZIP-NEXT:    add a1, a0, s0
-; ZIP-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
-; ZIP-NEXT:    vslideup.vx v10, v9, a0
-; ZIP-NEXT:    csrr a2, vlenb
-; ZIP-NEXT:    slli a2, a2, 1
-; ZIP-NEXT:    add a2, sp, a2
-; ZIP-NEXT:    addi a2, a2, 32
-; ZIP-NEXT:    vs1r.v v10, (a2) # vscale x 8-byte Folded Spill
-; ZIP-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; ZIP-NEXT:    vslidedown.vi v9, v8, 4
-; ZIP-NEXT:    vsetvli zero, a1, e8, mf2, tu, ma
-; ZIP-NEXT:    vslideup.vx v11, v9, a0
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    add a0, sp, a0
-; ZIP-NEXT:    addi a0, a0, 32
-; ZIP-NEXT:    vs1r.v v11, (a0) # vscale x 8-byte Folded Spill
-; ZIP-NEXT:    li a1, 3
-; ZIP-NEXT:    mv a0, s0
-; ZIP-NEXT:    call __muldi3
-; ZIP-NEXT:    add s0, a0, s0
-; ZIP-NEXT:    addi a1, sp, 32
-; ZIP-NEXT:    vl1r.v v8, (a1) # vscale x 8-byte Folded Reload
-; ZIP-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; ZIP-NEXT:    vslidedown.vi v8, v8, 6
-; ZIP-NEXT:    srli s1, s1, 1
-; ZIP-NEXT:    csrr a1, vlenb
-; ZIP-NEXT:    add a1, sp, a1
-; ZIP-NEXT:    addi a1, a1, 32
-; ZIP-NEXT:    vl1r.v v9, (a1) # vscale x 8-byte Folded Reload
-; ZIP-NEXT:    vsetvli zero, s0, e8, mf2, ta, ma
-; ZIP-NEXT:    vslideup.vx v9, v8, a0
-; ZIP-NEXT:    add a0, s1, s1
-; ZIP-NEXT:    csrr a1, vlenb
-; ZIP-NEXT:    slli a1, a1, 1
-; ZIP-NEXT:    add a1, sp, a1
-; ZIP-NEXT:    addi a1, a1, 32
-; ZIP-NEXT:    vl1r.v v8, (a1) # vscale x 8-byte Folded Reload
-; ZIP-NEXT:    vsetvli zero, a0, e8, m1, ta, ma
-; ZIP-NEXT:    vslideup.vx v9, v8, s1
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    slli a1, a0, 1
-; ZIP-NEXT:    add a0, a1, a0
-; ZIP-NEXT:    add a0, sp, a0
-; ZIP-NEXT:    addi a0, a0, 32
-; ZIP-NEXT:    vs1r.v v9, (a0)
-; ZIP-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; ZIP-NEXT:    vlseg7e8.v v8, (a0)
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    slli a0, a0, 2
-; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; ZIP-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; ZIP-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
-; ZIP-NEXT:    addi sp, sp, 64
-; ZIP-NEXT:    ret
+; CHECK-LABEL: vector_deinterleave7_v14i8_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v9, v8, 10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 8
+; CHECK-NEXT:    vslidedown.vi v11, v8, 12
+; CHECK-NEXT:    vslidedown.vi v12, v8, 2
+; CHECK-NEXT:    vslidedown.vi v13, v8, 4
+; CHECK-NEXT:    vslidedown.vi v14, v8, 6
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a2, a0, 2
+; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    add a3, a1, a1
+; CHECK-NEXT:    add a4, a2, a1
+; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vx v10, v9, a1
+; CHECK-NEXT:    vslideup.vx v8, v12, a1
+; CHECK-NEXT:    slli a3, a1, 1
+; CHECK-NEXT:    vsetvli zero, a4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vx v10, v11, a2
+; CHECK-NEXT:    vslideup.vx v8, v13, a2
+; CHECK-NEXT:    add a2, a0, a0
+; CHECK-NEXT:    add a3, a3, a1
+; CHECK-NEXT:    add a1, a3, a1
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v14, a3
+; CHECK-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs1r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vlseg7e8.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 	   %res = call {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @llvm.vector.deinterleave7.v14i8(<14 x i8> %v)
 	   ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res
 }
 
 define {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @vector_deinterleave8_v16i8_v2i8(<16 x i8> %v) nounwind {
-; RV32-LABEL: vector_deinterleave8_v16i8_v2i8:
-; RV32:       # %bb.0:
-; RV32-NEXT:    addi sp, sp, -48
-; RV32-NEXT:    sw ra, 44(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s0, 40(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s1, 36(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s2, 32(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s3, 28(sp) # 4-byte Folded Spill
-; RV32-NEXT:    sw s4, 24(sp) # 4-byte Folded Spill
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a1, a0, 1
-; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    sub sp, sp, a0
-; RV32-NEXT:    addi a0, sp, 16
-; RV32-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT:    csrr s1, vlenb
-; RV32-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 10
-; RV32-NEXT:    vslidedown.vi v9, v8, 8
-; RV32-NEXT:    srli s0, s1, 3
-; RV32-NEXT:    srli s2, s1, 2
-; RV32-NEXT:    add s3, s0, s0
-; RV32-NEXT:    add s4, s2, s0
-; RV32-NEXT:    vsetvli zero, s3, e8, mf2, tu, ma
-; RV32-NEXT:    vslideup.vx v9, v10, s0
-; RV32-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v10, v8, 12
-; RV32-NEXT:    vsetvli zero, s4, e8, mf2, tu, ma
-; RV32-NEXT:    vslideup.vx v9, v10, s2
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV32-NEXT:    li a1, 3
-; RV32-NEXT:    mv a0, s0
-; RV32-NEXT:    call __mulsi3
-; RV32-NEXT:    add a1, a0, s0
-; RV32-NEXT:    addi a2, sp, 16
-; RV32-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; RV32-NEXT:    vslidedown.vi v8, v12, 14
-; RV32-NEXT:    vslidedown.vi v9, v12, 2
-; RV32-NEXT:    vmv1r.v v10, v12
-; RV32-NEXT:    vslidedown.vi v11, v12, 4
-; RV32-NEXT:    vslidedown.vi v12, v12, 6
-; RV32-NEXT:    srli s1, s1, 1
-; RV32-NEXT:    csrr a2, vlenb
-; RV32-NEXT:    add a2, sp, a2
-; RV32-NEXT:    addi a2, a2, 16
-; RV32-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
-; RV32-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; RV32-NEXT:    vslideup.vx v13, v8, a0
-; RV32-NEXT:    vsetvli zero, s3, e8, mf2, tu, ma
-; RV32-NEXT:    vslideup.vx v10, v9, s0
-; RV32-NEXT:    add a2, s1, s1
-; RV32-NEXT:    vsetvli zero, s4, e8, mf2, tu, ma
-; RV32-NEXT:    vslideup.vx v10, v11, s2
-; RV32-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; RV32-NEXT:    vslideup.vx v10, v12, a0
-; RV32-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
-; RV32-NEXT:    vslideup.vx v10, v13, s1
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a0, a0, 1
-; RV32-NEXT:    add a0, sp, a0
-; RV32-NEXT:    addi a0, a0, 16
-; RV32-NEXT:    vs1r.v v10, (a0)
-; RV32-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV32-NEXT:    vlseg8e8.v v8, (a0)
-; RV32-NEXT:    csrr a0, vlenb
-; RV32-NEXT:    slli a1, a0, 1
-; RV32-NEXT:    add a0, a1, a0
-; RV32-NEXT:    add sp, sp, a0
-; RV32-NEXT:    lw ra, 44(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s0, 40(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s1, 36(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s2, 32(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s3, 28(sp) # 4-byte Folded Reload
-; RV32-NEXT:    lw s4, 24(sp) # 4-byte Folded Reload
-; RV32-NEXT:    addi sp, sp, 48
-; RV32-NEXT:    ret
-;
-; RV64-LABEL: vector_deinterleave8_v16i8_v2i8:
-; RV64:       # %bb.0:
-; RV64-NEXT:    addi sp, sp, -64
-; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s2, 32(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
-; RV64-NEXT:    sd s4, 16(sp) # 8-byte Folded Spill
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a1, a0, 1
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    sub sp, sp, a0
-; RV64-NEXT:    addi a0, sp, 16
-; RV64-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT:    csrr s1, vlenb
-; RV64-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 10
-; RV64-NEXT:    vslidedown.vi v9, v8, 8
-; RV64-NEXT:    srli s0, s1, 3
-; RV64-NEXT:    srli s2, s1, 2
-; RV64-NEXT:    add s3, s0, s0
-; RV64-NEXT:    add s4, s2, s0
-; RV64-NEXT:    vsetvli zero, s3, e8, mf2, tu, ma
-; RV64-NEXT:    vslideup.vx v9, v10, s0
-; RV64-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v10, v8, 12
-; RV64-NEXT:    vsetvli zero, s4, e8, mf2, tu, ma
-; RV64-NEXT:    vslideup.vx v9, v10, s2
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; RV64-NEXT:    li a1, 3
-; RV64-NEXT:    mv a0, s0
-; RV64-NEXT:    call __muldi3
-; RV64-NEXT:    add a1, a0, s0
-; RV64-NEXT:    addi a2, sp, 16
-; RV64-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; RV64-NEXT:    vslidedown.vi v8, v12, 14
-; RV64-NEXT:    vslidedown.vi v9, v12, 2
-; RV64-NEXT:    vmv1r.v v10, v12
-; RV64-NEXT:    vslidedown.vi v11, v12, 4
-; RV64-NEXT:    vslidedown.vi v12, v12, 6
-; RV64-NEXT:    srli s1, s1, 1
-; RV64-NEXT:    csrr a2, vlenb
-; RV64-NEXT:    add a2, sp, a2
-; RV64-NEXT:    addi a2, a2, 16
-; RV64-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
-; RV64-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; RV64-NEXT:    vslideup.vx v13, v8, a0
-; RV64-NEXT:    vsetvli zero, s3, e8, mf2, tu, ma
-; RV64-NEXT:    vslideup.vx v10, v9, s0
-; RV64-NEXT:    add a2, s1, s1
-; RV64-NEXT:    vsetvli zero, s4, e8, mf2, tu, ma
-; RV64-NEXT:    vslideup.vx v10, v11, s2
-; RV64-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; RV64-NEXT:    vslideup.vx v10, v12, a0
-; RV64-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
-; RV64-NEXT:    vslideup.vx v10, v13, s1
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a0, a0, 1
-; RV64-NEXT:    add a0, sp, a0
-; RV64-NEXT:    addi a0, a0, 16
-; RV64-NEXT:    vs1r.v v10, (a0)
-; RV64-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; RV64-NEXT:    vlseg8e8.v v8, (a0)
-; RV64-NEXT:    csrr a0, vlenb
-; RV64-NEXT:    slli a1, a0, 1
-; RV64-NEXT:    add a0, a1, a0
-; RV64-NEXT:    add sp, sp, a0
-; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s2, 32(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s3, 24(sp) # 8-byte Folded Reload
-; RV64-NEXT:    ld s4, 16(sp) # 8-byte Folded Reload
-; RV64-NEXT:    addi sp, sp, 64
-; RV64-NEXT:    ret
-;
-; ZIP-LABEL: vector_deinterleave8_v16i8_v2i8:
-; ZIP:       # %bb.0:
-; ZIP-NEXT:    addi sp, sp, -64
-; ZIP-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
-; ZIP-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
-; ZIP-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
-; ZIP-NEXT:    sd s2, 32(sp) # 8-byte Folded Spill
-; ZIP-NEXT:    sd s3, 24(sp) # 8-byte Folded Spill
-; ZIP-NEXT:    sd s4, 16(sp) # 8-byte Folded Spill
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    slli a1, a0, 1
-; ZIP-NEXT:    add a0, a1, a0
-; ZIP-NEXT:    sub sp, sp, a0
-; ZIP-NEXT:    addi a0, sp, 16
-; ZIP-NEXT:    vs1r.v v8, (a0) # vscale x 8-byte Folded Spill
-; ZIP-NEXT:    csrr s1, vlenb
-; ZIP-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; ZIP-NEXT:    vslidedown.vi v10, v8, 10
-; ZIP-NEXT:    vslidedown.vi v9, v8, 8
-; ZIP-NEXT:    srli s0, s1, 3
-; ZIP-NEXT:    srli s2, s1, 2
-; ZIP-NEXT:    add s3, s0, s0
-; ZIP-NEXT:    add s4, s2, s0
-; ZIP-NEXT:    vsetvli zero, s3, e8, mf2, tu, ma
-; ZIP-NEXT:    vslideup.vx v9, v10, s0
-; ZIP-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; ZIP-NEXT:    vslidedown.vi v10, v8, 12
-; ZIP-NEXT:    vsetvli zero, s4, e8, mf2, tu, ma
-; ZIP-NEXT:    vslideup.vx v9, v10, s2
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    add a0, sp, a0
-; ZIP-NEXT:    addi a0, a0, 16
-; ZIP-NEXT:    vs1r.v v9, (a0) # vscale x 8-byte Folded Spill
-; ZIP-NEXT:    li a1, 3
-; ZIP-NEXT:    mv a0, s0
-; ZIP-NEXT:    call __muldi3
-; ZIP-NEXT:    add a1, a0, s0
-; ZIP-NEXT:    addi a2, sp, 16
-; ZIP-NEXT:    vl1r.v v12, (a2) # vscale x 8-byte Folded Reload
-; ZIP-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
-; ZIP-NEXT:    vslidedown.vi v8, v12, 14
-; ZIP-NEXT:    vslidedown.vi v9, v12, 2
-; ZIP-NEXT:    vmv1r.v v10, v12
-; ZIP-NEXT:    vslidedown.vi v11, v12, 4
-; ZIP-NEXT:    vslidedown.vi v12, v12, 6
-; ZIP-NEXT:    srli s1, s1, 1
-; ZIP-NEXT:    csrr a2, vlenb
-; ZIP-NEXT:    add a2, sp, a2
-; ZIP-NEXT:    addi a2, a2, 16
-; ZIP-NEXT:    vl1r.v v13, (a2) # vscale x 8-byte Folded Reload
-; ZIP-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; ZIP-NEXT:    vslideup.vx v13, v8, a0
-; ZIP-NEXT:    vsetvli zero, s3, e8, mf2, tu, ma
-; ZIP-NEXT:    vslideup.vx v10, v9, s0
-; ZIP-NEXT:    add a2, s1, s1
-; ZIP-NEXT:    vsetvli zero, s4, e8, mf2, tu, ma
-; ZIP-NEXT:    vslideup.vx v10, v11, s2
-; ZIP-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
-; ZIP-NEXT:    vslideup.vx v10, v12, a0
-; ZIP-NEXT:    vsetvli zero, a2, e8, m1, ta, ma
-; ZIP-NEXT:    vslideup.vx v10, v13, s1
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    slli a0, a0, 1
-; ZIP-NEXT:    add a0, sp, a0
-; ZIP-NEXT:    addi a0, a0, 16
-; ZIP-NEXT:    vs1r.v v10, (a0)
-; ZIP-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
-; ZIP-NEXT:    vlseg8e8.v v8, (a0)
-; ZIP-NEXT:    csrr a0, vlenb
-; ZIP-NEXT:    slli a1, a0, 1
-; ZIP-NEXT:    add a0, a1, a0
-; ZIP-NEXT:    add sp, sp, a0
-; ZIP-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
-; ZIP-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
-; ZIP-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
-; ZIP-NEXT:    ld s2, 32(sp) # 8-byte Folded Reload
-; ZIP-NEXT:    ld s3, 24(sp) # 8-byte Folded Reload
-; ZIP-NEXT:    ld s4, 16(sp) # 8-byte Folded Reload
-; ZIP-NEXT:    addi sp, sp, 64
-; ZIP-NEXT:    ret
+; CHECK-LABEL: vector_deinterleave8_v16i8_v2i8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi sp, sp, -16
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    sub sp, sp, a0
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    vsetivli zero, 2, e8, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v9, v8, 10
+; CHECK-NEXT:    vslidedown.vi v10, v8, 8
+; CHECK-NEXT:    vslidedown.vi v11, v8, 12
+; CHECK-NEXT:    vslidedown.vi v12, v8, 14
+; CHECK-NEXT:    vslidedown.vi v13, v8, 2
+; CHECK-NEXT:    vslidedown.vi v14, v8, 4
+; CHECK-NEXT:    vslidedown.vi v15, v8, 6
+; CHECK-NEXT:    srli a1, a0, 3
+; CHECK-NEXT:    srli a2, a0, 2
+; CHECK-NEXT:    srli a0, a0, 1
+; CHECK-NEXT:    add a3, a1, a1
+; CHECK-NEXT:    add a4, a2, a1
+; CHECK-NEXT:    slli a5, a1, 1
+; CHECK-NEXT:    add a6, a0, a0
+; CHECK-NEXT:    vsetvli zero, a3, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vx v10, v9, a1
+; CHECK-NEXT:    add a5, a5, a1
+; CHECK-NEXT:    vslideup.vx v8, v13, a1
+; CHECK-NEXT:    vsetvli zero, a4, e8, mf2, tu, ma
+; CHECK-NEXT:    vslideup.vx v10, v11, a2
+; CHECK-NEXT:    add a1, a5, a1
+; CHECK-NEXT:    vslideup.vx v8, v14, a2
+; CHECK-NEXT:    vsetvli zero, a1, e8, mf2, ta, ma
+; CHECK-NEXT:    vslideup.vx v10, v12, a5
+; CHECK-NEXT:    vslideup.vx v8, v15, a5
+; CHECK-NEXT:    vsetvli zero, a6, e8, m1, ta, ma
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    addi a0, sp, 16
+; CHECK-NEXT:    vs1r.v v8, (a0)
+; CHECK-NEXT:    vsetvli a1, zero, e8, mf8, ta, ma
+; CHECK-NEXT:    vlseg8e8.v v8, (a0)
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    add sp, sp, a0
+; CHECK-NEXT:    addi sp, sp, 16
+; CHECK-NEXT:    ret
 	   %res = call {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} @llvm.vector.deinterleave8.v16i8(<16 x i8> %v)
 	   ret {<2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>, <2 x i8>} %res
 }
@@ -1221,3 +819,6 @@ define {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>,
 	   %res = call {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} @llvm.vector.deinterleave8.v8f16(<8 x half> %v)
 	   ret {<1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>, <1 x half>} %res
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; RV32: {{.*}}
+; RV64: {{.*}}

>From 7d93db61d15fbac338c4c691811e7bc5960bcd6a Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 21 May 2025 12:54:53 +0100
Subject: [PATCH 5/6] Use >=/<= and update comment

---
 llvm/include/llvm/IR/Intrinsics.h | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index b64784909fc25..5a810784a54bf 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -169,7 +169,7 @@ namespace Intrinsic {
       AArch64Svcount,
     } Kind;
 
-    // These three have to be contiguous.
+    // These six have to be contiguous.
     static_assert(OneFourthVecArgument == OneThirdVecArgument + 1 &&
                   OneFifthVecArgument == OneFourthVecArgument + 1 &&
                   OneSixthVecArgument == OneFifthVecArgument + 1 &&
@@ -194,9 +194,7 @@ namespace Intrinsic {
     unsigned getArgumentNumber() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
              Kind == TruncArgument || Kind == HalfVecArgument ||
-             Kind == OneThirdVecArgument || Kind == OneFourthVecArgument ||
-             Kind == OneFifthVecArgument || Kind == OneSixthVecArgument ||
-             Kind == OneSeventhVecArgument || Kind == OneEighthVecArgument ||
+             (Kind >= OneThirdVecArgument && Kind <= OneEighthVecArgument) ||
              Kind == SameVecWidthArgument || Kind == VecElementArgument ||
              Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
              Kind == VecOfBitcastsToInt);
@@ -205,9 +203,7 @@ namespace Intrinsic {
     ArgKind getArgumentKind() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
              Kind == TruncArgument || Kind == HalfVecArgument ||
-             Kind == OneThirdVecArgument || Kind == OneFourthVecArgument ||
-             Kind == OneFifthVecArgument || Kind == OneSixthVecArgument ||
-             Kind == OneSeventhVecArgument || Kind == OneEighthVecArgument ||
+             (Kind >= OneThirdVecArgument && Kind <= OneEighthVecArgument) ||
              Kind == SameVecWidthArgument || Kind == VecElementArgument ||
              Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
              Kind == VecOfBitcastsToInt);

>From be3d17dd55f16b018047652d0c6bb9fa8ba54613 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 26 May 2025 14:30:48 +0100
Subject: [PATCH 6/6] [IR] Consolidate OneNthElements IIT descriptors. NFCI

This replaces LLVMHalfElementsVectorType and the LLVMOne{Third,Fourth,Fifth,Sixth,Seventh,Eighth}ElementsVectorType classes with one parameterized IIT descriptor.

The type signature is encoded as the vector type to match followed by the divisor N, and inside IITDescriptor this is stored as two 16-bit parts, similarly to LLVMVectorOfAnyPointersToElt.

This also allows us to use a foreach range to declare the [de]interleave intrinsics.
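As a quick illustration of the encoding described above, here is a minimal standalone sketch (plain C++, not part of the patch) of how the divisor and the reference argument number pack into the single 32-bit Argument_Info payload, matching the getVectorDivisor()/getRefArgNumber() accessors added to Intrinsics.h below; the helper name packOneNthElts is made up for the example.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Divisor N goes in the upper 16 bits, the reference (overloaded) argument
// number in the lower 16 bits, matching the new accessors which return
// Argument_Info >> 16 and Argument_Info & 0xFFFF respectively.
static uint32_t packOneNthElts(uint16_t Divisor, uint16_t RefArgNo) {
  return (uint32_t(Divisor) << 16) | RefArgNo;
}

int main() {
  // e.g. llvm.vector.deinterleave7: each result has 1/7 the element count of
  // overloaded argument 0.
  uint32_t Info = packOneNthElts(/*Divisor=*/7, /*RefArgNo=*/0);
  assert((Info >> 16) == 7);     // what getVectorDivisor() would return
  assert((Info & 0xFFFF) == 0);  // what getRefArgNumber() would return
  std::printf("divisor=%u refarg=%u\n", Info >> 16, Info & 0xFFFF);
}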
---
 llvm/include/llvm/IR/Intrinsics.h         |  38 ++---
 llvm/include/llvm/IR/Intrinsics.td        | 161 +++-------------------
 llvm/include/llvm/IR/IntrinsicsAArch64.td |  10 +-
 llvm/lib/IR/Intrinsics.cpp                |  78 ++---------
 4 files changed, 50 insertions(+), 237 deletions(-)

diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 5a810784a54bf..7aa6cbe93c132 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -151,13 +151,7 @@ namespace Intrinsic {
       Argument,
       ExtendArgument,
       TruncArgument,
-      HalfVecArgument,
-      OneThirdVecArgument,
-      OneFourthVecArgument,
-      OneFifthVecArgument,
-      OneSixthVecArgument,
-      OneSeventhVecArgument,
-      OneEighthVecArgument,
+      OneNthEltsVecArgument,
       SameVecWidthArgument,
       VecOfAnyPtrsToElt,
       VecElementArgument,
@@ -169,12 +163,6 @@ namespace Intrinsic {
       AArch64Svcount,
     } Kind;
 
-    // These six have to be contiguous.
-    static_assert(OneFourthVecArgument == OneThirdVecArgument + 1 &&
-                  OneFifthVecArgument == OneFourthVecArgument + 1 &&
-                  OneSixthVecArgument == OneFifthVecArgument + 1 &&
-                  OneSeventhVecArgument == OneSixthVecArgument + 1 &&
-                  OneEighthVecArgument == OneSeventhVecArgument + 1);
     union {
       unsigned Integer_Width;
       unsigned Float_Width;
@@ -193,20 +181,16 @@ namespace Intrinsic {
 
     unsigned getArgumentNumber() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
-             Kind == TruncArgument || Kind == HalfVecArgument ||
-             (Kind >= OneThirdVecArgument && Kind <= OneEighthVecArgument) ||
-             Kind == SameVecWidthArgument || Kind == VecElementArgument ||
-             Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
-             Kind == VecOfBitcastsToInt);
+             Kind == TruncArgument || Kind == SameVecWidthArgument ||
+             Kind == VecElementArgument || Kind == Subdivide2Argument ||
+             Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
       return Argument_Info >> 3;
     }
     ArgKind getArgumentKind() const {
       assert(Kind == Argument || Kind == ExtendArgument ||
-             Kind == TruncArgument || Kind == HalfVecArgument ||
-             (Kind >= OneThirdVecArgument && Kind <= OneEighthVecArgument) ||
-             Kind == SameVecWidthArgument || Kind == VecElementArgument ||
-             Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
-             Kind == VecOfBitcastsToInt);
+             Kind == TruncArgument || Kind == SameVecWidthArgument ||
+             Kind == VecElementArgument || Kind == Subdivide2Argument ||
+             Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
       return (ArgKind)(Argument_Info & 7);
     }
 
@@ -216,8 +200,14 @@ namespace Intrinsic {
       assert(Kind == VecOfAnyPtrsToElt);
       return Argument_Info >> 16;
     }
+    // OneNthEltsVecArgument uses both a divisor N and a reference argument
+    // for the full-width vector to match.
+    unsigned getVectorDivisor() const {
+      assert(Kind == OneNthEltsVecArgument);
+      return Argument_Info >> 16;
+    }
     unsigned getRefArgNumber() const {
-      assert(Kind == VecOfAnyPtrsToElt);
+      assert(Kind == VecOfAnyPtrsToElt || Kind == OneNthEltsVecArgument);
       return Argument_Info & 0xFFFF;
     }
 
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 900c3469e2bb8..0ac7288ebdf8a 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -306,7 +306,7 @@ def IIT_TRUNC_ARG : IIT_Base<26>;
 def IIT_ANYPTR : IIT_Base<27>;
 def IIT_V1 : IIT_Vec<1, 28>;
 def IIT_VARARG : IIT_VT<isVoid, 29>;
-def IIT_HALF_VEC_ARG : IIT_Base<30>;
+def IIT_ONE_NTH_ELTS_VEC_ARG : IIT_Base<30>;
 def IIT_SAME_VEC_WIDTH_ARG : IIT_Base<31>;
 def IIT_VEC_OF_ANYPTRS_TO_ELT : IIT_Base<34>;
 def IIT_I128 : IIT_Int<128, 35>;
@@ -335,14 +335,8 @@ def IIT_I4 : IIT_Int<4, 58>;
 def IIT_AARCH64_SVCOUNT : IIT_VT<aarch64svcount, 59>;
 def IIT_V6 : IIT_Vec<6, 60>;
 def IIT_V10 : IIT_Vec<10, 61>;
-def IIT_ONE_THIRD_VEC_ARG : IIT_Base<62>;
-def IIT_ONE_FIFTH_VEC_ARG : IIT_Base<63>;
-def IIT_ONE_SEVENTH_VEC_ARG : IIT_Base<64>;
-def IIT_V2048: IIT_Vec<2048, 65>;
-def IIT_V4096: IIT_Vec<4096, 66>;
-def IIT_ONE_FOURTH_VEC_ARG : IIT_Base<67>;
-def IIT_ONE_SIXTH_VEC_ARG : IIT_Base<68>;
-def IIT_ONE_EIGHTH_VEC_ARG : IIT_Base<69>;
+def IIT_V2048: IIT_Vec<2048, 62>;
+def IIT_V4096: IIT_Vec<4096, 63>;
 }
 
 defvar IIT_all_FixedTypes = !filter(iit, IIT_all,
@@ -479,27 +473,15 @@ class LLVMVectorOfAnyPointersToElt<int num>
 class LLVMVectorElementType<int num> : LLVMMatchType<num, IIT_VEC_ELEMENT>;
 
 // Match the type of another intrinsic parameter that is expected to be a
-// vector type, but change the element count to be half as many.
-class LLVMHalfElementsVectorType<int num>
-  : LLVMMatchType<num, IIT_HALF_VEC_ARG>;
-
-class LLVMOneThirdElementsVectorType<int num>
-  : LLVMMatchType<num, IIT_ONE_THIRD_VEC_ARG>;
-
-class LLVMOneFourthElementsVectorType<int num>
-  : LLVMMatchType<num, IIT_ONE_FOURTH_VEC_ARG>;
-
-class LLVMOneFifthElementsVectorType<int num>
-  : LLVMMatchType<num, IIT_ONE_FIFTH_VEC_ARG>;
-
-class LLVMOneSixthElementsVectorType<int num>
-  : LLVMMatchType<num, IIT_ONE_SIXTH_VEC_ARG>;
-
-class LLVMOneSeventhElementsVectorType<int num>
-  : LLVMMatchType<num, IIT_ONE_SEVENTH_VEC_ARG>;
-
-class LLVMOneEighthElementsVectorType<int num>
-  : LLVMMatchType<num, IIT_ONE_EIGHTH_VEC_ARG>;
+// vector type, but change the element count to be 1/n of it.
+class LLVMOneNthElementsVectorType<int idx, int n>
+  : LLVMMatchType<idx, IIT_ONE_NTH_ELTS_VEC_ARG> {
+  let Sig = [
+    IIT_ONE_NTH_ELTS_VEC_ARG.Number,
+    EncNextArgN<idx>.ret,
+    n,
+  ];
+}
 
 // Match the type of another intrinsic parameter that is expected to be a
 // vector type (i.e. <N x iM>) but with each element subdivided to
@@ -2770,118 +2752,15 @@ def int_vector_extract : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                                                [llvm_anyvector_ty, llvm_i64_ty],
                                                [IntrNoMem, IntrSpeculatable, ImmArg<ArgIndex<1>>]>;
 
+foreach n = 2...8 in {
+  def int_vector_interleave#n   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                                                       !listsplat(LLVMOneNthElementsVectorType<0, n>, n),
+                                                       [IntrNoMem]>;
 
-def int_vector_interleave2   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                                                     [LLVMHalfElementsVectorType<0>,
-                                                      LLVMHalfElementsVectorType<0>],
-                                                     [IntrNoMem]>;
-
-def int_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType<0>,
-                                                      LLVMHalfElementsVectorType<0>],
-                                                     [llvm_anyvector_ty],
-                                                     [IntrNoMem]>;
-
-def int_vector_interleave3   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                                                     [LLVMOneThirdElementsVectorType<0>,
-                                                      LLVMOneThirdElementsVectorType<0>,
-                                                      LLVMOneThirdElementsVectorType<0>],
-                                                     [IntrNoMem]>;
-
-def int_vector_deinterleave3 : DefaultAttrsIntrinsic<[LLVMOneThirdElementsVectorType<0>,
-                                                      LLVMOneThirdElementsVectorType<0>,
-                                                      LLVMOneThirdElementsVectorType<0>],
-                                                     [llvm_anyvector_ty],
-                                                     [IntrNoMem]>;
-
-def int_vector_interleave4   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                                                     [LLVMOneFourthElementsVectorType<0>,
-                                                      LLVMOneFourthElementsVectorType<0>,
-                                                      LLVMOneFourthElementsVectorType<0>,
-                                                      LLVMOneFourthElementsVectorType<0>],
-                                                     [IntrNoMem]>;
-
-def int_vector_deinterleave4 : DefaultAttrsIntrinsic<[LLVMOneFourthElementsVectorType<0>,
-                                                      LLVMOneFourthElementsVectorType<0>,
-                                                      LLVMOneFourthElementsVectorType<0>,
-                                                      LLVMOneFourthElementsVectorType<0>],
-                                                     [llvm_anyvector_ty],
-                                                     [IntrNoMem]>;
-
-def int_vector_interleave5   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                                                     [LLVMOneFifthElementsVectorType<0>,
-                                                      LLVMOneFifthElementsVectorType<0>,
-                                                      LLVMOneFifthElementsVectorType<0>,
-                                                      LLVMOneFifthElementsVectorType<0>,
-                                                      LLVMOneFifthElementsVectorType<0>],
-                                                     [IntrNoMem]>;
-
-def int_vector_deinterleave5 : DefaultAttrsIntrinsic<[LLVMOneFifthElementsVectorType<0>,
-                                                      LLVMOneFifthElementsVectorType<0>,
-                                                      LLVMOneFifthElementsVectorType<0>,
-                                                      LLVMOneFifthElementsVectorType<0>,
-                                                      LLVMOneFifthElementsVectorType<0>],
-                                                     [llvm_anyvector_ty],
-                                                     [IntrNoMem]>;
-
-def int_vector_interleave6   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                                                     [LLVMOneSixthElementsVectorType<0>,
-                                                      LLVMOneSixthElementsVectorType<0>,
-                                                      LLVMOneSixthElementsVectorType<0>,
-                                                      LLVMOneSixthElementsVectorType<0>,
-                                                      LLVMOneSixthElementsVectorType<0>,
-                                                      LLVMOneSixthElementsVectorType<0>],
-                                                     [IntrNoMem]>;
-
-def int_vector_deinterleave6 : DefaultAttrsIntrinsic<[LLVMOneSixthElementsVectorType<0>,
-                                                      LLVMOneSixthElementsVectorType<0>,
-                                                      LLVMOneSixthElementsVectorType<0>,
-                                                      LLVMOneSixthElementsVectorType<0>,
-                                                      LLVMOneSixthElementsVectorType<0>,
-                                                      LLVMOneSixthElementsVectorType<0>],
-                                                     [llvm_anyvector_ty],
-                                                     [IntrNoMem]>;
-
-def int_vector_interleave7   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                                                     [LLVMOneSeventhElementsVectorType<0>,
-                                                      LLVMOneSeventhElementsVectorType<0>,
-                                                      LLVMOneSeventhElementsVectorType<0>,
-                                                      LLVMOneSeventhElementsVectorType<0>,
-                                                      LLVMOneSeventhElementsVectorType<0>,
-                                                      LLVMOneSeventhElementsVectorType<0>,
-                                                      LLVMOneSeventhElementsVectorType<0>],
-                                                     [IntrNoMem]>;
-
-def int_vector_deinterleave7 : DefaultAttrsIntrinsic<[LLVMOneSeventhElementsVectorType<0>,
-                                                      LLVMOneSeventhElementsVectorType<0>,
-                                                      LLVMOneSeventhElementsVectorType<0>,
-                                                      LLVMOneSeventhElementsVectorType<0>,
-                                                      LLVMOneSeventhElementsVectorType<0>,
-                                                      LLVMOneSeventhElementsVectorType<0>,
-                                                      LLVMOneSeventhElementsVectorType<0>],
-                                                     [llvm_anyvector_ty],
-                                                     [IntrNoMem]>;
-
-def int_vector_interleave8   : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                                                     [LLVMOneEighthElementsVectorType<0>,
-                                                      LLVMOneEighthElementsVectorType<0>,
-                                                      LLVMOneEighthElementsVectorType<0>,
-                                                      LLVMOneEighthElementsVectorType<0>,
-                                                      LLVMOneEighthElementsVectorType<0>,
-                                                      LLVMOneEighthElementsVectorType<0>,
-                                                      LLVMOneEighthElementsVectorType<0>,
-                                                      LLVMOneEighthElementsVectorType<0>],
-                                                     [IntrNoMem]>;
-
-def int_vector_deinterleave8 : DefaultAttrsIntrinsic<[LLVMOneEighthElementsVectorType<0>,
-                                                      LLVMOneEighthElementsVectorType<0>,
-                                                      LLVMOneEighthElementsVectorType<0>,
-                                                      LLVMOneEighthElementsVectorType<0>,
-                                                      LLVMOneEighthElementsVectorType<0>,
-                                                      LLVMOneEighthElementsVectorType<0>,
-                                                      LLVMOneEighthElementsVectorType<0>,
-                                                      LLVMOneEighthElementsVectorType<0>],
-                                                     [llvm_anyvector_ty],
-                                                     [IntrNoMem]>;
+  def int_vector_deinterleave#n : DefaultAttrsIntrinsic<!listsplat(LLVMOneNthElementsVectorType<0, n>, n),
+                                                       [llvm_anyvector_ty],
+                                                       [IntrNoMem]>;
+}
 
 //===-------------- Intrinsics to perform partial reduction ---------------===//
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 67fd91f0896eb..0ec5f5163118e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -186,7 +186,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                 [IntrNoMem]>;
   class AdvSIMD_2VectorArg_Tied_Narrow_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty],
+                [LLVMOneNthElementsVectorType<0, 2>, llvm_anyvector_ty],
                 [IntrNoMem]>;
   class AdvSIMD_2VectorArg_Lane_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyint_ty],
@@ -207,11 +207,11 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                [IntrNoMem]>;
   class AdvSIMD_3VectorArg_Tied_Narrow_Intrinsic
       : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-               [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty,
+               [LLVMOneNthElementsVectorType<0, 2>, llvm_anyvector_ty,
                 LLVMMatchType<1>], [IntrNoMem]>;
   class AdvSIMD_3VectorArg_Scalar_Tied_Narrow_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
-                [LLVMHalfElementsVectorType<0>, llvm_anyvector_ty, llvm_i32_ty],
+                [LLVMOneNthElementsVectorType<0, 2>, llvm_anyvector_ty, llvm_i32_ty],
                 [IntrNoMem]>;
   class AdvSIMD_CvtFxToFP_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty],
@@ -1330,7 +1330,7 @@ let TargetPrefix = "aarch64" in {  // All intrinsics start with "llvm.aarch64.".
                 [IntrNoMem, ImmArg<ArgIndex<0>>]>;
 
   class AdvSIMD_SVE_PUNPKHI_Intrinsic
-    : DefaultAttrsIntrinsic<[LLVMHalfElementsVectorType<0>],
+    : DefaultAttrsIntrinsic<[LLVMOneNthElementsVectorType<0, 2>],
                 [llvm_anyvector_ty],
                 [IntrNoMem]>;
 
@@ -4229,4 +4229,4 @@ let TargetPrefix = "aarch64" in {
   def int_aarch64_sme_fp8_fvdot_lane_za16_vg1x2  : SME_FP8_ZA_LANE_VGx2_Intrinsic;
   def int_aarch64_sme_fp8_fvdotb_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
   def int_aarch64_sme_fp8_fvdott_lane_za32_vg1x4 : SME_FP8_ZA_LANE_VGx2_Intrinsic;
-}
\ No newline at end of file
+}
diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp
index 28f7523476774..e631419d5e1c2 100644
--- a/llvm/lib/IR/Intrinsics.cpp
+++ b/llvm/lib/IR/Intrinsics.cpp
@@ -366,46 +366,11 @@ DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
         IITDescriptor::get(IITDescriptor::TruncArgument, ArgInfo));
     return;
   }
-  case IIT_HALF_VEC_ARG: {
-    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
-    OutputTable.push_back(
-        IITDescriptor::get(IITDescriptor::HalfVecArgument, ArgInfo));
-    return;
-  }
-  case IIT_ONE_THIRD_VEC_ARG: {
-    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
-    OutputTable.push_back(
-        IITDescriptor::get(IITDescriptor::OneThirdVecArgument, ArgInfo));
-    return;
-  }
-  case IIT_ONE_FOURTH_VEC_ARG: {
-    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
-    OutputTable.push_back(
-        IITDescriptor::get(IITDescriptor::OneFourthVecArgument, ArgInfo));
-    return;
-  }
-  case IIT_ONE_FIFTH_VEC_ARG: {
-    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
-    OutputTable.push_back(
-        IITDescriptor::get(IITDescriptor::OneFifthVecArgument, ArgInfo));
-    return;
-  }
-  case IIT_ONE_SIXTH_VEC_ARG: {
-    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
-    OutputTable.push_back(
-        IITDescriptor::get(IITDescriptor::OneSixthVecArgument, ArgInfo));
-    return;
-  }
-  case IIT_ONE_SEVENTH_VEC_ARG: {
-    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
-    OutputTable.push_back(
-        IITDescriptor::get(IITDescriptor::OneSeventhVecArgument, ArgInfo));
-    return;
-  }
-  case IIT_ONE_EIGHTH_VEC_ARG: {
-    unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+  case IIT_ONE_NTH_ELTS_VEC_ARG: {
+    unsigned short ArgNo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+    unsigned short N = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
     OutputTable.push_back(
-        IITDescriptor::get(IITDescriptor::OneEighthVecArgument, ArgInfo));
+        IITDescriptor::get(IITDescriptor::OneNthEltsVecArgument, N, ArgNo));
     return;
   }
   case IIT_SAME_VEC_WIDTH_ARG: {
@@ -598,18 +563,9 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
     int SubDivs = D.Kind == IITDescriptor::Subdivide2Argument ? 1 : 2;
     return VectorType::getSubdividedVectorType(VTy, SubDivs);
   }
-  case IITDescriptor::HalfVecArgument:
-    return VectorType::getHalfElementsVectorType(
-        cast<VectorType>(Tys[D.getArgumentNumber()]));
-  case IITDescriptor::OneThirdVecArgument:
-  case IITDescriptor::OneFourthVecArgument:
-  case IITDescriptor::OneFifthVecArgument:
-  case IITDescriptor::OneSixthVecArgument:
-  case IITDescriptor::OneSeventhVecArgument:
-  case IITDescriptor::OneEighthVecArgument:
+  case IITDescriptor::OneNthEltsVecArgument:
     return VectorType::getOneNthElementsVectorType(
-        cast<VectorType>(Tys[D.getArgumentNumber()]),
-        3 + (D.Kind - IITDescriptor::OneThirdVecArgument));
+        cast<VectorType>(Tys[D.getRefArgNumber()]), D.getVectorDivisor());
   case IITDescriptor::SameVecWidthArgument: {
     Type *EltTy = DecodeFixedType(Infos, Tys, Context);
     Type *Ty = Tys[D.getArgumentNumber()];
@@ -987,26 +943,14 @@ matchIntrinsicType(Type *Ty, ArrayRef<Intrinsic::IITDescriptor> &Infos,
 
     return Ty != NewTy;
   }
-  case IITDescriptor::HalfVecArgument:
+  case IITDescriptor::OneNthEltsVecArgument:
     // If this is a forward reference, defer the check for later.
-    if (D.getArgumentNumber() >= ArgTys.size())
-      return IsDeferredCheck || DeferCheck(Ty);
-    return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
-           VectorType::getHalfElementsVectorType(
-               cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
-  case IITDescriptor::OneThirdVecArgument:
-  case IITDescriptor::OneFourthVecArgument:
-  case IITDescriptor::OneFifthVecArgument:
-  case IITDescriptor::OneSixthVecArgument:
-  case IITDescriptor::OneSeventhVecArgument:
-  case IITDescriptor::OneEighthVecArgument:
-    // If this is a forward reference, defer the check for later.
-    if (D.getArgumentNumber() >= ArgTys.size())
+    if (D.getRefArgNumber() >= ArgTys.size())
       return IsDeferredCheck || DeferCheck(Ty);
-    return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
+    return !isa<VectorType>(ArgTys[D.getRefArgNumber()]) ||
            VectorType::getOneNthElementsVectorType(
-               cast<VectorType>(ArgTys[D.getArgumentNumber()]),
-               3 + (D.Kind - IITDescriptor::OneThirdVecArgument)) != Ty;
+               cast<VectorType>(ArgTys[D.getRefArgNumber()]),
+               D.getVectorDivisor()) != Ty;
   case IITDescriptor::SameVecWidthArgument: {
     if (D.getArgumentNumber() >= ArgTys.size()) {
       // Defer check and subsequent check for the vector element type.


