[llvm] dd68f3c - [RISCV] Support insertion of misaligned subvectors

Fraser Cormack via llvm-commits llvm-commits at lists.llvm.org
Tue Feb 23 02:37:25 PST 2021


Author: Fraser Cormack
Date: 2021-02-23T10:31:06Z
New Revision: dd68f3cf2899c554cab7baf3ccdcd3f987d77736

URL: https://github.com/llvm/llvm-project/commit/dd68f3cf2899c554cab7baf3ccdcd3f987d77736
DIFF: https://github.com/llvm/llvm-project/commit/dd68f3cf2899c554cab7baf3ccdcd3f987d77736.diff

LOG: [RISCV] Support insertion of misaligned subvectors

This patch extends the support for RVV INSERT_SUBVECTOR to cover those
which don't align to a vector register boundary. Like the support for
EXTRACT_SUBVECTOR in D96959, it accomplishes this by extracting the
nearest register-sized subvector (a subregister operation), then
sliding the subvector up into place within that register with VSLIDEUP,
and finally inserting the result back into the vector register group
(another subregister operation). The slide offset is the index
remaining after the subregister decomposition, and the VL is that
offset plus the subvector's length, so only the inserted elements are
written.
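
In SelectionDAG terms, a misaligned insert therefore lowers to roughly
the following chain of nodes (a sketch of what lowerINSERT_SUBVECTOR
below builds; type and operand syntax abbreviated):

  t1: InterSubVT = extract_subvector Vec, AlignedIdx    ; EXTRACT_SUBREG
  t2: InterSubVT = insert_subvector undef, SubVec, 0    ; subregister copy
  t3: InterSubVT = RISCVISD::VSLIDEUP_VL t1, t2,
                       RemIdx * vscale, Mask, VL
  t4: VecVT      = insert_subvector Vec, t3, AlignedIdx ; INSERT_SUBREG

where InterSubVT is the LMUL=1 type, AlignedIdx = OrigIdx - RemIdx, and
VL = (RemIdx + SubVecVT.getVectorMinNumElements()) * vscale.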

Unlike subvector extraction, for subvectors that occupy less than a
full vector register we must also preserve the untouched elements of
that register, even when the insertion index itself is aligned. We do
this by lowering such inserts via the above method to an LMUL=1
INSERT_SUBVECTOR, which in turn lowers to a VSLIDEUP with a zero
offset. The VSLIDEUP uses a tail-undisturbed policy and so has the
effect of "sliding in" the subvector elements while preserving the
surrounding ones.
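
Both cases can be seen in the new tests below. Inserting
<vscale x 1 x i32> into <vscale x 16 x i32> at index 1 slides the
subvector up by vscale elements within the first LMUL=1 register, with
VL covering the offset plus the subvector:

  csrr    a0, vlenb
  srli    a0, a0, 3
  add     a1, a0, a0
  vsetvli a1, a1, e32,m1,tu,mu
  vslideup.vx v8, v16, a0

while the same insert at the register-aligned index 6 still needs a
(zero-offset) tail-undisturbed slide into v11 to preserve that
register's upper half:

  csrr    a0, vlenb
  srli    a0, a0, 3
  vsetvli a0, a0, e32,m1,tu,mu
  vslideup.vi v11, v16, 0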

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D96972

Added: 
    

Modified: 
    llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.cpp
    llvm/lib/Target/RISCV/RISCVISelLowering.h
    llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 14fd9de5ca01..8bccf41a1123 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -929,6 +929,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     SDValue SubV = Node->getOperand(1);
     SDLoc DL(SubV);
     auto Idx = Node->getConstantOperandVal(2);
+    MVT XLenVT = Subtarget->getXLenVT();
     MVT SubVecVT = Node->getOperand(1).getSimpleValueType();
 
     // TODO: This method of selecting INSERT_SUBVECTOR should work
@@ -936,24 +937,6 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
     // correctly identify the canonical register class for fixed-length types.
     // For now, keep the two paths separate.
     if (VT.isScalableVector() && SubVecVT.isScalableVector()) {
-      bool IsFullVecReg = false;
-      switch (RISCVTargetLowering::getLMUL(SubVecVT)) {
-      default:
-        break;
-      case RISCVVLMUL::LMUL_1:
-      case RISCVVLMUL::LMUL_2:
-      case RISCVVLMUL::LMUL_4:
-      case RISCVVLMUL::LMUL_8:
-        IsFullVecReg = true;
-        break;
-      }
-
-      // If the subvector doesn't occupy a full vector register then we can't
-      // insert it purely using subregister manipulation. We must not clobber
-      // the untouched elements (say, in the upper half of the VR register).
-      if (!IsFullVecReg)
-        break;
-
       const auto *TRI = Subtarget->getRegisterInfo();
       unsigned SubRegIdx;
       std::tie(SubRegIdx, Idx) =
@@ -966,9 +949,32 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
       if (Idx != 0)
         break;
 
+      RISCVVLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
+      bool IsSubVecPartReg = SubVecLMUL == RISCVVLMUL::LMUL_F2 ||
+                             SubVecLMUL == RISCVVLMUL::LMUL_F4 ||
+                             SubVecLMUL == RISCVVLMUL::LMUL_F8;
+      (void)IsSubVecPartReg; // Silence unused variable warning without asserts.
+      assert((!IsSubVecPartReg || V.isUndef()) &&
+             "Expecting lowering to have created legal INSERT_SUBVECTORs when "
+             "the subvector is smaller than a full-sized register");
+
+      // If we haven't set a SubRegIdx, then we must be going between LMUL<=1
+      // types (VR -> VR). This can be done as a copy.
+      if (SubRegIdx == RISCV::NoSubRegister) {
+        unsigned InRegClassID = RISCVTargetLowering::getRegClassIDForVecVT(VT);
+        assert(RISCVTargetLowering::getRegClassIDForVecVT(SubVecVT) ==
+                   RISCV::VRRegClassID &&
+               InRegClassID == RISCV::VRRegClassID &&
+               "Unexpected subvector extraction");
+        SDValue RC = CurDAG->getTargetConstant(InRegClassID, DL, XLenVT);
+        SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
+                                                 DL, VT, SubV, RC);
+        return ReplaceNode(Node, NewNode);
+      }
+
       SDNode *NewNode = CurDAG->getMachineNode(
           TargetOpcode::INSERT_SUBREG, DL, VT, V, SubV,
-          CurDAG->getTargetConstant(SubRegIdx, DL, Subtarget->getXLenVT()));
+          CurDAG->getTargetConstant(SubRegIdx, DL, XLenVT));
       return ReplaceNode(Node, NewNode);
     }
 
@@ -981,8 +987,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
 
       unsigned RegClassID = RISCVTargetLowering::getRegClassIDForVecVT(VT);
 
-      SDValue RC =
-          CurDAG->getTargetConstant(RegClassID, DL, Subtarget->getXLenVT());
+      SDValue RC = CurDAG->getTargetConstant(RegClassID, DL, XLenVT);
       SDNode *NewNode = CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS,
                                                DL, VT, SubV, RC);
       ReplaceNode(Node, NewNode);

diff  --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index de0f0bb41edd..53668002b1d9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -465,6 +465,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
       setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
 
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
     }
 
@@ -501,6 +502,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::VECREDUCE_SEQ_FADD, VT, Custom);
       setOperationAction(ISD::FCOPYSIGN, VT, Legal);
 
+      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
       setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
     };
 
@@ -1327,6 +1329,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
   case ISD::VECREDUCE_FADD:
   case ISD::VECREDUCE_SEQ_FADD:
     return lowerFPVECREDUCE(Op, DAG);
+  case ISD::INSERT_SUBVECTOR:
+    return lowerINSERT_SUBVECTOR(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:
     return lowerEXTRACT_SUBVECTOR(Op, DAG);
   case ISD::BUILD_VECTOR:
@@ -2234,6 +2238,87 @@ static MVT getLMUL1VT(MVT VT) {
       RISCV::RVVBitsPerBlock / VT.getVectorElementType().getSizeInBits());
 }
 
+SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDValue Vec = Op.getOperand(0);
+  SDValue SubVec = Op.getOperand(1);
+  MVT VecVT = Vec.getSimpleValueType();
+  MVT SubVecVT = SubVec.getSimpleValueType();
+
+  // TODO: Only handle scalable->scalable inserts for now, and revisit this for
+  // fixed-length vectors later.
+  if (!SubVecVT.isScalableVector() || !VecVT.isScalableVector())
+    return Op;
+
+  SDLoc DL(Op);
+  unsigned OrigIdx = Op.getConstantOperandVal(2);
+  const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+  unsigned SubRegIdx, RemIdx;
+  std::tie(SubRegIdx, RemIdx) =
+      RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+          VecVT, SubVecVT, OrigIdx, TRI);
+
+  RISCVVLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecVT);
+  bool IsSubVecPartReg = SubVecLMUL == RISCVVLMUL::LMUL_F2 ||
+                         SubVecLMUL == RISCVVLMUL::LMUL_F4 ||
+                         SubVecLMUL == RISCVVLMUL::LMUL_F8;
+
+  // If the Idx has been completely eliminated and this subvector's size is a
+  // vector register or a multiple thereof, or the surrounding elements are
+  // undef, then this is a subvector insert which naturally aligns to a vector
+  // register. These can easily be handled using subregister manipulation.
+  // If the subvector is smaller than a vector register, then the insertion
+  // must preserve the undisturbed elements of the register. We do this by
+  // lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type
+  // (which resolves to a subregister copy), performing a VSLIDEUP to place the
+  // subvector within the vector register, and an INSERT_SUBVECTOR of that
+  // LMUL=1 type back into the larger vector (resolving to another subregister
+  // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
+  // to avoid allocating a large register group to hold our subvector.
+  if (RemIdx == 0 && (!IsSubVecPartReg || Vec.isUndef()))
+    return Op;
+
+  // VSLIDEUP works by leaving elements 0<=i<OFFSET undisturbed, elements
+  // OFFSET<=i<VL set to the "subvector" and VL<=i<VLMAX set to the tail policy
+  // (in our case undisturbed). This means we can set up a subvector insertion
+  // where OFFSET is the insertion offset, and the VL is the OFFSET plus the
+  // size of the subvector.
+  MVT XLenVT = Subtarget.getXLenVT();
+  MVT InterSubVT = getLMUL1VT(VecVT);
+
+  // Extract a subvector equal to the nearest full vector register type. This
+  // should resolve to an EXTRACT_SUBREG instruction.
+  unsigned AlignedIdx = OrigIdx - RemIdx;
+  SDValue AlignedExtract =
+      DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InterSubVT, Vec,
+                  DAG.getConstant(AlignedIdx, DL, XLenVT));
+
+  SDValue SlideupAmt = DAG.getConstant(RemIdx, DL, XLenVT);
+  // For scalable vectors this must be further multiplied by vscale.
+  SlideupAmt = DAG.getNode(ISD::VSCALE, DL, XLenVT, SlideupAmt);
+
+  SDValue Mask, VL;
+  std::tie(Mask, VL) = getDefaultScalableVLOps(VecVT, DL, DAG, Subtarget);
+
+  // Construct the vector length corresponding to RemIdx + length(SubVecVT).
+  VL = DAG.getConstant(SubVecVT.getVectorMinNumElements(), DL, XLenVT);
+  VL = DAG.getNode(ISD::VSCALE, DL, XLenVT, VL);
+  VL = DAG.getNode(ISD::ADD, DL, XLenVT, SlideupAmt, VL);
+
+  SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InterSubVT,
+                       DAG.getUNDEF(InterSubVT), SubVec,
+                       DAG.getConstant(0, DL, XLenVT));
+
+  SDValue Slideup = DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, InterSubVT,
+                                AlignedExtract, SubVec, SlideupAmt, Mask, VL);
+
+  // Insert this subvector into the correct vector register. This should
+  // resolve to an INSERT_SUBREG instruction.
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, Vec, Slideup,
+                     DAG.getConstant(AlignedIdx, DL, XLenVT));
+}
+
 SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op,
                                                     SelectionDAG &DAG) const {
   SDValue Vec = Op.getOperand(0);

diff  --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 66f599a5a913..fde650f8c633 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -417,6 +417,7 @@ class RISCVTargetLowering : public TargetLowering {
   SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFPVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorLoadToRVV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerFixedLengthVectorStoreToRVV(SDValue Op, SelectionDAG &DAG) const;

diff  --git a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
index 6538ec5dd06c..cb2e875d797d 100644
--- a/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple riscv64 -mattr=+experimental-v -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple riscv64 -mattr=+m,+d,+experimental-zfh,+experimental-v -verify-machineinstrs < %s | FileCheck %s
 
 define <vscale x 8 x i32> @insert_nxv8i32_nxv4i32_0(<vscale x 8 x i32> %vec, <vscale x 4 x i32> %subvec) {
 ; CHECK-LABEL: insert_nxv8i32_nxv4i32_0:
@@ -181,21 +181,191 @@ define <vscale x 16 x i32> @insert_nxv16i32_nxv2i32_14(<vscale x 16 x i32> %vec,
   ret <vscale x 16 x i32> %v
 }
 
-; TODO: Inserts that are less than LMUL=1 are not yet supported. In this case
-; we need mask out the unaffected elements (top half of the VR %subvec
-; register)
-;define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_0(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
-;  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 0)
-;  ret <vscale x 16 x i32> %v
-;}
+define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_0(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
+; CHECK-LABEL: insert_nxv16i32_nxv1i32_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v16, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 0)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
+; CHECK-LABEL: insert_nxv16i32_nxv1i32_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli a1, a1, e32,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v16, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 1)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_6(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
+; CHECK-LABEL: insert_nxv16i32_nxv1i32_6:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a0, a0, e32,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v11, v16, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 6)
+  ret <vscale x 16 x i32> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_0(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    vsetvli a0, a0, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v10, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 0)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_1(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli a1, a1, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 1)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_2(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    vsetvli a0, a0, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v10, a1
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 2)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_3(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_3:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a1, a1, a0
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    vsetvli a0, a0, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v10, a1
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 3)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_7(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_7:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    vsetvli a1, a1, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v10, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 7)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_15(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec) {
+; CHECK-LABEL: insert_nxv16i8_nxv1i8_15:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 3
+; CHECK-NEXT:    sub a0, a1, a0
+; CHECK-NEXT:    vsetvli a1, a1, e8,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v9, v10, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 15)
+  ret <vscale x 16 x i8> %v
+}
+
+define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_0(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec) {
+; CHECK-LABEL: insert_nxv32f16_nxv2f16_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    vsetvli a0, a0, e16,m1,tu,mu
+; CHECK-NEXT:    vslideup.vi v8, v16, 0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x half> @llvm.experimental.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 0)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_2(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec) {
+; CHECK-LABEL: insert_nxv32f16_nxv2f16_2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli a1, a1, e16,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v8, v16, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x half> @llvm.experimental.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 2)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @insert_nxv32f16_nxv2f16_26(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec) {
+; CHECK-LABEL: insert_nxv32f16_nxv2f16_26:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 2
+; CHECK-NEXT:    add a1, a0, a0
+; CHECK-NEXT:    vsetvli a1, a1, e16,m1,tu,mu
+; CHECK-NEXT:    vslideup.vx v14, v16, a0
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x half> @llvm.experimental.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half> %vec, <vscale x 2 x half> %subvec, i64 26)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @insert_nxv32f16_undef_nxv1f16_0(<vscale x 1 x half> %subvec) {
+; CHECK-LABEL: insert_nxv32f16_undef_nxv1f16_0:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    # kill: def $v8 killed $v8 def $v8m8
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x half> @llvm.experimental.vector.insert.nxv1f16.nxv32f16(<vscale x 32 x half> undef, <vscale x 1 x half> %subvec, i64 0)
+  ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @insert_nxv32f16_undef_nxv1f16_26(<vscale x 1 x half> %subvec) {
+; CHECK-LABEL: insert_nxv32f16_undef_nxv1f16_26:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a0, vlenb
+; CHECK-NEXT:    srli a0, a0, 3
+; CHECK-NEXT:    slli a1, a0, 1
+; CHECK-NEXT:    add a0, a1, a0
+; CHECK-NEXT:    vsetvli a0, a0, e16,m1,ta,mu
+; CHECK-NEXT:    vslideup.vx v22, v8, a1
+; CHECK-NEXT:    vmv8r.v v8, v16
+; CHECK-NEXT:    ret
+  %v = call <vscale x 32 x half> @llvm.experimental.vector.insert.nxv1f16.nxv32f16(<vscale x 32 x half> undef, <vscale x 1 x half> %subvec, i64 26)
+  ret <vscale x 32 x half> %v
+}
+
+declare <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8>, <vscale x 1 x i8>, i64)
 
-; TODO: Inserts that don't align to a vector register are not yet supported.
-; In this case we want to insert the subvector into the upper half of the
-; lowest VR subregister in the LMUL group.
-;define <vscale x 16 x i32> @insert_nxv16i32_nxv1i32_1(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec) {
-;  %v = call <vscale x 16 x i32> @llvm.experimental.vector.insert.nxv1i32.nxv16i32(<vscale x 16 x i32> %vec, <vscale x 1 x i32> %subvec, i64 1)
-;  ret <vscale x 16 x i32> %v
-;}
+declare <vscale x 32 x half> @llvm.experimental.vector.insert.nxv1f16.nxv32f16(<vscale x 32 x half>, <vscale x 1 x half>, i64)
+declare <vscale x 32 x half> @llvm.experimental.vector.insert.nxv2f16.nxv32f16(<vscale x 32 x half>, <vscale x 2 x half>, i64)
 
 declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv2i32.nxv8i32(<vscale x 8 x i32>, <vscale x 2 x i32>, i64 %idx)
 declare <vscale x 8 x i32> @llvm.experimental.vector.insert.nxv4i32.nxv8i32(<vscale x 8 x i32>, <vscale x 4 x i32>, i64 %idx)


        

