[llvm] [RISCV][GISEL] Legalize G_INSERT_SUBVECTOR (PR #108859)
Michael Maitland via llvm-commits
llvm-commits at lists.llvm.org
Tue Sep 17 10:16:45 PDT 2024
https://github.com/michaelmaitland updated https://github.com/llvm/llvm-project/pull/108859
>From bcb6d153d317c3b1f1ec77ae1321beee324cd1e2 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Mon, 16 Sep 2024 10:10:46 -0700
Subject: [PATCH 1/7] [RISCV][GISEL] Legalize G_INSERT_SUBVECTOR
This legalization is heavily based on the SelectionDAG lowerINSERT_SUBVECTOR code.
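At a high level the lowering mirrors the DAG strategy: when the insert index
does not land on a vector-register boundary, extract the containing LMUL=1
register, slide the subvector up into place with a VL-limited vslideup, and
insert that register back. A rough standalone sketch of how the slide amount,
VL and tail policy relate (plain integers in known-minimum elements; the names
and values below are illustrative only, not code from this patch):

  #include <cassert>
  #include <cstdint>
  #include <cstdio>

  int main() {
    // Example: insert nxv2i8 (LitTy) into nxv8i8 (BigTy) at index 2, so the
    // remainder index within the LMUL=1 register is 2.
    uint64_t BigMinElts = 8, LitMinElts = 2, RemIdx = 2;
    uint64_t SlideupAmt = RemIdx;        // vslideup offset (scaled by vscale)
    uint64_t VL = RemIdx + LitMinElts;   // elements [RemIdx, VL) are written
    // Tail-agnostic is only safe when the insert reaches the end of the
    // LMUL=1 register; otherwise the tail must stay undisturbed.
    bool TailAgnostic = (VL == BigMinElts);
    assert(VL <= BigMinElts);
    printf("slideup=%llu vl=%llu policy=%s\n", (unsigned long long)SlideupAmt,
           (unsigned long long)VL, TailAgnostic ? "agnostic" : "undisturbed");
    return 0;
  }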
---
.../Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 156 +++++++
.../Target/RISCV/GISel/RISCVLegalizerInfo.h | 1 +
llvm/lib/Target/RISCV/RISCVInstrGISel.td | 18 +
.../rvv/legalize-insert-subvector.mir | 402 ++++++++++++++++++
4 files changed, 577 insertions(+)
create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index c204683f4e79f8..8396316dc47fdc 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -581,6 +581,12 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
SplatActions.clampScalar(1, sXLen, sXLen);
+ getActionDefinitionsBuilder(G_INSERT_SUBVECTOR)
+ .customIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST),
+ typeIsLegalBoolVec(1, BoolVecTys, ST)))
+ .customIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
+ typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST)));
+
getLegacyLegalizerInfo().computeTables();
}
@@ -915,6 +921,154 @@ bool RISCVLegalizerInfo::legalizeSplatVector(MachineInstr &MI,
return true;
}
+static LLT getLMUL1Ty(LLT VecTy) {
+ assert(VecTy.getElementType().getSizeInBits() <= 64 &&
+ "Unexpected vector LLT");
+ return LLT::scalable_vector(RISCV::RVVBitsPerBlock /
+ VecTy.getElementType().getSizeInBits(),
+ VecTy.getElementType());
+}
+
+bool RISCVLegalizerInfo::legalizeInsertSubvector(MachineInstr &MI,
+ MachineIRBuilder &MIB) const {
+ assert(MI.getOpcode() == TargetOpcode::G_INSERT_SUBVECTOR);
+
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src1 = MI.getOperand(1).getReg();
+ Register Src2 = MI.getOperand(2).getReg();
+ uint64_t Idx = MI.getOperand(3).getImm();
+
+ LLT BigTy = MRI.getType(Src1);
+ LLT LitTy = MRI.getType(Src2);
+ Register BigVec = Src1;
+ Register LitVec = Src2;
+
+ // We don't have the ability to slide mask vectors up indexed by their i1
+ // elements; the smallest we can do is i8. Often we are able to bitcast to
+ // equivalent i8 vectors. Otherwise, we must zero-extend to equivalent i8
+ // vectors and truncate down after the insert.
+ if (LitTy.getElementType() == LLT::scalar(1) &&
+ (Idx != 0 ||
+ MRI.getVRegDef(BigVec)->getOpcode() != TargetOpcode::G_IMPLICIT_DEF)) {
+ auto BigTyMinElts = BigTy.getElementCount().getKnownMinValue();
+ auto LitTyMinElts = LitTy.getElementCount().getKnownMinValue();
+ if (BigTyMinElts >= 8 && LitTyMinElts >= 8) {
+ assert(Idx % 8 == 0 && "Invalid index");
+ assert(BigTyMinElts % 8 == 0 && LitTyMinElts % 8 == 0 &&
+ "Unexpected mask vector lowering");
+ Idx /= 8;
+ BigTy = LLT::vector(BigTy.getElementCount().divideCoefficientBy(8), 8);
+ LitTy = LLT::vector(LitTy.getElementCount().divideCoefficientBy(8), 8);
+ BigVec = MIB.buildBitcast(BigTy, BigVec).getReg(0);
+ LitVec = MIB.buildBitcast(LitTy, LitVec).getReg(0);
+ } else {
+ // We can't slide this mask vector up indexed by its i1 elements.
+ // This poses a problem when we wish to insert a scalable vector which
+ // can't be re-expressed as a larger type. Just choose the slow path and
+ // extend to a larger type, then truncate back down.
+ LLT ExtBigTy = BigTy.changeElementType(LLT::scalar(8));
+ LLT ExtLitTy = LitTy.changeElementType(LLT::scalar(8));
+ auto BigZExt = MIB.buildZExt(ExtBigTy, BigVec);
+ auto LitZExt = MIB.buildZExt(ExtLitTy, LitVec);
+ auto Insert = MIB.buildInsertSubvector(ExtBigTy, BigZExt, LitZExt, Idx);
+ auto SplatZero = MIB.buildSplatVector(
+ ExtBigTy, MIB.buildConstant(ExtBigTy.getElementType(), 0));
+ MIB.buildICmp(CmpInst::Predicate::ICMP_NE, Dst, Insert, SplatZero);
+ MI.eraseFromParent();
+ return true;
+ }
+ }
+
+ const RISCVRegisterInfo *TRI = STI.getRegisterInfo();
+ MVT LitTyMVT = getMVTForLLT(LitTy);
+ unsigned SubRegIdx, RemIdx;
+ std::tie(SubRegIdx, RemIdx) =
+ RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+ getMVTForLLT(BigTy), LitTyMVT, Idx, TRI);
+
+ RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(LitTy));
+ bool IsSubVecPartReg = SubVecLMUL == RISCVII::VLMUL::LMUL_F2 ||
+ SubVecLMUL == RISCVII::VLMUL::LMUL_F4 ||
+ SubVecLMUL == RISCVII::VLMUL::LMUL_F8;
+
+ // If the Idx has been completely eliminated and this subvector's size is a
+ // vector register or a multiple thereof, or the surrounding elements are
+ // undef, then this is a subvector insert which naturally aligns to a vector
+ // register. These can easily be handled using subregister manipulation.
+ if (RemIdx == 0 && (!IsSubVecPartReg || MRI.getVRegDef(Src1)->getOpcode() ==
+ TargetOpcode::G_IMPLICIT_DEF))
+ return true;
+
+ // If the subvector is smaller than a vector register, then the insertion
+ // must preserve the undisturbed elements of the register. We do this by
+ // lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type
+ // (which resolves to a subregister copy), performing a VSLIDEUP to place the
+ // subvector within the vector register, and an INSERT_SUBVECTOR of that
+ // LMUL=1 type back into the larger vector (resolving to another subregister
+ // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
+ // to avoid allocating a large register group to hold our subvector.
+
+ // VSLIDEUP works by leaving elements 0<i<OFFSET undisturbed, elements
+ // OFFSET<=i<VL set to the "subvector" and vl<=i<VLMAX set to the tail policy
+ // (in our case undisturbed). This means we can set up a subvector insertion
+ // where OFFSET is the insertion offset, and the VL is the OFFSET plus the
+ // size of the subvector.
+ const LLT XLenTy(STI.getXLenVT());
+ LLT InterLitTy = BigTy;
+ Register AlignedExtract = Src1;
+ unsigned AlignedIdx = Idx - RemIdx;
+ if (TypeSize::isKnownGT(BigTy.getSizeInBits(),
+ getLMUL1Ty(BigTy).getSizeInBits())) {
+ InterLitTy = getLMUL1Ty(BigTy);
+ // Extract a subvector equal to the nearest full vector register type. This
+ // should resolve to a G_EXTRACT on a subreg.
+ AlignedExtract =
+ MIB.buildExtractSubvector(InterLitTy, BigVec, AlignedIdx).getReg(0);
+ }
+
+ auto Insert = MIB.buildInsertSubvector(InterLitTy, MIB.buildUndef(InterLitTy),
+ LitVec, 0);
+
+ auto [Mask, _] = buildDefaultVLOps(BigTy, MIB, MRI);
+ auto VL = MIB.buildVScale(XLenTy, LitTy.getElementCount().getKnownMinValue());
+
+ // Use tail agnostic policy if we're inserting over InterLitTy's tail.
+ ElementCount EndIndex =
+ ElementCount::getScalable(RemIdx) + LitTy.getElementCount();
+ uint64_t Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
+ if (EndIndex == InterLitTy.getElementCount())
+ Policy = RISCVII::TAIL_AGNOSTIC;
+
+ // If we're inserting into the lowest elements, use a tail undisturbed
+ // vmv.v.v.
+ MachineInstrBuilder Inserted;
+ if (RemIdx == 0) {
+ Inserted = MIB.buildInstr(RISCV::G_VMV_V_V_VL, {InterLitTy},
+ {AlignedExtract, Insert, VL});
+ } else {
+ auto SlideupAmt = MIB.buildVScale(XLenTy, RemIdx);
+ // Construct the vector length corresponding to RemIdx + length(LitTy).
+ VL = MIB.buildAdd(XLenTy, SlideupAmt, VL);
+ Inserted =
+ MIB.buildInstr(RISCV::G_VSLIDEUP_VL, {InterLitTy},
+ {AlignedExtract, LitVec, SlideupAmt, Mask, VL, Policy});
+ }
+
+ // If required, insert this subvector back into the correct vector register.
+ // This should resolve to an INSERT_SUBREG instruction.
+ if (TypeSize::isKnownGT(BigTy.getSizeInBits(), InterLitTy.getSizeInBits()))
+ Inserted = MIB.buildInsert(BigTy, BigVec, LitVec, AlignedIdx);
+
+ // We might have bitcast from a mask type: cast back to the original type if
+ // required.
+ MIB.buildBitcast(Dst, Inserted);
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool RISCVLegalizerInfo::legalizeCustom(
LegalizerHelper &Helper, MachineInstr &MI,
LostDebugLocObserver &LocObserver) const {
@@ -985,6 +1139,8 @@ bool RISCVLegalizerInfo::legalizeCustom(
return legalizeExt(MI, MIRBuilder);
case TargetOpcode::G_SPLAT_VECTOR:
return legalizeSplatVector(MI, MIRBuilder);
+ case TargetOpcode::G_INSERT_SUBVECTOR:
+ return legalizeInsertSubvector(MI, MIRBuilder);
case TargetOpcode::G_LOAD:
case TargetOpcode::G_STORE:
return legalizeLoadStore(MI, Helper, MIRBuilder);
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
index 2fc28615e7630d..ccd8ac9fe4ec90 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
@@ -46,6 +46,7 @@ class RISCVLegalizerInfo : public LegalizerInfo {
bool legalizeVScale(MachineInstr &MI, MachineIRBuilder &MIB) const;
bool legalizeExt(MachineInstr &MI, MachineIRBuilder &MIRBuilder) const;
bool legalizeSplatVector(MachineInstr &MI, MachineIRBuilder &MIB) const;
+ bool legalizeInsertSubvector(MachineInstr &MI, MachineIRBuilder &MIB) const;
bool legalizeLoadStore(MachineInstr &MI, LegalizerHelper &Helper,
MachineIRBuilder &MIB) const;
};
diff --git a/llvm/lib/Target/RISCV/RISCVInstrGISel.td b/llvm/lib/Target/RISCV/RISCVInstrGISel.td
index ba40662c49c1df..948167023d50a2 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrGISel.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrGISel.td
@@ -57,3 +57,21 @@ def G_SPLAT_VECTOR_SPLIT_I64_VL : RISCVGenericInstruction {
let InOperandList = (ins type0:$passthru, type1:$hi, type1:$lo, type2:$vl);
let hasSideEffects = false;
}
+
+// Pseudo equivalent to a RISCVISD::VMV_V_V_VL
+def G_VMV_V_V_VL : RISCVGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$vec, type2:$vl);
+ let hasSideEffects = false;
+}
+def : GINodeEquiv<G_VMV_V_V_VL, riscv_vmv_v_v_vl>;
+
+// Pseudo equivalent to a RISCVISD::VSLIDEUP_VL
+def G_VSLIDEUP_VL : RISCVGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$merge, type0:$vec, type1:$idx, type2:$mask,
+ type3:$vl, type4:$policy);
+ let hasSideEffects = false;
+}
+def : GINodeEquiv<G_VSLIDEUP_VL, riscv_slideup_vl>;
+
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir
new file mode 100644
index 00000000000000..a5f1228b8f8ca6
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir
@@ -0,0 +1,402 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,RV32
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=legalizer %s -o - | FileCheck %s -check-prefixes=CHECK,RV64
+
+# Special handling for i1-element vectors with non-zero index
+---
+name: insert_subvector_nxv2i1_nxv4i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: insert_subvector_nxv2i1_nxv4i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF1]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
+ ; RV32-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C4]](s64)
+ ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
+ ; RV32-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C5]](s64)
+ ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]]
+ ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s8>) = G_VSLIDEUP_VL [[SELECT]], [[SELECT1]], [[LSHR1]](s64), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s64), 1
+ ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 4 x s8>)
+ ; RV32-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[C6]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT4]](s64)
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[BITCAST]](<vscale x 4 x s8>), [[SPLAT_VECTOR4]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: insert_subvector_nxv2i1_nxv4i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF1]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
+ ; RV64-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C4]](s32)
+ ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
+ ; RV64-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C5]](s32)
+ ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]]
+ ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s8>) = G_VSLIDEUP_VL [[SELECT]], [[SELECT1]], [[LSHR1]](s32), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s32), 1
+ ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 4 x s8>)
+ ; RV64-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C6]](s32)
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[BITCAST]](<vscale x 4 x s8>), [[SPLAT_VECTOR4]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 4 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 4 x s1>), %1, 2
+ $v8 = COPY %2(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: insert_subvector_nxv4i1_nxv8i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: insert_subvector_nxv4i1_nxv8i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
+ ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[DEF]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV32-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
+ ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF1]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
+ ; RV32-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C4]](s64)
+ ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
+ ; RV32-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C5]](s64)
+ ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]]
+ ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VSLIDEUP_VL [[SELECT]], [[SELECT1]], [[LSHR1]](s64), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s64), 0
+ ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 8 x s8>)
+ ; RV32-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[C6]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT4]](s64)
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(ne), [[BITCAST]](<vscale x 8 x s8>), [[SPLAT_VECTOR4]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: insert_subvector_nxv4i1_nxv8i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C]](s32)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
+ ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[DEF]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
+ ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF1]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
+ ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
+ ; RV64-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C4]](s32)
+ ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
+ ; RV64-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C5]](s32)
+ ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]]
+ ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VSLIDEUP_VL [[SELECT]], [[SELECT1]], [[LSHR1]](s32), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s32), 0
+ ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 8 x s8>)
+ ; RV64-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C6]](s32)
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(ne), [[BITCAST]](<vscale x 8 x s8>), [[SPLAT_VECTOR4]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 8 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s1>), %1, 2
+ $v8 = COPY %2(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: insert_subvector_nxv32i1_nxv64i1
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: insert_subvector_nxv32i1_nxv64i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s8>) = G_BITCAST [[DEF1]](<vscale x 32 x s1>)
+ ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C]](s64)
+ ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C1]](s64)
+ ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]]
+ ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[LSHR1]](s64), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s64), 0
+ ; RV32-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 64 x s1>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[BITCAST1]](<vscale x 64 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: insert_subvector_nxv32i1_nxv64i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s8>) = G_BITCAST [[DEF1]](<vscale x 32 x s1>)
+ ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C]](s32)
+ ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C1]](s32)
+ ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]]
+ ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[LSHR1]](s32), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s32), 0
+ ; RV64-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 64 x s1>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[BITCAST1]](<vscale x 64 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 64 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 64 x s1>), %1, 16
+ $v8 = COPY %2(<vscale x 64 x s1>)
+ PseudoRET implicit $v8
+...
+
+# i1-element vectors with zero index
+---
+name: insert_subvector_nxv2i1_nxv4i1_zero
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv2i1_nxv4i1_zero
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 1 x s1>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 4 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 4 x s1>), %1, 0
+ $v8 = COPY %2(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: insert_subvector_nxv4i1_nxv8i1_zero
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv4i1_nxv8i1_zero
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 2 x s1>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 8 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s1>), %1, 0
+ $v8 = COPY %2(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+...
+---
+name: insert_subvector_nxv32i1_nxv64i1_zero
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv32i1_nxv64i1_zero
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 64 x s1>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 16 x s1>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 64 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 64 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 64 x s1>), %1, 0
+ $v8 = COPY %2(<vscale x 64 x s1>)
+ PseudoRET implicit $v8
+...
+
+# Inserrt with zero index
+---
+name: insert_subvector_nxv1i8_nxv2i8_zero
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv1i8_nxv2i8_zero
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 1 x s8>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR %0(<vscale x 2 x s8>), %1, 0
+ $v8 = COPY %2(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insert_subvector_nxv2i16_nxv4i16_zero
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv2i16_nxv4i16_zero
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 1 x s16>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR %0(<vscale x 4 x s16>), %1, 0
+ $v8 = COPY %2(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insert_subvector_nxv4i32_nxv8i32_zero
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv4i32_nxv8i32_zero
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 4 x s32>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s32>), %1, 0
+ $v8 = COPY %2(<vscale x 8 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: insert_subvector_nxv2i64_nxv8i64_zero
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv2i64_nxv8i64_zero
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 2 x s64>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s64>), %1, 0
+ $v8 = COPY %2(<vscale x 8 x s64>)
+ PseudoRET implicit $v8
+...
+
+# Extract with non-zero index
+---
+name: insert_subvector_nxv1i8_nxv2i8
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv1i8_nxv2i8
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 1 x s8>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR %0(<vscale x 2 x s8>), %1, 0
+ $v8 = COPY %2(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+...
+---
+name: insert_subvector_nxv2i16_nxv4i16
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv2i16_nxv4i16
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 1 x s16>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR %0(<vscale x 4 x s16>), %1, 0
+ $v8 = COPY %2(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+...
+---
+name: insert_subvector_nxv4i32_nxv8i32
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv4i32_nxv8i32
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 4 x s32>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s32>), %1, 0
+ $v8 = COPY %2(<vscale x 8 x s32>)
+ PseudoRET implicit $v8
+...
+---
+name: insert_subvector_nxv2i64_nxv8i64
+legalized: false
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv2i64_nxv8i64
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 1 x s64>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s64>), %1, 0
+ $v8 = COPY %2(<vscale x 8 x s64>)
+ PseudoRET implicit $v8
+...
>From 7e8e7d4bbef30e8cd48eb3489a657b5e60156954 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Mon, 16 Sep 2024 11:32:15 -0700
Subject: [PATCH 2/7] [GISEL] Add GInsertSubvector
---
.../llvm/CodeGen/GlobalISel/GenericMachineInstrs.h | 12 ++++++++++++
1 file changed, 12 insertions(+)
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
index 132b7ec9aeef7c..e9305e0431ab91 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/GenericMachineInstrs.h
@@ -800,6 +800,18 @@ class GInsertVectorElement : public GenericMachineInstr {
}
};
+/// Represents an insert subvector.
+class GInsertSubvector : public GenericMachineInstr {
+public:
+ Register getBigVec() const { return getOperand(1).getReg(); }
+ Register getSubVec() const { return getOperand(2).getReg(); }
+ uint64_t getIndexImm() const { return getOperand(3).getImm(); }
+
+ static bool classof(const MachineInstr *MI) {
+ return MI->getOpcode() == TargetOpcode::G_INSERT_SUBVECTOR;
+ }
+};
+
/// Represents a freeze.
class GFreeze : public GenericMachineInstr {
public:
>From 0204adf33ed35547418c6336dcb61fa2b80994d6 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Mon, 16 Sep 2024 11:33:42 -0700
Subject: [PATCH 3/7] [GISEL] Fix MachineVerifier for G_INSERT_SUBVECTOR
---
llvm/lib/CodeGen/MachineVerifier.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 6eed73c15f091d..1fcbeeec6f64cc 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -1739,7 +1739,7 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
}
if (IndexOp.getImm() != 0 &&
- Src1Ty.getElementCount().getKnownMinValue() % IndexOp.getImm() != 0) {
+ IndexOp.getImm() % Src1Ty.getElementCount().getKnownMinValue() != 0) {
report("Index must be a multiple of the second source vector's "
"minimum vector length",
MI);
>From 9754903e7518aceebad6a5790dec8ea9ca1f63bd Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Mon, 16 Sep 2024 11:33:53 -0700
Subject: [PATCH 4/7] fixup! respond to reviews
---
.../Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 16 +-
.../rvv/legalize-insert-subvector.mir | 293 ++++++++++--------
2 files changed, 172 insertions(+), 137 deletions(-)
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 8396316dc47fdc..14d7c78e80e47f 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -931,14 +931,14 @@ static LLT getLMUL1Ty(LLT VecTy) {
bool RISCVLegalizerInfo::legalizeInsertSubvector(MachineInstr &MI,
MachineIRBuilder &MIB) const {
- assert(MI.getOpcode() == TargetOpcode::G_INSERT_SUBVECTOR);
+ GInsertSubvector &IS = cast<GInsertSubvector>(MI);
MachineRegisterInfo &MRI = *MIB.getMRI();
- Register Dst = MI.getOperand(0).getReg();
- Register Src1 = MI.getOperand(1).getReg();
- Register Src2 = MI.getOperand(2).getReg();
- uint64_t Idx = MI.getOperand(3).getImm();
+ Register Dst = IS.getOperand(0).getReg();
+ Register Src1 = IS.getBigVec();
+ Register Src2 = IS.getSubVec();
+ uint64_t Idx = IS.getIndexImm();
LLT BigTy = MRI.getType(Src1);
LLT LitTy = MRI.getType(Src2);
@@ -989,9 +989,7 @@ bool RISCVLegalizerInfo::legalizeInsertSubvector(MachineInstr &MI,
getMVTForLLT(BigTy), LitTyMVT, Idx, TRI);
RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(LitTy));
- bool IsSubVecPartReg = SubVecLMUL == RISCVII::VLMUL::LMUL_F2 ||
- SubVecLMUL == RISCVII::VLMUL::LMUL_F4 ||
- SubVecLMUL == RISCVII::VLMUL::LMUL_F8;
+ bool IsSubVecPartReg = !RISCVVType::decodeVLMUL(SubVecLMUL).second;
// If the Idx has been completely eliminated and this subvector's size is a
// vector register or a multiple thereof, or the surrounding elements are
@@ -1059,7 +1057,7 @@ bool RISCVLegalizerInfo::legalizeInsertSubvector(MachineInstr &MI,
// If required, insert this subvector back into the correct vector register.
// This should resolve to an INSERT_SUBREG instruction.
if (TypeSize::isKnownGT(BigTy.getSizeInBits(), InterLitTy.getSizeInBits()))
- Inserted = MIB.buildInsert(BigTy, BigVec, LitVec, AlignedIdx);
+ Inserted = MIB.buildInsertSubvector(BigTy, BigVec, LitVec, AlignedIdx);
// We might have bitcast from a mask type: cast back to the original type if
// required.
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir
index a5f1228b8f8ca6..598c2e2a142cb4 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir
@@ -11,7 +11,6 @@ body: |
bb.0.entry:
; RV32-LABEL: name: insert_subvector_nxv2i1_nxv4i1
; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
- ; RV32-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
@@ -19,56 +18,43 @@ body: |
; RV32-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
- ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; RV32-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
- ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
- ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; RV32-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
- ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
- ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF1]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
- ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C4]](s64)
+ ; RV32-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C2]](s64)
; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
- ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C5]](s64)
+ ; RV32-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C3]](s64)
; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]]
- ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s8>) = G_VSLIDEUP_VL [[SELECT]], [[SELECT1]], [[LSHR1]](s64), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s64), 1
+ ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s8>) = G_VSLIDEUP_VL [[SELECT]], [[SELECT]], [[LSHR1]](s64), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s64), 0
; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 4 x s8>)
- ; RV32-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; RV32-NEXT: [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[C6]](s32)
- ; RV32-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT4]](s64)
- ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[BITCAST]](<vscale x 4 x s8>), [[SPLAT_VECTOR4]]
+ ; RV32-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV32-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C4]](s32)
+ ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[BITCAST]](<vscale x 4 x s8>), [[SPLAT_VECTOR2]]
; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
; RV32-NEXT: PseudoRET implicit $v8
;
; RV64-LABEL: name: insert_subvector_nxv2i1_nxv4i1
; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
- ; RV64-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s32)
; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
- ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
- ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
- ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF1]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C4]](s32)
+ ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C2]](s32)
; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C5]](s32)
+ ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C3]](s32)
; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]]
- ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s8>) = G_VSLIDEUP_VL [[SELECT]], [[SELECT1]], [[LSHR1]](s32), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s32), 1
+ ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s8>) = G_VSLIDEUP_VL [[SELECT]], [[SELECT]], [[LSHR1]](s32), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s32), 0
; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 4 x s8>)
- ; RV64-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; RV64-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C6]](s32)
- ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[BITCAST]](<vscale x 4 x s8>), [[SPLAT_VECTOR4]]
+ ; RV64-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C4]](s32)
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[BITCAST]](<vscale x 4 x s8>), [[SPLAT_VECTOR2]]
; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
; RV64-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
@@ -85,69 +71,38 @@ body: |
bb.0.entry:
; RV32-LABEL: name: insert_subvector_nxv4i1_nxv8i1
; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
- ; RV32-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
- ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
- ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
- ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; RV32-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
- ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
- ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[DEF]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
- ; RV32-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; RV32-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C2]](s32)
- ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
- ; RV32-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; RV32-NEXT: [[ANYEXT3:%[0-9]+]]:_(s64) = G_ANYEXT [[C3]](s32)
- ; RV32-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[ANYEXT3]](s64)
- ; RV32-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF1]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
- ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 1 x s8>) = G_BITCAST [[DEF]](<vscale x 8 x s1>)
+ ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
- ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C4]](s64)
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+ ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C]](s64)
; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
- ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C5]](s64)
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+ ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C1]](s64)
; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]]
- ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VSLIDEUP_VL [[SELECT]], [[SELECT1]], [[LSHR1]](s64), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s64), 0
- ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 8 x s8>)
- ; RV32-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; RV32-NEXT: [[ANYEXT4:%[0-9]+]]:_(s64) = G_ANYEXT [[C6]](s32)
- ; RV32-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[ANYEXT4]](s64)
- ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(ne), [[BITCAST]](<vscale x 8 x s8>), [[SPLAT_VECTOR4]]
- ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 1 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[LSHR1]](s64), [[VMSET_VL]](<vscale x 1 x s1>), [[ADD]](s64), 0
+ ; RV32-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 8 x s1>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[BITCAST1]](<vscale x 8 x s1>)
; RV32-NEXT: PseudoRET implicit $v8
;
; RV64-LABEL: name: insert_subvector_nxv4i1_nxv8i1
; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
- ; RV64-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
- ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C]](s32)
- ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
- ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SELECT [[DEF]](<vscale x 8 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
- ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C2]](s32)
- ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; RV64-NEXT: [[SPLAT_VECTOR3:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SPLAT_VECTOR [[C3]](s32)
- ; RV64-NEXT: [[SELECT1:%[0-9]+]]:_(<vscale x 2 x s8>) = G_SELECT [[DEF1]](<vscale x 2 x s1>), [[SPLAT_VECTOR3]], [[SPLAT_VECTOR2]]
- ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 1 x s8>) = G_BITCAST [[DEF]](<vscale x 8 x s1>)
+ ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C4]](s32)
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C]](s32)
; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C5]](s32)
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C1]](s32)
; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]]
- ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VSLIDEUP_VL [[SELECT]], [[SELECT1]], [[LSHR1]](s32), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s32), 0
- ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 8 x s8>)
- ; RV64-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; RV64-NEXT: [[SPLAT_VECTOR4:%[0-9]+]]:_(<vscale x 8 x s8>) = G_SPLAT_VECTOR [[C6]](s32)
- ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 8 x s1>) = G_ICMP intpred(ne), [[BITCAST]](<vscale x 8 x s8>), [[SPLAT_VECTOR4]]
- ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 8 x s1>)
+ ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 1 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[LSHR1]](s32), [[VMSET_VL]](<vscale x 1 x s1>), [[ADD]](s32), 0
+ ; RV64-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 8 x s1>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[BITCAST1]](<vscale x 8 x s1>)
; RV64-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
%1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
- %2:_(<vscale x 8 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s1>), %1, 2
+ %2:_(<vscale x 8 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s1>), %1, 8
$v8 = COPY %2(<vscale x 8 x s1>)
PseudoRET implicit $v8
...
@@ -159,40 +114,34 @@ body: |
bb.0.entry:
; RV32-LABEL: name: insert_subvector_nxv32i1_nxv64i1
; RV32: [[DEF:%[0-9]+]]:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
- ; RV32-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
- ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s8>) = G_BITCAST [[DEF1]](<vscale x 32 x s1>)
+ ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[DEF]](<vscale x 64 x s1>)
; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
- ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C]](s64)
; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
- ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C1]](s64)
- ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]]
- ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[LSHR1]](s64), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s64), 0
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C]](s64)
+ ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR]], [[READ_VLENB]]
+ ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[LSHR]](s64), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s64), 0
; RV32-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 64 x s1>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 8 x s8>)
; RV32-NEXT: $v8 = COPY [[BITCAST1]](<vscale x 64 x s1>)
; RV32-NEXT: PseudoRET implicit $v8
;
; RV64-LABEL: name: insert_subvector_nxv32i1_nxv64i1
; RV64: [[DEF:%[0-9]+]]:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
- ; RV64-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
- ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s8>) = G_BITCAST [[DEF1]](<vscale x 32 x s1>)
+ ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[DEF]](<vscale x 64 x s1>)
; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C]](s32)
; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C1]](s32)
- ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]]
- ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[LSHR1]](s32), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s32), 0
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C]](s32)
+ ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR]], [[READ_VLENB]]
+ ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[LSHR]](s32), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s32), 0
; RV64-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 64 x s1>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 8 x s8>)
; RV64-NEXT: $v8 = COPY [[BITCAST1]](<vscale x 64 x s1>)
; RV64-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
%1:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
- %2:_(<vscale x 64 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 64 x s1>), %1, 16
+ %2:_(<vscale x 64 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 64 x s1>), %1, 32
$v8 = COPY %2(<vscale x 64 x s1>)
PseudoRET implicit $v8
...
@@ -253,7 +202,7 @@ body: |
PseudoRET implicit $v8
...
-# Inserrt with zero index
+# Insert with zero index
---
name: insert_subvector_nxv1i8_nxv2i8_zero
legalized: false
@@ -327,22 +276,45 @@ body: |
PseudoRET implicit $v8
...
-# Extract with non-zero index
+# Insert with non-zero index
---
name: insert_subvector_nxv1i8_nxv2i8
legalized: false
tracksRegLiveness: true
body: |
bb.0.entry:
- ; CHECK-LABEL: name: insert_subvector_nxv1i8_nxv2i8
- ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 1 x s8>), 0
- ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 2 x s8>)
- ; CHECK-NEXT: PseudoRET implicit $v8
+ ; RV32-LABEL: name: insert_subvector_nxv1i8_nxv2i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+ ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C]](s64)
+ ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+ ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C1]](s64)
+ ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]]
+ ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 2 x s8>) = G_VSLIDEUP_VL [[DEF]], [[DEF]], [[LSHR1]](s64), [[VMSET_VL]](<vscale x 2 x s1>), [[ADD]](s64), 0
+ ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 2 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[BITCAST]](<vscale x 2 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: insert_subvector_nxv1i8_nxv2i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
+ ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C]](s32)
+ ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C1]](s32)
+ ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]]
+ ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 2 x s8>) = G_VSLIDEUP_VL [[DEF]], [[DEF]], [[LSHR1]](s32), [[VMSET_VL]](<vscale x 2 x s1>), [[ADD]](s32), 0
+ ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 2 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[BITCAST]](<vscale x 2 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
%1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
- %2:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR %0(<vscale x 2 x s8>), %1, 0
+ %2:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR %0(<vscale x 2 x s8>), %1, 1
$v8 = COPY %2(<vscale x 2 x s8>)
PseudoRET implicit $v8
...
@@ -352,15 +324,38 @@ legalized: false
tracksRegLiveness: true
body: |
bb.0.entry:
- ; CHECK-LABEL: name: insert_subvector_nxv2i16_nxv4i16
- ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 1 x s16>), 0
- ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 4 x s16>)
- ; CHECK-NEXT: PseudoRET implicit $v8
+ ; RV32-LABEL: name: insert_subvector_nxv2i16_nxv4i16
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C]](s64)
+ ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
+ ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
+ ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C1]](s64)
+ ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]]
+ ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s16>) = G_VSLIDEUP_VL [[DEF]], [[DEF]], [[LSHR1]](s64), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s64), 0
+ ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s16>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 4 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[BITCAST]](<vscale x 4 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: insert_subvector_nxv2i16_nxv4i16
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C]](s32)
+ ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
+ ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
+ ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C1]](s32)
+ ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]]
+ ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s16>) = G_VSLIDEUP_VL [[DEF]], [[DEF]], [[LSHR1]](s32), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s32), 0
+ ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s16>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 4 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[BITCAST]](<vscale x 4 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
%1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
- %2:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR %0(<vscale x 4 x s16>), %1, 0
+ %2:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR %0(<vscale x 4 x s16>), %1, 1
$v8 = COPY %2(<vscale x 4 x s16>)
PseudoRET implicit $v8
...
@@ -370,15 +365,34 @@ legalized: false
tracksRegLiveness: true
body: |
bb.0.entry:
- ; CHECK-LABEL: name: insert_subvector_nxv4i32_nxv8i32
- ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 4 x s32>), 0
- ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 8 x s32>)
- ; CHECK-NEXT: PseudoRET implicit $v8
+ ; RV32-LABEL: name: insert_subvector_nxv4i32_nxv8i32
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_EXTRACT_SUBVECTOR [[DEF]], 0
+ ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
+ ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
+ ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[READ_VLENB1]], [[READ_VLENB]]
+ ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 2 x s32>) = G_VSLIDEUP_VL [[EXTRACT_SUBVECTOR]], [[DEF]], [[READ_VLENB1]](s64), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s64), 0
+ ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF]](<vscale x 8 x s32>), 0
+ ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s32>) = G_BITCAST [[INSERT_SUBVECTOR]](<vscale x 8 x s32>)
+ ; RV32-NEXT: $v8 = COPY [[BITCAST]](<vscale x 8 x s32>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: insert_subvector_nxv4i32_nxv8i32
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_EXTRACT_SUBVECTOR [[DEF]], 0
+ ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
+ ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
+ ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[READ_VLENB1]], [[READ_VLENB]]
+ ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 2 x s32>) = G_VSLIDEUP_VL [[EXTRACT_SUBVECTOR]], [[DEF]], [[READ_VLENB1]](s32), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s32), 0
+ ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF]](<vscale x 8 x s32>), 0
+ ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s32>) = G_BITCAST [[INSERT_SUBVECTOR]](<vscale x 8 x s32>)
+ ; RV64-NEXT: $v8 = COPY [[BITCAST]](<vscale x 8 x s32>)
+ ; RV64-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
%1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
- %2:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s32>), %1, 0
+ %2:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s32>), %1, 8
$v8 = COPY %2(<vscale x 8 x s32>)
PseudoRET implicit $v8
...
@@ -388,15 +402,38 @@ legalized: false
tracksRegLiveness: true
body: |
bb.0.entry:
- ; CHECK-LABEL: name: insert_subvector_nxv2i64_nxv8i64
- ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 1 x s64>), 0
- ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 8 x s64>)
- ; CHECK-NEXT: PseudoRET implicit $v8
+ ; RV32-LABEL: name: insert_subvector_nxv2i64_nxv8i64
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_EXTRACT_SUBVECTOR [[DEF]], 0
+ ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
+ ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+ ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C]](s64)
+ ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR]], [[READ_VLENB]]
+ ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 1 x s64>) = G_VSLIDEUP_VL [[EXTRACT_SUBVECTOR]], [[DEF]], [[LSHR]](s64), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s64), 0
+ ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF]](<vscale x 8 x s64>), 0
+ ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s64>) = G_BITCAST [[INSERT_SUBVECTOR]](<vscale x 8 x s64>)
+ ; RV32-NEXT: $v8 = COPY [[BITCAST]](<vscale x 8 x s64>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: insert_subvector_nxv2i64_nxv8i64
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_EXTRACT_SUBVECTOR [[DEF]], 0
+ ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
+ ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
+ ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C]](s32)
+ ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR]], [[READ_VLENB]]
+ ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 1 x s64>) = G_VSLIDEUP_VL [[EXTRACT_SUBVECTOR]], [[DEF]], [[LSHR]](s32), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s32), 0
+ ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF]](<vscale x 8 x s64>), 0
+ ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s64>) = G_BITCAST [[INSERT_SUBVECTOR]](<vscale x 8 x s64>)
+ ; RV64-NEXT: $v8 = COPY [[BITCAST]](<vscale x 8 x s64>)
+ ; RV64-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
%1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
- %2:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s64>), %1, 0
+ %2:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s64>), %1, 4
$v8 = COPY %2(<vscale x 8 x s64>)
PseudoRET implicit $v8
...
>From 86c8a0674bf465566459c4c924d8a626180860e3 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Mon, 16 Sep 2024 12:10:10 -0700
Subject: [PATCH 5/7] fixup! use getReg
---
llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index 14d7c78e80e47f..b1bb4ce273dd51 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -935,7 +935,7 @@ bool RISCVLegalizerInfo::legalizeInsertSubvector(MachineInstr &MI,
MachineRegisterInfo &MRI = *MIB.getMRI();
- Register Dst = IS.getOperand(0).getReg();
+ Register Dst = IS.getReg(0);
Register Src1 = IS.getBigVec();
Register Src2 = IS.getSubVec();
uint64_t Idx = IS.getIndexImm();
>From 844455c830e3e673fa82e01cdeb9b84ce304791f Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 17 Sep 2024 08:20:56 -0700
Subject: [PATCH 6/7] [RISCV][GISEL] Introduce the RISCVPostLegalizerLowering
pass
This is mostly a copy of the AArch64PostLegalizerLowering pass, with all of the
AArch64-specific combines removed.
This pass allows us to lower instructions after the generic post-legalization
combiner has had a chance to run.
We will be adding combines to this pass in future patches.
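
As context for how rules will plug into this pass: a combine is declared in
RISCVCombine.td and dispatches to a C++ helper in RISCVPostLegalizerLowering.cpp.
A minimal sketch follows; the G_FOO opcode and lowerFoo helper are hypothetical
and only illustrate the shape (a later patch in this series adds the first real
rule, for G_INSERT_SUBVECTOR):

  // Hypothetical lowering rule: match an instruction and hand it to a C++
  // helper that rewrites it in place.
  def lower_foo : GICombineRule<
    (defs root:$root),
    (match (G_FOO $dst, $src):$root),
    (apply [{ lowerFoo(*${root}, STI); }])>;

  // The rule is then listed in the pass's combiner definition:
  def RISCVPostLegalizerLowering
      : GICombiner<"RISCVPostLegalizerLoweringImpl", [lower_foo]> {
  }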
---
llvm/lib/Target/RISCV/CMakeLists.txt | 3 +
.../GISel/RISCVPostLegalizerLowering.cpp | 154 ++++++++++++++++++
llvm/lib/Target/RISCV/RISCV.h | 3 +
llvm/lib/Target/RISCV/RISCVCombine.td | 7 +
llvm/lib/Target/RISCV/RISCVTargetMachine.cpp | 2 +
.../GlobalISel/gisel-commandline-option.ll | 1 +
6 files changed, 170 insertions(+)
create mode 100644 llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerLowering.cpp
diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index 124bc239451aed..7d02d630a28199 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -24,6 +24,8 @@ tablegen(LLVM RISCVGenPreLegalizeGICombiner.inc -gen-global-isel-combiner
-combiners="RISCVPreLegalizerCombiner")
tablegen(LLVM RISCVGenPostLegalizeGICombiner.inc -gen-global-isel-combiner
-combiners="RISCVPostLegalizerCombiner")
+tablegen(LLVM RISCVGenPostLegalizeGILowering.inc -gen-global-isel-combiner
+ -combiners="RISCVPostLegalizerLowering")
add_public_tablegen_target(RISCVCommonTableGen)
@@ -63,6 +65,7 @@ add_llvm_target(RISCVCodeGen
GISel/RISCVInstructionSelector.cpp
GISel/RISCVLegalizerInfo.cpp
GISel/RISCVPostLegalizerCombiner.cpp
+ GISel/RISCVPostLegalizerLowering.cpp
GISel/RISCVO0PreLegalizerCombiner.cpp
GISel/RISCVPreLegalizerCombiner.cpp
GISel/RISCVRegisterBankInfo.cpp
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerLowering.cpp
new file mode 100644
index 00000000000000..66db15e3a2e28c
--- /dev/null
+++ b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerLowering.cpp
@@ -0,0 +1,154 @@
+//===--------------- RISCVPostLegalizerLowering.cpp -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Post-legalization lowering for instructions.
+///
+/// This is used to offload pattern matching from the selector.
+///
+/// General optimization combines should be handled by either the
+/// RISCVPostLegalizerCombiner or the RISCVPreLegalizerCombiner.
+///
+//===----------------------------------------------------------------------===//
+
+#include "RISCVSubtarget.h"
+
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+#define GET_GICOMBINER_DEPS
+#include "RISCVGenPostLegalizeGILowering.inc"
+#undef GET_GICOMBINER_DEPS
+
+#define DEBUG_TYPE "riscv-postlegalizer-lowering"
+
+using namespace llvm;
+
+namespace {
+
+#define GET_GICOMBINER_TYPES
+#include "RISCVGenPostLegalizeGILowering.inc"
+#undef GET_GICOMBINER_TYPES
+
+class RISCVPostLegalizerLoweringImpl : public Combiner {
+protected:
+ // TODO: Make CombinerHelper methods const.
+ mutable CombinerHelper Helper;
+ const RISCVPostLegalizerLoweringImplRuleConfig &RuleConfig;
+ const RISCVSubtarget &STI;
+
+public:
+ RISCVPostLegalizerLoweringImpl(
+ MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
+ GISelCSEInfo *CSEInfo,
+ const RISCVPostLegalizerLoweringImplRuleConfig &RuleConfig,
+ const RISCVSubtarget &STI);
+
+  static const char *getName() { return "RISCVPostLegalizerLowering"; }
+
+ bool tryCombineAll(MachineInstr &I) const override;
+
+private:
+#define GET_GICOMBINER_CLASS_MEMBERS
+#include "RISCVGenPostLegalizeGILowering.inc"
+#undef GET_GICOMBINER_CLASS_MEMBERS
+};
+
+#define GET_GICOMBINER_IMPL
+#include "RISCVGenPostLegalizeGILowering.inc"
+#undef GET_GICOMBINER_IMPL
+
+RISCVPostLegalizerLoweringImpl::RISCVPostLegalizerLoweringImpl(
+ MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
+ GISelCSEInfo *CSEInfo,
+ const RISCVPostLegalizerLoweringImplRuleConfig &RuleConfig,
+ const RISCVSubtarget &STI)
+ : Combiner(MF, CInfo, TPC, /*KB*/ nullptr, CSEInfo),
+ Helper(Observer, B, /*IsPreLegalize*/ true), RuleConfig(RuleConfig),
+ STI(STI),
+#define GET_GICOMBINER_CONSTRUCTOR_INITS
+#include "RISCVGenPostLegalizeGILowering.inc"
+#undef GET_GICOMBINER_CONSTRUCTOR_INITS
+{
+}
+
+class RISCVPostLegalizerLowering : public MachineFunctionPass {
+public:
+ static char ID;
+
+ RISCVPostLegalizerLowering();
+
+ StringRef getPassName() const override {
+ return "RISCVPostLegalizerLowering";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+ RISCVPostLegalizerLoweringImplRuleConfig RuleConfig;
+};
+} // end anonymous namespace
+
+void RISCVPostLegalizerLowering::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+RISCVPostLegalizerLowering::RISCVPostLegalizerLowering()
+ : MachineFunctionPass(ID) {
+ if (!RuleConfig.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
+}
+
+bool RISCVPostLegalizerLowering::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+ assert(MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Legalized) &&
+ "Expected a legalized function?");
+ auto *TPC = &getAnalysis<TargetPassConfig>();
+ const Function &F = MF.getFunction();
+
+ const RISCVSubtarget &ST = MF.getSubtarget<RISCVSubtarget>();
+ CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
+ /*LegalizerInfo*/ nullptr, /*OptEnabled=*/true,
+ F.hasOptSize(), F.hasMinSize());
+ // Disable fixed-point iteration to reduce compile-time
+ CInfo.MaxIterations = 1;
+ CInfo.ObserverLvl = CombinerInfo::ObserverLevel::SinglePass;
+ // PostLegalizerCombiner performs DCE, so a full DCE pass is unnecessary.
+ CInfo.EnableFullDCE = false;
+ RISCVPostLegalizerLoweringImpl Impl(MF, CInfo, TPC, /*CSEInfo*/ nullptr,
+ RuleConfig, ST);
+ return Impl.combineMachineInstrs();
+}
+
+char RISCVPostLegalizerLowering::ID = 0;
+INITIALIZE_PASS_BEGIN(RISCVPostLegalizerLowering, DEBUG_TYPE,
+ "Lower RISC-V MachineInstrs after legalization", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(RISCVPostLegalizerLowering, DEBUG_TYPE,
+ "Lower RISC-V MachineInstrs after legalization", false,
+ false)
+
+namespace llvm {
+FunctionPass *createRISCVPostLegalizerLowering() {
+ return new RISCVPostLegalizerLowering();
+}
+} // end namespace llvm
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index 5a94ada8f8dd46..561e4895e19428 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -99,6 +99,9 @@ void initializeRISCVO0PreLegalizerCombinerPass(PassRegistry &);
FunctionPass *createRISCVPreLegalizerCombiner();
void initializeRISCVPreLegalizerCombinerPass(PassRegistry &);
+
+FunctionPass *createRISCVPostLegalizerLowering();
+void initializeRISCVPostLegalizerLoweringPass(PassRegistry &);
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/RISCV/RISCVCombine.td b/llvm/lib/Target/RISCV/RISCVCombine.td
index 3a5afb1b075c9e..d48698ae6f2bfb 100644
--- a/llvm/lib/Target/RISCV/RISCVCombine.td
+++ b/llvm/lib/Target/RISCV/RISCVCombine.td
@@ -19,6 +19,13 @@ def RISCVO0PreLegalizerCombiner: GICombiner<
"RISCVO0PreLegalizerCombinerImpl", [optnone_combines]> {
}
+// Post-legalization combines which should happen at all optimization levels
+// (e.g. ones that facilitate matching for the selector, such as matching
+// pseudos).
+def RISCVPostLegalizerLowering
+ : GICombiner<"RISCVPostLegalizerLoweringImpl", []> {
+}
+
// Post-legalization combines which are primarily optimizations.
// TODO: Add more combines.
def RISCVPostLegalizerCombiner
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 794df2212dfa53..5cb9fbc8a9eb98 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -111,6 +111,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTarget() {
initializeRISCVO0PreLegalizerCombinerPass(*PR);
initializeRISCVPreLegalizerCombinerPass(*PR);
initializeRISCVPostLegalizerCombinerPass(*PR);
+ initializeRISCVPostLegalizerLoweringPass(*PR);
initializeKCFIPass(*PR);
initializeRISCVDeadRegisterDefinitionsPass(*PR);
initializeRISCVMakeCompressibleOptPass(*PR);
@@ -482,6 +483,7 @@ bool RISCVPassConfig::addLegalizeMachineIR() {
void RISCVPassConfig::addPreRegBankSelect() {
if (getOptLevel() != CodeGenOptLevel::None)
addPass(createRISCVPostLegalizerCombiner());
+ addPass(createRISCVPostLegalizerLowering());
}
bool RISCVPassConfig::addRegBankSelect() {
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/gisel-commandline-option.ll b/llvm/test/CodeGen/RISCV/GlobalISel/gisel-commandline-option.ll
index 82c66c23b26ba9..75607bb81ff513 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/gisel-commandline-option.ll
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/gisel-commandline-option.ll
@@ -23,6 +23,7 @@
; ENABLED-NEXT: Legalizer
; ENABLED-O1-NEXT: MachineDominator Tree Construction
; ENABLED-O1-NEXT: RISCVPostLegalizerCombiner
+; ENABLED-NEXT: RISCVPostLegalizerLowering
; ENABLED-NEXT: RegBankSelect
; ENABLED-NEXT: Analysis for ComputingKnownBits
; ENABLED-O1-NEXT: Lazy Branch Probability Analysis
>From 7fbcc48217cbaf991f38841879b42527f56ca552 Mon Sep 17 00:00:00 2001
From: Michael Maitland <michaeltmaitland at gmail.com>
Date: Tue, 17 Sep 2024 10:13:25 -0700
Subject: [PATCH 7/7] fixup! move to postlegalize lowering
---
.../Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 152 +-------
.../Target/RISCV/GISel/RISCVLegalizerInfo.h | 1 -
.../GISel/RISCVPostLegalizerLowering.cpp | 178 +++++++++
llvm/lib/Target/RISCV/RISCVCombine.td | 8 +-
.../rvv/legalize-insert-subvector.mir | 260 ++-----------
.../rvv/insert-subvector.mir | 368 ++++++++++++++++++
6 files changed, 597 insertions(+), 370 deletions(-)
create mode 100644 llvm/test/CodeGen/RISCV/GlobalISel/postlegalizer-lowering/rvv/insert-subvector.mir
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
index b1bb4ce273dd51..d03a0939ab1af3 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp
@@ -582,9 +582,9 @@ RISCVLegalizerInfo::RISCVLegalizerInfo(const RISCVSubtarget &ST)
SplatActions.clampScalar(1, sXLen, sXLen);
getActionDefinitionsBuilder(G_INSERT_SUBVECTOR)
- .customIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST),
+ .legalIf(all(typeIsLegalBoolVec(0, BoolVecTys, ST),
typeIsLegalBoolVec(1, BoolVecTys, ST)))
- .customIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
+ .legalIf(all(typeIsLegalIntOrFPVec(0, IntOrFPVecTys, ST),
typeIsLegalIntOrFPVec(1, IntOrFPVecTys, ST)));
getLegacyLegalizerInfo().computeTables();
@@ -921,152 +921,6 @@ bool RISCVLegalizerInfo::legalizeSplatVector(MachineInstr &MI,
return true;
}
-static LLT getLMUL1Ty(LLT VecTy) {
- assert(VecTy.getElementType().getSizeInBits() <= 64 &&
- "Unexpected vector LLT");
- return LLT::scalable_vector(RISCV::RVVBitsPerBlock /
- VecTy.getElementType().getSizeInBits(),
- VecTy.getElementType());
-}
-
-bool RISCVLegalizerInfo::legalizeInsertSubvector(MachineInstr &MI,
- MachineIRBuilder &MIB) const {
- GInsertSubvector &IS = cast<GInsertSubvector>(MI);
-
- MachineRegisterInfo &MRI = *MIB.getMRI();
-
- Register Dst = IS.getReg(0);
- Register Src1 = IS.getBigVec();
- Register Src2 = IS.getSubVec();
- uint64_t Idx = IS.getIndexImm();
-
- LLT BigTy = MRI.getType(Src1);
- LLT LitTy = MRI.getType(Src2);
- Register BigVec = Src1;
- Register LitVec = Src2;
-
- // We don't have the ability to slide mask vectors up indexed by their i1
- // elements; the smallest we can do is i8. Often we are able to bitcast to
- // equivalent i8 vectors. Otherwise, we can must zeroextend to equivalent i8
- // vectors and truncate down after the insert.
- if (LitTy.getElementType() == LLT::scalar(1) &&
- (Idx != 0 ||
- MRI.getVRegDef(BigVec)->getOpcode() != TargetOpcode::G_IMPLICIT_DEF)) {
- auto BigTyMinElts = BigTy.getElementCount().getKnownMinValue();
- auto LitTyMinElts = LitTy.getElementCount().getKnownMinValue();
- if (BigTyMinElts >= 8 && LitTyMinElts >= 8) {
- assert(Idx % 8 == 0 && "Invalid index");
- assert(BigTyMinElts % 8 == 0 && LitTyMinElts % 8 == 0 &&
- "Unexpected mask vector lowering");
- Idx /= 8;
- BigTy = LLT::vector(BigTy.getElementCount().divideCoefficientBy(8), 8);
- LitTy = LLT::vector(LitTy.getElementCount().divideCoefficientBy(8), 8);
- BigVec = MIB.buildBitcast(BigTy, BigVec).getReg(0);
- LitVec = MIB.buildBitcast(LitTy, LitVec).getReg(0);
- } else {
- // We can't slide this mask vector up indexed by its i1 elements.
- // This poses a problem when we wish to insert a scalable vector which
- // can't be re-expressed as a larger type. Just choose the slow path and
- // extend to a larger type, then truncate back down.
- LLT ExtBigTy = BigTy.changeElementType(LLT::scalar(8));
- LLT ExtLitTy = LitTy.changeElementType(LLT::scalar(8));
- auto BigZExt = MIB.buildZExt(ExtBigTy, BigVec);
- auto LitZExt = MIB.buildZExt(ExtLitTy, LitVec);
- auto Insert = MIB.buildInsertSubvector(ExtBigTy, BigZExt, LitZExt, Idx);
- auto SplatZero = MIB.buildSplatVector(
- ExtBigTy, MIB.buildConstant(ExtBigTy.getElementType(), 0));
- MIB.buildICmp(CmpInst::Predicate::ICMP_NE, Dst, Insert, SplatZero);
- MI.eraseFromParent();
- return true;
- }
- }
-
- const RISCVRegisterInfo *TRI = STI.getRegisterInfo();
- MVT LitTyMVT = getMVTForLLT(LitTy);
- unsigned SubRegIdx, RemIdx;
- std::tie(SubRegIdx, RemIdx) =
- RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
- getMVTForLLT(BigTy), LitTyMVT, Idx, TRI);
-
- RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(LitTy));
- bool IsSubVecPartReg = !RISCVVType::decodeVLMUL(SubVecLMUL).second;
-
- // If the Idx has been completely eliminated and this subvector's size is a
- // vector register or a multiple thereof, or the surrounding elements are
- // undef, then this is a subvector insert which naturally aligns to a vector
- // register. These can easily be handled using subregister manipulation.
- if (RemIdx == 0 && (!IsSubVecPartReg || MRI.getVRegDef(Src1)->getOpcode() ==
- TargetOpcode::G_IMPLICIT_DEF))
- return true;
-
- // If the subvector is smaller than a vector register, then the insertion
- // must preserve the undisturbed elements of the register. We do this by
- // lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type
- // (which resolves to a subregister copy), performing a VSLIDEUP to place the
- // subvector within the vector register, and an INSERT_SUBVECTOR of that
- // LMUL=1 type back into the larger vector (resolving to another subregister
- // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
- // to avoid allocating a large register group to hold our subvector.
-
- // VSLIDEUP works by leaving elements 0<i<OFFSET undisturbed, elements
- // OFFSET<=i<VL set to the "subvector" and vl<=i<VLMAX set to the tail policy
- // (in our case undisturbed). This means we can set up a subvector insertion
- // where OFFSET is the insertion offset, and the VL is the OFFSET plus the
- // size of the subvector.
- const LLT XLenTy(STI.getXLenVT());
- LLT InterLitTy = BigTy;
- Register AlignedExtract = Src1;
- unsigned AlignedIdx = Idx - RemIdx;
- if (TypeSize::isKnownGT(BigTy.getSizeInBits(),
- getLMUL1Ty(BigTy).getSizeInBits())) {
- InterLitTy = getLMUL1Ty(BigTy);
- // Extract a subvector equal to the nearest full vector register type. This
- // should resolve to a G_EXTRACT on a subreg.
- AlignedExtract =
- MIB.buildExtractSubvector(InterLitTy, BigVec, AlignedIdx).getReg(0);
- }
-
- auto Insert = MIB.buildInsertSubvector(InterLitTy, MIB.buildUndef(InterLitTy),
- LitVec, 0);
-
- auto [Mask, _] = buildDefaultVLOps(BigTy, MIB, MRI);
- auto VL = MIB.buildVScale(XLenTy, LitTy.getElementCount().getKnownMinValue());
-
- // Use tail agnostic policy if we're inserting over InterLitTy's tail.
- ElementCount EndIndex =
- ElementCount::getScalable(RemIdx) + LitTy.getElementCount();
- uint64_t Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
- if (EndIndex == InterLitTy.getElementCount())
- Policy = RISCVII::TAIL_AGNOSTIC;
-
- // If we're inserting into the lowest elements, use a tail undisturbed
- // vmv.v.v.
- MachineInstrBuilder Inserted;
- if (RemIdx == 0) {
- Inserted = MIB.buildInstr(RISCV::G_VMV_V_V_VL, {InterLitTy},
- {AlignedExtract, Insert, VL});
- } else {
- auto SlideupAmt = MIB.buildVScale(XLenTy, RemIdx);
- // Construct the vector length corresponding to RemIdx + length(LitTy).
- VL = MIB.buildAdd(XLenTy, SlideupAmt, VL);
- Inserted =
- MIB.buildInstr(RISCV::G_VSLIDEUP_VL, {InterLitTy},
- {AlignedExtract, LitVec, SlideupAmt, Mask, VL, Policy});
- }
-
- // If required, insert this subvector back into the correct vector register.
- // This should resolve to an INSERT_SUBREG instruction.
- if (TypeSize::isKnownGT(BigTy.getSizeInBits(), InterLitTy.getSizeInBits()))
- Inserted = MIB.buildInsertSubvector(BigTy, BigVec, LitVec, AlignedIdx);
-
- // We might have bitcast from a mask type: cast back to the original type if
- // required.
- MIB.buildBitcast(Dst, Inserted);
-
- MI.eraseFromParent();
- return true;
-}
-
bool RISCVLegalizerInfo::legalizeCustom(
LegalizerHelper &Helper, MachineInstr &MI,
LostDebugLocObserver &LocObserver) const {
@@ -1137,8 +991,6 @@ bool RISCVLegalizerInfo::legalizeCustom(
return legalizeExt(MI, MIRBuilder);
case TargetOpcode::G_SPLAT_VECTOR:
return legalizeSplatVector(MI, MIRBuilder);
- case TargetOpcode::G_INSERT_SUBVECTOR:
- return legalizeInsertSubvector(MI, MIRBuilder);
case TargetOpcode::G_LOAD:
case TargetOpcode::G_STORE:
return legalizeLoadStore(MI, Helper, MIRBuilder);
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
index ccd8ac9fe4ec90..2fc28615e7630d 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
+++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h
@@ -46,7 +46,6 @@ class RISCVLegalizerInfo : public LegalizerInfo {
bool legalizeVScale(MachineInstr &MI, MachineIRBuilder &MIB) const;
bool legalizeExt(MachineInstr &MI, MachineIRBuilder &MIRBuilder) const;
bool legalizeSplatVector(MachineInstr &MI, MachineIRBuilder &MIB) const;
- bool legalizeInsertSubvector(MachineInstr &MI, MachineIRBuilder &MIB) const;
bool legalizeLoadStore(MachineInstr &MI, LegalizerHelper &Helper,
MachineIRBuilder &MIB) const;
};
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerLowering.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerLowering.cpp
index 66db15e3a2e28c..955cc74629a348 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerLowering.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerLowering.cpp
@@ -41,6 +41,184 @@ namespace {
#include "RISCVGenPostLegalizeGILowering.inc"
#undef GET_GICOMBINER_TYPES
+static LLT getLMUL1Ty(LLT VecTy) {
+ assert(VecTy.getElementType().getSizeInBits() <= 64 &&
+ "Unexpected vector LLT");
+ return LLT::scalable_vector(RISCV::RVVBitsPerBlock /
+ VecTy.getElementType().getSizeInBits(),
+ VecTy.getElementType());
+}
+
+/// Return the mask type suitable for masking the provided vector type. This
+/// is simply an i1 element type vector of the same (possibly scalable)
+/// length.
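+/// For example, a <vscale x 4 x s8> vector type maps to <vscale x 4 x s1>.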
+static LLT getMaskTypeFor(LLT VecTy) {
+ assert(VecTy.isVector());
+ ElementCount EC = VecTy.getElementCount();
+ return LLT::vector(EC, LLT::scalar(1));
+}
+
+/// Creates an all ones mask suitable for masking a vector of type VecTy with
+/// vector length VL.
+static MachineInstrBuilder buildAllOnesMask(LLT VecTy, const SrcOp &VL,
+ MachineIRBuilder &MIB,
+ MachineRegisterInfo &MRI) {
+ LLT MaskTy = getMaskTypeFor(VecTy);
+ return MIB.buildInstr(RISCV::G_VMSET_VL, {MaskTy}, {VL});
+}
+
+/// Gets the two common "VL" operands: an all-ones mask and the vector length.
+/// VecTy is a scalable vector type.
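+/// The returned VL operand is X0, which is interpreted as VLMAX.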
+static std::pair<MachineInstrBuilder, Register>
+buildDefaultVLOps(const DstOp &Dst, MachineIRBuilder &MIB,
+ MachineRegisterInfo &MRI) {
+ LLT VecTy = Dst.getLLTTy(MRI);
+ assert(VecTy.isScalableVector() && "Expecting scalable container type");
+ Register VL(RISCV::X0);
+ MachineInstrBuilder Mask = buildAllOnesMask(VecTy, VL, MIB, MRI);
+ return {Mask, VL};
+}
+
+/// Lowers G_INSERT_SUBVECTOR. We know we can lower it here since the legalizer
+/// marked it as legal.
+void lowerInsertSubvector(MachineInstr &MI, const RISCVSubtarget &STI) {
+ GInsertSubvector &IS = cast<GInsertSubvector>(MI);
+
+ MachineIRBuilder MIB(MI);
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+
+ Register Dst = IS.getReg(0);
+ Register Src1 = IS.getBigVec();
+ Register Src2 = IS.getSubVec();
+ uint64_t Idx = IS.getIndexImm();
+
+ LLT BigTy = MRI.getType(Src1);
+ LLT LitTy = MRI.getType(Src2);
+ Register BigVec = Src1;
+ Register LitVec = Src2;
+
+ // We don't have the ability to slide mask vectors up indexed by their i1
+ // elements; the smallest we can do is i8. Often we are able to bitcast to
+  // equivalent i8 vectors. Otherwise, we must zero-extend to equivalent i8
+ // vectors and truncate down after the insert.
+ if (LitTy.getElementType() == LLT::scalar(1) &&
+ (Idx != 0 ||
+ MRI.getVRegDef(BigVec)->getOpcode() != TargetOpcode::G_IMPLICIT_DEF)) {
+ auto BigTyMinElts = BigTy.getElementCount().getKnownMinValue();
+ auto LitTyMinElts = LitTy.getElementCount().getKnownMinValue();
+ if (BigTyMinElts >= 8 && LitTyMinElts >= 8) {
+ assert(Idx % 8 == 0 && "Invalid index");
+ assert(BigTyMinElts % 8 == 0 && LitTyMinElts % 8 == 0 &&
+ "Unexpected mask vector lowering");
+ Idx /= 8;
+ BigTy = LLT::vector(BigTy.getElementCount().divideCoefficientBy(8), 8);
+ LitTy = LLT::vector(LitTy.getElementCount().divideCoefficientBy(8), 8);
+ BigVec = MIB.buildBitcast(BigTy, BigVec).getReg(0);
+ LitVec = MIB.buildBitcast(LitTy, LitVec).getReg(0);
+ } else {
+ // We can't slide this mask vector up indexed by its i1 elements.
+ // This poses a problem when we wish to insert a scalable vector which
+ // can't be re-expressed as a larger type. Just choose the slow path and
+ // extend to a larger type, then truncate back down.
+ LLT ExtBigTy = BigTy.changeElementType(LLT::scalar(8));
+ LLT ExtLitTy = LitTy.changeElementType(LLT::scalar(8));
+ auto BigZExt = MIB.buildZExt(ExtBigTy, BigVec);
+ auto LitZExt = MIB.buildZExt(ExtLitTy, LitVec);
+ auto Insert = MIB.buildInsertSubvector(ExtBigTy, BigZExt, LitZExt, Idx);
+ auto SplatZero = MIB.buildSplatVector(
+ ExtBigTy, MIB.buildConstant(ExtBigTy.getElementType(), 0));
+ MIB.buildICmp(CmpInst::Predicate::ICMP_NE, Dst, Insert, SplatZero);
+ MI.eraseFromParent();
+ return;
+ }
+ }
+
+ const RISCVRegisterInfo *TRI = STI.getRegisterInfo();
+ MVT LitTyMVT = getMVTForLLT(LitTy);
+ unsigned SubRegIdx, RemIdx;
+ std::tie(SubRegIdx, RemIdx) =
+ RISCVTargetLowering::decomposeSubvectorInsertExtractToSubRegs(
+ getMVTForLLT(BigTy), LitTyMVT, Idx, TRI);
+
+ RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(getMVTForLLT(LitTy));
+ bool IsSubVecPartReg = !RISCVVType::decodeVLMUL(SubVecLMUL).second;
+
+ // If the Idx has been completely eliminated and this subvector's size is a
+ // vector register or a multiple thereof, or the surrounding elements are
+ // undef, then this is a subvector insert which naturally aligns to a vector
+ // register. These can easily be handled using subregister manipulation.
+ if (RemIdx == 0 && (!IsSubVecPartReg || MRI.getVRegDef(Src1)->getOpcode() ==
+ TargetOpcode::G_IMPLICIT_DEF))
+ return;
+
+ // If the subvector is smaller than a vector register, then the insertion
+ // must preserve the undisturbed elements of the register. We do this by
+ // lowering to an EXTRACT_SUBVECTOR grabbing the nearest LMUL=1 vector type
+ // (which resolves to a subregister copy), performing a VSLIDEUP to place the
+ // subvector within the vector register, and an INSERT_SUBVECTOR of that
+ // LMUL=1 type back into the larger vector (resolving to another subregister
+ // operation). See below for how our VSLIDEUP works. We go via a LMUL=1 type
+ // to avoid allocating a large register group to hold our subvector.
+
+ // VSLIDEUP works by leaving elements 0<i<OFFSET undisturbed, elements
+ // OFFSET<=i<VL set to the "subvector" and vl<=i<VLMAX set to the tail policy
+ // (in our case undisturbed). This means we can set up a subvector insertion
+ // where OFFSET is the insertion offset, and the VL is the OFFSET plus the
+ // size of the subvector.
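+  // Concretely, when RemIdx is non-zero, the code below slides the subvector
+  // up by vscale * RemIdx elements and uses a vector length of
+  // vscale * (RemIdx + LitTy's minimum element count).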
+ const LLT XLenTy(STI.getXLenVT());
+ LLT InterLitTy = BigTy;
+ Register AlignedExtract = Src1;
+ unsigned AlignedIdx = Idx - RemIdx;
+ if (TypeSize::isKnownGT(BigTy.getSizeInBits(),
+ getLMUL1Ty(BigTy).getSizeInBits())) {
+ InterLitTy = getLMUL1Ty(BigTy);
+ // Extract a subvector equal to the nearest full vector register type. This
+ // should resolve to a G_EXTRACT on a subreg.
+ AlignedExtract =
+ MIB.buildExtractSubvector(InterLitTy, BigVec, AlignedIdx).getReg(0);
+ }
+
+ auto Insert = MIB.buildInsertSubvector(InterLitTy, MIB.buildUndef(InterLitTy),
+ LitVec, 0);
+
+ auto [Mask, _] = buildDefaultVLOps(BigTy, MIB, MRI);
+ auto VL = MIB.buildVScale(XLenTy, LitTy.getElementCount().getKnownMinValue());
+
+ // Use tail agnostic policy if we're inserting over InterLitTy's tail.
+ ElementCount EndIndex =
+ ElementCount::getScalable(RemIdx) + LitTy.getElementCount();
+ uint64_t Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED;
+ if (EndIndex == InterLitTy.getElementCount())
+ Policy = RISCVII::TAIL_AGNOSTIC;
+
+ // If we're inserting into the lowest elements, use a tail undisturbed
+ // vmv.v.v.
+ MachineInstrBuilder Inserted;
+ if (RemIdx == 0) {
+ Inserted = MIB.buildInstr(RISCV::G_VMV_V_V_VL, {InterLitTy},
+ {AlignedExtract, Insert, VL});
+ } else {
+ auto SlideupAmt = MIB.buildVScale(XLenTy, RemIdx);
+ // Construct the vector length corresponding to RemIdx + length(LitTy).
+ VL = MIB.buildAdd(XLenTy, SlideupAmt, VL);
+ Inserted =
+ MIB.buildInstr(RISCV::G_VSLIDEUP_VL, {InterLitTy},
+ {AlignedExtract, LitVec, SlideupAmt, Mask, VL, Policy});
+ }
+
+ // If required, insert this subvector back into the correct vector register.
+ // This should resolve to an INSERT_SUBREG instruction.
+ if (TypeSize::isKnownGT(BigTy.getSizeInBits(), InterLitTy.getSizeInBits()))
+ Inserted = MIB.buildInsertSubvector(BigTy, BigVec, LitVec, AlignedIdx);
+
+ // We might have bitcast from a mask type: cast back to the original type if
+ // required.
+ MIB.buildBitcast(Dst, Inserted);
+
+ MI.eraseFromParent();
+ return;
+}
+
class RISCVPostLegalizerLoweringImpl : public Combiner {
protected:
// TODO: Make CombinerHelper methods const.
diff --git a/llvm/lib/Target/RISCV/RISCVCombine.td b/llvm/lib/Target/RISCV/RISCVCombine.td
index d48698ae6f2bfb..902f3a05bc0fcf 100644
--- a/llvm/lib/Target/RISCV/RISCVCombine.td
+++ b/llvm/lib/Target/RISCV/RISCVCombine.td
@@ -19,11 +19,17 @@ def RISCVO0PreLegalizerCombiner: GICombiner<
"RISCVO0PreLegalizerCombinerImpl", [optnone_combines]> {
}
+def lower_insert_subvector : GICombineRule<
+ (defs root:$root),
+ (match (G_INSERT_SUBVECTOR $dst, $src1, $src2, $idx):$root),
+ (apply [{ lowerInsertSubvector(*${root}, STI); }])
+>;
+
// Post-legalization combines which should happen at all optimization levels
// (e.g. ones that facilitate matching for the selector, such as matching
// pseudos).
def RISCVPostLegalizerLowering
- : GICombiner<"RISCVPostLegalizerLoweringImpl", []> {
+ : GICombiner<"RISCVPostLegalizerLoweringImpl", [lower_insert_subvector]> {
}
// Post-legalization combines which are primarily optimizations.
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir
index 598c2e2a142cb4..440486a81a0c44 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer/rvv/legalize-insert-subvector.mir
@@ -9,54 +9,12 @@ legalized: false
tracksRegLiveness: true
body: |
bb.0.entry:
- ; RV32-LABEL: name: insert_subvector_nxv2i1_nxv4i1
- ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
- ; RV32-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; RV32-NEXT: [[ANYEXT:%[0-9]+]]:_(s64) = G_ANYEXT [[C]](s32)
- ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT]](s64)
- ; RV32-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; RV32-NEXT: [[ANYEXT1:%[0-9]+]]:_(s64) = G_ANYEXT [[C1]](s32)
- ; RV32-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT1]](s64)
- ; RV32-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
- ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
- ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
- ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C2]](s64)
- ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
- ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C3]](s64)
- ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]]
- ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s8>) = G_VSLIDEUP_VL [[SELECT]], [[SELECT]], [[LSHR1]](s64), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s64), 0
- ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 4 x s8>)
- ; RV32-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; RV32-NEXT: [[ANYEXT2:%[0-9]+]]:_(s64) = G_ANYEXT [[C4]](s32)
- ; RV32-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[ANYEXT2]](s64)
- ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[BITCAST]](<vscale x 4 x s8>), [[SPLAT_VECTOR2]]
- ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
- ; RV32-NEXT: PseudoRET implicit $v8
- ;
- ; RV64-LABEL: name: insert_subvector_nxv2i1_nxv4i1
- ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
- ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s32)
- ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; RV64-NEXT: [[SPLAT_VECTOR1:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C1]](s32)
- ; RV64-NEXT: [[SELECT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SELECT [[DEF]](<vscale x 4 x s1>), [[SPLAT_VECTOR1]], [[SPLAT_VECTOR]]
- ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
- ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C2]](s32)
- ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C3]](s32)
- ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]]
- ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s8>) = G_VSLIDEUP_VL [[SELECT]], [[SELECT]], [[LSHR1]](s32), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s32), 0
- ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 4 x s8>)
- ; RV64-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; RV64-NEXT: [[SPLAT_VECTOR2:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C4]](s32)
- ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[BITCAST]](<vscale x 4 x s8>), [[SPLAT_VECTOR2]]
- ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
- ; RV64-NEXT: PseudoRET implicit $v8
+ ; CHECK-LABEL: name: insert_subvector_nxv2i1_nxv4i1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 2 x s1>), 2
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
%1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
%2:_(<vscale x 4 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 4 x s1>), %1, 2
@@ -69,37 +27,12 @@ legalized: false
tracksRegLiveness: true
body: |
bb.0.entry:
- ; RV32-LABEL: name: insert_subvector_nxv4i1_nxv8i1
- ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
- ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 1 x s8>) = G_BITCAST [[DEF]](<vscale x 8 x s1>)
- ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
- ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
- ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C]](s64)
- ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
- ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C1]](s64)
- ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]]
- ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 1 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[LSHR1]](s64), [[VMSET_VL]](<vscale x 1 x s1>), [[ADD]](s64), 0
- ; RV32-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 8 x s1>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 1 x s8>)
- ; RV32-NEXT: $v8 = COPY [[BITCAST1]](<vscale x 8 x s1>)
- ; RV32-NEXT: PseudoRET implicit $v8
- ;
- ; RV64-LABEL: name: insert_subvector_nxv4i1_nxv8i1
- ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
- ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 1 x s8>) = G_BITCAST [[DEF]](<vscale x 8 x s1>)
- ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
- ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
- ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C]](s32)
- ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
- ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C1]](s32)
- ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]]
- ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 1 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[LSHR1]](s32), [[VMSET_VL]](<vscale x 1 x s1>), [[ADD]](s32), 0
- ; RV64-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 8 x s1>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 1 x s8>)
- ; RV64-NEXT: $v8 = COPY [[BITCAST1]](<vscale x 8 x s1>)
- ; RV64-NEXT: PseudoRET implicit $v8
+ ; CHECK-LABEL: name: insert_subvector_nxv4i1_nxv8i1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 2 x s1>), 8
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
%1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
%2:_(<vscale x 8 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s1>), %1, 8
@@ -112,33 +45,12 @@ legalized: false
tracksRegLiveness: true
body: |
bb.0.entry:
- ; RV32-LABEL: name: insert_subvector_nxv32i1_nxv64i1
- ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
- ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[DEF]](<vscale x 64 x s1>)
- ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
- ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
- ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C]](s64)
- ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR]], [[READ_VLENB]]
- ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[LSHR]](s64), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s64), 0
- ; RV32-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 64 x s1>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 8 x s8>)
- ; RV32-NEXT: $v8 = COPY [[BITCAST1]](<vscale x 64 x s1>)
- ; RV32-NEXT: PseudoRET implicit $v8
- ;
- ; RV64-LABEL: name: insert_subvector_nxv32i1_nxv64i1
- ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
- ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[DEF]](<vscale x 64 x s1>)
- ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
- ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C]](s32)
- ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR]], [[READ_VLENB]]
- ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[LSHR]](s32), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s32), 0
- ; RV64-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 64 x s1>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 8 x s8>)
- ; RV64-NEXT: $v8 = COPY [[BITCAST1]](<vscale x 64 x s1>)
- ; RV64-NEXT: PseudoRET implicit $v8
+ ; CHECK-LABEL: name: insert_subvector_nxv32i1_nxv64i1
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 64 x s1>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 32 x s1>), 32
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 64 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
%1:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
%2:_(<vscale x 64 x s1>) = G_INSERT_SUBVECTOR %0(<vscale x 64 x s1>), %1, 32
@@ -283,35 +195,12 @@ legalized: false
tracksRegLiveness: true
body: |
bb.0.entry:
- ; RV32-LABEL: name: insert_subvector_nxv1i8_nxv2i8
- ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
- ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
- ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
- ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C]](s64)
- ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
- ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C1]](s64)
- ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]]
- ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 2 x s8>) = G_VSLIDEUP_VL [[DEF]], [[DEF]], [[LSHR1]](s64), [[VMSET_VL]](<vscale x 2 x s1>), [[ADD]](s64), 0
- ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 2 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 2 x s8>)
- ; RV32-NEXT: $v8 = COPY [[BITCAST]](<vscale x 2 x s8>)
- ; RV32-NEXT: PseudoRET implicit $v8
- ;
- ; RV64-LABEL: name: insert_subvector_nxv1i8_nxv2i8
- ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
- ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
- ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C]](s32)
- ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
- ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C1]](s32)
- ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]]
- ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 2 x s8>) = G_VSLIDEUP_VL [[DEF]], [[DEF]], [[LSHR1]](s32), [[VMSET_VL]](<vscale x 2 x s1>), [[ADD]](s32), 0
- ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 2 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 2 x s8>)
- ; RV64-NEXT: $v8 = COPY [[BITCAST]](<vscale x 2 x s8>)
- ; RV64-NEXT: PseudoRET implicit $v8
+ ; CHECK-LABEL: name: insert_subvector_nxv1i8_nxv2i8
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 1 x s8>), 1
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
%1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
%2:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR %0(<vscale x 2 x s8>), %1, 1
@@ -324,35 +213,12 @@ legalized: false
tracksRegLiveness: true
body: |
bb.0.entry:
- ; RV32-LABEL: name: insert_subvector_nxv2i16_nxv4i16
- ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
- ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
- ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
- ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB]], [[C]](s64)
- ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 3
- ; RV32-NEXT: [[LSHR1:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C1]](s64)
- ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR1]], [[LSHR]]
- ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s16>) = G_VSLIDEUP_VL [[DEF]], [[DEF]], [[LSHR1]](s64), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s64), 0
- ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s16>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 4 x s16>)
- ; RV32-NEXT: $v8 = COPY [[BITCAST]](<vscale x 4 x s16>)
- ; RV32-NEXT: PseudoRET implicit $v8
- ;
- ; RV64-LABEL: name: insert_subvector_nxv2i16_nxv4i16
- ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
- ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
- ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB]], [[C]](s32)
- ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 3
- ; RV64-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C1]](s32)
- ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR1]], [[LSHR]]
- ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s16>) = G_VSLIDEUP_VL [[DEF]], [[DEF]], [[LSHR1]](s32), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s32), 0
- ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s16>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 4 x s16>)
- ; RV64-NEXT: $v8 = COPY [[BITCAST]](<vscale x 4 x s16>)
- ; RV64-NEXT: PseudoRET implicit $v8
+ ; CHECK-LABEL: name: insert_subvector_nxv2i16_nxv4i16
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 1 x s16>), 1
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
%1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
%2:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR %0(<vscale x 4 x s16>), %1, 1
@@ -365,31 +231,12 @@ legalized: false
tracksRegLiveness: true
body: |
bb.0.entry:
- ; RV32-LABEL: name: insert_subvector_nxv4i32_nxv8i32
- ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
- ; RV32-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_EXTRACT_SUBVECTOR [[DEF]], 0
- ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
- ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[READ_VLENB1]], [[READ_VLENB]]
- ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 2 x s32>) = G_VSLIDEUP_VL [[EXTRACT_SUBVECTOR]], [[DEF]], [[READ_VLENB1]](s64), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s64), 0
- ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF]](<vscale x 8 x s32>), 0
- ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s32>) = G_BITCAST [[INSERT_SUBVECTOR]](<vscale x 8 x s32>)
- ; RV32-NEXT: $v8 = COPY [[BITCAST]](<vscale x 8 x s32>)
- ; RV32-NEXT: PseudoRET implicit $v8
- ;
- ; RV64-LABEL: name: insert_subvector_nxv4i32_nxv8i32
- ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
- ; RV64-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_(<vscale x 2 x s32>) = G_EXTRACT_SUBVECTOR [[DEF]], 0
- ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
- ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[READ_VLENB1]], [[READ_VLENB]]
- ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 2 x s32>) = G_VSLIDEUP_VL [[EXTRACT_SUBVECTOR]], [[DEF]], [[READ_VLENB1]](s32), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s32), 0
- ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF]](<vscale x 8 x s32>), 0
- ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s32>) = G_BITCAST [[INSERT_SUBVECTOR]](<vscale x 8 x s32>)
- ; RV64-NEXT: $v8 = COPY [[BITCAST]](<vscale x 8 x s32>)
- ; RV64-NEXT: PseudoRET implicit $v8
+ ; CHECK-LABEL: name: insert_subvector_nxv4i32_nxv8i32
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 4 x s32>), 8
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
%1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
%2:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s32>), %1, 8
@@ -402,35 +249,12 @@ legalized: false
tracksRegLiveness: true
body: |
bb.0.entry:
- ; RV32-LABEL: name: insert_subvector_nxv2i64_nxv8i64
- ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
- ; RV32-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_EXTRACT_SUBVECTOR [[DEF]], 0
- ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
- ; RV32-NEXT: [[READ_VLENB:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s64) = G_READ_VLENB
- ; RV32-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
- ; RV32-NEXT: [[LSHR:%[0-9]+]]:_(s64) = G_LSHR [[READ_VLENB1]], [[C]](s64)
- ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[LSHR]], [[READ_VLENB]]
- ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 1 x s64>) = G_VSLIDEUP_VL [[EXTRACT_SUBVECTOR]], [[DEF]], [[LSHR]](s64), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s64), 0
- ; RV32-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF]](<vscale x 8 x s64>), 0
- ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s64>) = G_BITCAST [[INSERT_SUBVECTOR]](<vscale x 8 x s64>)
- ; RV32-NEXT: $v8 = COPY [[BITCAST]](<vscale x 8 x s64>)
- ; RV32-NEXT: PseudoRET implicit $v8
- ;
- ; RV64-LABEL: name: insert_subvector_nxv2i64_nxv8i64
- ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
- ; RV64-NEXT: [[EXTRACT_SUBVECTOR:%[0-9]+]]:_(<vscale x 1 x s64>) = G_EXTRACT_SUBVECTOR [[DEF]], 0
- ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
- ; RV64-NEXT: [[READ_VLENB:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[READ_VLENB1:%[0-9]+]]:_(s32) = G_READ_VLENB
- ; RV64-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; RV64-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[READ_VLENB1]], [[C]](s32)
- ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LSHR]], [[READ_VLENB]]
- ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 1 x s64>) = G_VSLIDEUP_VL [[EXTRACT_SUBVECTOR]], [[DEF]], [[LSHR]](s32), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s32), 0
- ; RV64-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF]](<vscale x 8 x s64>), 0
- ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s64>) = G_BITCAST [[INSERT_SUBVECTOR]](<vscale x 8 x s64>)
- ; RV64-NEXT: $v8 = COPY [[BITCAST]](<vscale x 8 x s64>)
- ; RV64-NEXT: PseudoRET implicit $v8
+ ; CHECK-LABEL: name: insert_subvector_nxv2i64_nxv8i64
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 1 x s64>), 4
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
%0:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
%1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
%2:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR %0(<vscale x 8 x s64>), %1, 4
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/postlegalizer-lowering/rvv/insert-subvector.mir b/llvm/test/CodeGen/RISCV/GlobalISel/postlegalizer-lowering/rvv/insert-subvector.mir
new file mode 100644
index 00000000000000..080e3a2a615d6b
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/postlegalizer-lowering/rvv/insert-subvector.mir
@@ -0,0 +1,368 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=riscv64 -mattr=+v -run-pass=riscv-postlegalizer-lowering %s \
+# RUN: -o - | FileCheck %s -check-prefixes=CHECK,RV32
+# RUN: llc -mtriple=riscv32 -mattr=+v -run-pass=riscv-postlegalizer-lowering %s \
+# RUN: -o - | FileCheck %s -check-prefixes=CHECK,RV64
+
+# Special handling for i1-element vectors with non-zero index
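+# Roughly, the checks below show the shape of this lowering: the i1 operands
+# are widened to i8-element vectors (by a bitcast where possible, otherwise by
+# a zero-extend), the slide is performed with G_VSLIDEUP_VL, and the result is
+# converted back to i1 elements with a bitcast or an icmp ne against a zero
+# splat.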
+---
+name: insert_subvector_nxv2i1_nxv4i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: insert_subvector_nxv2i1_nxv4i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_ZEXT [[DEF]](<vscale x 4 x s1>)
+ ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; RV32-NEXT: [[VSCALE:%[0-9]+]]:_(s64) = G_VSCALE i64 4
+ ; RV32-NEXT: [[VSCALE1:%[0-9]+]]:_(s64) = G_VSCALE i64 2
+ ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VSCALE1]], [[VSCALE]]
+ ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s8>) = G_VSLIDEUP_VL [[ZEXT]], [[ZEXT]], [[VSCALE1]](s64), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s64), 0
+ ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 4 x s8>)
+ ; RV32-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
+ ; RV32-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s8)
+ ; RV32-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[BITCAST]](<vscale x 4 x s8>), [[SPLAT_VECTOR]]
+ ; RV32-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: insert_subvector_nxv2i1_nxv4i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[ZEXT:%[0-9]+]]:_(<vscale x 4 x s8>) = G_ZEXT [[DEF]](<vscale x 4 x s1>)
+ ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; RV64-NEXT: [[VSCALE:%[0-9]+]]:_(s32) = G_VSCALE i32 4
+ ; RV64-NEXT: [[VSCALE1:%[0-9]+]]:_(s32) = G_VSCALE i32 2
+ ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[VSCALE1]], [[VSCALE]]
+ ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s8>) = G_VSLIDEUP_VL [[ZEXT]], [[ZEXT]], [[VSCALE1]](s32), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s32), 0
+ ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 4 x s8>)
+ ; RV64-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 0
+ ; RV64-NEXT: [[SPLAT_VECTOR:%[0-9]+]]:_(<vscale x 4 x s8>) = G_SPLAT_VECTOR [[C]](s8)
+ ; RV64-NEXT: [[ICMP:%[0-9]+]]:_(<vscale x 4 x s1>) = G_ICMP intpred(ne), [[BITCAST]](<vscale x 4 x s8>), [[SPLAT_VECTOR]]
+ ; RV64-NEXT: $v8 = COPY [[ICMP]](<vscale x 4 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 4 x s1>) = G_INSERT_SUBVECTOR %0, %1(<vscale x 2 x s1>), 2
+ $v8 = COPY %2(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: insert_subvector_nxv4i1_nxv8i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: insert_subvector_nxv4i1_nxv8i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 1 x s8>) = G_BITCAST [[DEF]](<vscale x 8 x s1>)
+ ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; RV32-NEXT: [[VSCALE:%[0-9]+]]:_(s64) = G_VSCALE i64 1
+ ; RV32-NEXT: [[VSCALE1:%[0-9]+]]:_(s64) = G_VSCALE i64 1
+ ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VSCALE1]], [[VSCALE]]
+ ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 1 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[VSCALE1]](s64), [[VMSET_VL]](<vscale x 1 x s1>), [[ADD]](s64), 0
+ ; RV32-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 8 x s1>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 1 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[BITCAST1]](<vscale x 8 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: insert_subvector_nxv4i1_nxv8i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 1 x s8>) = G_BITCAST [[DEF]](<vscale x 8 x s1>)
+ ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 1 x s1>) = G_VMSET_VL $x0
+ ; RV64-NEXT: [[VSCALE:%[0-9]+]]:_(s32) = G_VSCALE i32 1
+ ; RV64-NEXT: [[VSCALE1:%[0-9]+]]:_(s32) = G_VSCALE i32 1
+ ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[VSCALE1]], [[VSCALE]]
+ ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 1 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[VSCALE1]](s32), [[VMSET_VL]](<vscale x 1 x s1>), [[ADD]](s32), 0
+ ; RV64-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 8 x s1>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 1 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[BITCAST1]](<vscale x 8 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 8 x s1>) = G_INSERT_SUBVECTOR %0, %1(<vscale x 2 x s1>), 8
+ $v8 = COPY %2(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: insert_subvector_nxv32i1_nxv64i1
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: insert_subvector_nxv32i1_nxv64i1
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[DEF]](<vscale x 64 x s1>)
+ ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; RV32-NEXT: [[VSCALE:%[0-9]+]]:_(s64) = G_VSCALE i64 8
+ ; RV32-NEXT: [[VSCALE1:%[0-9]+]]:_(s64) = G_VSCALE i64 4
+ ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VSCALE1]], [[VSCALE]]
+ ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[VSCALE1]](s64), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s64), 0
+ ; RV32-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 64 x s1>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 8 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[BITCAST1]](<vscale x 64 x s1>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: insert_subvector_nxv32i1_nxv64i1
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s8>) = G_BITCAST [[DEF]](<vscale x 64 x s1>)
+ ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 8 x s1>) = G_VMSET_VL $x0
+ ; RV64-NEXT: [[VSCALE:%[0-9]+]]:_(s32) = G_VSCALE i32 8
+ ; RV64-NEXT: [[VSCALE1:%[0-9]+]]:_(s32) = G_VSCALE i32 4
+ ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[VSCALE1]], [[VSCALE]]
+ ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 8 x s8>) = G_VSLIDEUP_VL [[DEF]], [[BITCAST]], [[VSCALE1]](s32), [[VMSET_VL]](<vscale x 8 x s1>), [[ADD]](s32), 0
+ ; RV64-NEXT: [[BITCAST1:%[0-9]+]]:_(<vscale x 64 x s1>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 8 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[BITCAST1]](<vscale x 64 x s1>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 32 x s1>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 64 x s1>) = G_INSERT_SUBVECTOR %0, %1(<vscale x 32 x s1>), 32
+ $v8 = COPY %2(<vscale x 64 x s1>)
+ PseudoRET implicit $v8
+
+...
+
+# i1-element vectors with zero index
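+# As the checks below show, zero-index inserts are left untouched by this pass;
+# the G_INSERT_SUBVECTOR survives unchanged.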
+---
+name: insert_subvector_nxv2i1_nxv4i1_zero
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv2i1_nxv4i1_zero
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s1>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 1 x s1>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 4 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s1>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 4 x s1>) = G_INSERT_SUBVECTOR %0, %1(<vscale x 1 x s1>), 0
+ $v8 = COPY %2(<vscale x 4 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: insert_subvector_nxv4i1_nxv8i1_zero
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv4i1_nxv8i1_zero
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s1>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 2 x s1>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 8 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s1>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 8 x s1>) = G_INSERT_SUBVECTOR %0, %1(<vscale x 2 x s1>), 0
+ $v8 = COPY %2(<vscale x 8 x s1>)
+ PseudoRET implicit $v8
+
+...
+---
+name: insert_subvector_nxv32i1_nxv64i1_zero
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv32i1_nxv64i1_zero
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 64 x s1>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 16 x s1>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 64 x s1>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 64 x s1>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 16 x s1>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 64 x s1>) = G_INSERT_SUBVECTOR %0, %1(<vscale x 16 x s1>), 0
+ $v8 = COPY %2(<vscale x 64 x s1>)
+ PseudoRET implicit $v8
+
+...
+
+# Insert with zero index
+---
+name: insert_subvector_nxv1i8_nxv2i8_zero
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv1i8_nxv2i8_zero
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 1 x s8>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 2 x s8>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR %0, %1(<vscale x 1 x s8>), 0
+ $v8 = COPY %2(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: insert_subvector_nxv2i16_nxv4i16_zero
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv2i16_nxv4i16_zero
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 1 x s16>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 4 x s16>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR %0, %1(<vscale x 1 x s16>), 0
+ $v8 = COPY %2(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: insert_subvector_nxv4i32_nxv8i32_zero
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv4i32_nxv8i32_zero
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 4 x s32>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR %0, %1(<vscale x 4 x s32>), 0
+ $v8 = COPY %2(<vscale x 8 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: insert_subvector_nxv2i64_nxv8i64_zero
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv2i64_nxv8i64_zero
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF1]](<vscale x 2 x s64>), 0
+ ; CHECK-NEXT: $v8 = COPY [[INSERT_SUBVECTOR]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 2 x s64>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR %0, %1(<vscale x 2 x s64>), 0
+ $v8 = COPY %2(<vscale x 8 x s64>)
+ PseudoRET implicit $v8
+
+...
+
+# Insert with non-zero index
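+# For non-zero indices the checks show a G_VSLIDEUP_VL whose offset and VL are
+# built from G_VSCALE constants under an all-ones G_VMSET_VL mask, followed by
+# a bitcast back to the original vector type.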
+---
+name: insert_subvector_nxv1i8_nxv2i8
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: insert_subvector_nxv1i8_nxv2i8
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; RV32-NEXT: [[VSCALE:%[0-9]+]]:_(s64) = G_VSCALE i64 2
+ ; RV32-NEXT: [[VSCALE1:%[0-9]+]]:_(s64) = G_VSCALE i64 1
+ ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VSCALE1]], [[VSCALE]]
+ ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 2 x s8>) = G_VSLIDEUP_VL [[DEF]], [[DEF]], [[VSCALE1]](s64), [[VMSET_VL]](<vscale x 2 x s1>), [[ADD]](s64), 0
+ ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 2 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 2 x s8>)
+ ; RV32-NEXT: $v8 = COPY [[BITCAST]](<vscale x 2 x s8>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: insert_subvector_nxv1i8_nxv2i8
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 2 x s1>) = G_VMSET_VL $x0
+ ; RV64-NEXT: [[VSCALE:%[0-9]+]]:_(s32) = G_VSCALE i32 2
+ ; RV64-NEXT: [[VSCALE1:%[0-9]+]]:_(s32) = G_VSCALE i32 1
+ ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[VSCALE1]], [[VSCALE]]
+ ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 2 x s8>) = G_VSLIDEUP_VL [[DEF]], [[DEF]], [[VSCALE1]](s32), [[VMSET_VL]](<vscale x 2 x s1>), [[ADD]](s32), 0
+ ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 2 x s8>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 2 x s8>)
+ ; RV64-NEXT: $v8 = COPY [[BITCAST]](<vscale x 2 x s8>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 2 x s8>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s8>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 2 x s8>) = G_INSERT_SUBVECTOR %0, %1(<vscale x 1 x s8>), 1
+ $v8 = COPY %2(<vscale x 2 x s8>)
+ PseudoRET implicit $v8
+
+...
+---
+name: insert_subvector_nxv2i16_nxv4i16
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; RV32-LABEL: name: insert_subvector_nxv2i16_nxv4i16
+ ; RV32: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; RV32-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; RV32-NEXT: [[VSCALE:%[0-9]+]]:_(s64) = G_VSCALE i64 4
+ ; RV32-NEXT: [[VSCALE1:%[0-9]+]]:_(s64) = G_VSCALE i64 1
+ ; RV32-NEXT: [[ADD:%[0-9]+]]:_(s64) = G_ADD [[VSCALE1]], [[VSCALE]]
+ ; RV32-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s16>) = G_VSLIDEUP_VL [[DEF]], [[DEF]], [[VSCALE1]](s64), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s64), 0
+ ; RV32-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s16>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 4 x s16>)
+ ; RV32-NEXT: $v8 = COPY [[BITCAST]](<vscale x 4 x s16>)
+ ; RV32-NEXT: PseudoRET implicit $v8
+ ;
+ ; RV64-LABEL: name: insert_subvector_nxv2i16_nxv4i16
+ ; RV64: [[DEF:%[0-9]+]]:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ ; RV64-NEXT: [[VMSET_VL:%[0-9]+]]:_(<vscale x 4 x s1>) = G_VMSET_VL $x0
+ ; RV64-NEXT: [[VSCALE:%[0-9]+]]:_(s32) = G_VSCALE i32 4
+ ; RV64-NEXT: [[VSCALE1:%[0-9]+]]:_(s32) = G_VSCALE i32 1
+ ; RV64-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[VSCALE1]], [[VSCALE]]
+ ; RV64-NEXT: [[VSLIDEUP_VL:%[0-9]+]]:_(<vscale x 4 x s16>) = G_VSLIDEUP_VL [[DEF]], [[DEF]], [[VSCALE1]](s32), [[VMSET_VL]](<vscale x 4 x s1>), [[ADD]](s32), 0
+ ; RV64-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 4 x s16>) = G_BITCAST [[VSLIDEUP_VL]](<vscale x 4 x s16>)
+ ; RV64-NEXT: $v8 = COPY [[BITCAST]](<vscale x 4 x s16>)
+ ; RV64-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 4 x s16>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s16>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 4 x s16>) = G_INSERT_SUBVECTOR %0, %1(<vscale x 1 x s16>), 1
+ $v8 = COPY %2(<vscale x 4 x s16>)
+ PseudoRET implicit $v8
+
+...
+---
+name: insert_subvector_nxv4i32_nxv8i32
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv4i32_nxv8i32
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF]](<vscale x 8 x s32>), 0
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s32>) = G_BITCAST [[INSERT_SUBVECTOR]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: $v8 = COPY [[BITCAST]](<vscale x 8 x s32>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s32>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 4 x s32>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 8 x s32>) = G_INSERT_SUBVECTOR %0, %1(<vscale x 4 x s32>), 8
+ $v8 = COPY %2(<vscale x 8 x s32>)
+ PseudoRET implicit $v8
+
+...
+---
+name: insert_subvector_nxv2i64_nxv8i64
+legalized: true
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: insert_subvector_nxv2i64_nxv8i64
+ ; CHECK: [[DEF:%[0-9]+]]:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[INSERT_SUBVECTOR:%[0-9]+]]:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR [[DEF]], [[DEF]](<vscale x 8 x s64>), 0
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<vscale x 8 x s64>) = G_BITCAST [[INSERT_SUBVECTOR]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: $v8 = COPY [[BITCAST]](<vscale x 8 x s64>)
+ ; CHECK-NEXT: PseudoRET implicit $v8
+ %0:_(<vscale x 8 x s64>) = G_IMPLICIT_DEF
+ %1:_(<vscale x 1 x s64>) = G_IMPLICIT_DEF
+ %2:_(<vscale x 8 x s64>) = G_INSERT_SUBVECTOR %0, %1(<vscale x 1 x s64>), 4
+ $v8 = COPY %2(<vscale x 8 x s64>)
+ PseudoRET implicit $v8
+
+...