[llvm] [LLVM][CodeGen] Remove failure cases when widening EXTRACT/INSERT_SUBVECTOR. (PR #162308)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 7 08:48:10 PDT 2025
https://github.com/paulwalker-arm created https://github.com/llvm/llvm-project/pull/162308
This PR implements catch-all handling for widening the scalable subvector operand of INSERT_SUBVECTOR and the scalable result of EXTRACT_SUBVECTOR. It does this via the stack, using masked memory operations. With general handling available, we can add optimizations for specific cases.
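Conceptually, the EXTRACT_SUBVECTOR fallback is equivalent to the following hand-written IR sketch (illustrative only, not taken from the patch; the function name, the fixed index of 1, and the alignment are invented):

define <vscale x 4 x i32> @extract_nxv1i32_sketch(<vscale x 4 x i32> %vec) {
  ; Spill the wide input vector to a stack slot.
  %slot = alloca <vscale x 4 x i32>
  store <vscale x 4 x i32> %vec, ptr %slot
  ; The nxv1i32 sub-vector at index 1 starts at element vscale * 1.
  %vscale = call i64 @llvm.vscale.i64()
  %addr = getelementptr i32, ptr %slot, i64 %vscale
  ; Mask only the vscale lanes belonging to the non-widened nxv1i32 result.
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %vscale)
  ; Masked-load the widened result, leaving the inactive lanes as poison.
  %res = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %addr, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> poison)
  ret <vscale x 4 x i32> %res
}

The INSERT_SUBVECTOR path is the mirror image: store the wide vector, masked-store the sub-vector at the required offset, then load the whole vector back.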
From 4b4fd331fc3ca88a21e3307099b282ed4c3f04d2 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Fri, 26 Sep 2025 12:00:46 +0100
Subject: [PATCH] [LLVM][CodeGen] Remove failure cases when widening
EXTRACT/INSERT_SUBVECTOR.
This PR implements catch-all handling for widening the scalable
subvector operand of INSERT_SUBVECTOR and the scalable result of
EXTRACT_SUBVECTOR. It does this via the stack, using masked memory
operations. With general handling available, we can add optimizations
for specific cases.
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 6 +
.../SelectionDAG/LegalizeVectorTypes.cpp | 90 ++++--
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 8 +
.../AArch64/sve-extract-scalable-vector.ll | 253 +++++++++++----
.../test/CodeGen/AArch64/sve-insert-vector.ll | 287 +++++++++++++++---
5 files changed, 518 insertions(+), 126 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index d9d6f0bcdcb84..f935442404cde 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1186,6 +1186,12 @@ class SelectionDAG {
LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
bool ConstantFold = true);
+ /// Return a vector with the first 'Len' lanes set to true and the remaining
+ /// lanes set to false. The mask's ValueType is the same as when comparing
+ /// vectors of type VT.
+ LLVM_ABI SDValue getMaskFromElementCount(const SDLoc &DL, EVT VT,
+ ElementCount Len);
+
/// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc.
SDValue getGLOBAL_OFFSET_TABLE(EVT VT) {
return getNode(ISD::GLOBAL_OFFSET_TABLE, SDLoc(), VT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 87d5453cd98cf..bcaac40de5459 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -6201,8 +6201,33 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Parts);
}
- report_fatal_error("Don't know how to widen the result of "
- "EXTRACT_SUBVECTOR for scalable vectors");
+ // Fall back to extracting through memory.
+
+ Align Alignment = DAG.getReducedAlign(InVT, /*UseABI=*/false);
+ SDValue StackPtr = DAG.CreateStackTemporary(InVT.getStoreSize(), Alignment);
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+ MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+ MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+
+ // Write out the input vector.
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, StoreMMO);
+
+ // Build a mask to match the length of the non-widened result.
+ SDValue Mask =
+ DAG.getMaskFromElementCount(dl, WidenVT, VT.getVectorElementCount());
+
+ // Read back the sub-vector, setting the remaining lanes to poison.
+ StackPtr = TLI.getVectorSubVecPointer(DAG, StackPtr, InVT, VT, Idx);
+ return DAG.getMaskedLoad(
+ WidenVT, dl, Ch, StackPtr, DAG.getUNDEF(StackPtr.getValueType()), Mask,
+ DAG.getPOISON(WidenVT), VT, LoadMMO, ISD::UNINDEXED, ISD::NON_EXTLOAD);
}
// We could try widening the input to the right length but for now, extract
@@ -6306,11 +6331,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
if (VT.isVector()) {
// If all else fails replace the load with a wide masked load.
SDLoc DL(N);
- EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-
- SDValue Len = DAG.getElementCount(DL, IdxVT, VT.getVectorElementCount());
- SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
- DAG.getConstant(0, DL, IdxVT), Len);
+ SDValue Mask =
+ DAG.getMaskFromElementCount(DL, WideVT, VT.getVectorElementCount());
SDValue NewLoad = DAG.getMaskedLoad(
WideVT, DL, LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask,
@@ -7447,9 +7469,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) {
SDValue InVec = N->getOperand(0);
EVT OrigVT = SubVec.getValueType();
- if (getTypeAction(SubVec.getValueType()) == TargetLowering::TypeWidenVector)
- SubVec = GetWidenedVector(SubVec);
-
+ SubVec = GetWidenedVector(SubVec);
EVT SubVT = SubVec.getValueType();
// Whether or not all the elements of the widened SubVec will be inserted into
@@ -7471,17 +7491,52 @@ SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) {
}
}
+ if (!IndicesValid)
+ report_fatal_error(
+ "Don't know how to widen the operands for INSERT_SUBVECTOR");
+
SDLoc DL(N);
// We need to make sure that the indices are still valid, otherwise we might
// widen what was previously well-defined to something undefined.
- if (IndicesValid && InVec.isUndef() && N->getConstantOperandVal(2) == 0)
+ if (InVec.isUndef() && N->getConstantOperandVal(2) == 0)
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, InVec, SubVec,
N->getOperand(2));
- if (!IndicesValid || OrigVT.isScalableVector())
- report_fatal_error(
- "Don't know how to widen the operands for INSERT_SUBVECTOR");
+ if (OrigVT.isScalableVector()) {
+ // Fall back to inserting through memory.
+
+ Align Alignment = DAG.getReducedAlign(VT, /*UseABI=*/false);
+ SDValue StackPtr = DAG.CreateStackTemporary(VT.getStoreSize(), Alignment);
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+ MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+ MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+
+ // Write out the vector being inserted into.
+ SDValue Ch =
+ DAG.getStore(DAG.getEntryNode(), DL, InVec, StackPtr, StoreMMO);
+
+ // Build a mask to match the length of the sub-vector.
+ SDValue Mask =
+ DAG.getMaskFromElementCount(DL, SubVT, OrigVT.getVectorElementCount());
+
+ // Overwrite the sub-vector at the required offset.
+ StackPtr =
+ TLI.getVectorSubVecPointer(DAG, StackPtr, VT, OrigVT, N->getOperand(2));
+ Ch = DAG.getMaskedStore(Ch, DL, SubVec, StackPtr,
+ DAG.getUNDEF(StackPtr.getValueType()), Mask, VT,
+ StoreMMO, ISD::UNINDEXED, ISD::NON_EXTLOAD);
+
+ // Read back the result.
+ return DAG.getLoad(VT, DL, Ch, StackPtr, LoadMMO);
+ }
// If the operands can't be widened legally, just replace the INSERT_SUBVECTOR
// with a series of INSERT_VECTOR_ELT
@@ -7560,12 +7615,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
if (StVT.isVector()) {
// If all else fails replace the store with a wide masked store.
SDLoc DL(N);
- EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-
SDValue WideStVal = GetWidenedVector(StVal);
- SDValue Len = DAG.getElementCount(DL, IdxVT, StVT.getVectorElementCount());
- SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
- DAG.getConstant(0, DL, IdxVT), Len);
+ SDValue Mask =
+ DAG.getMaskFromElementCount(DL, WideVT, StVT.getVectorElementCount());
return DAG.getMaskedStore(ST->getChain(), DL, WideStVal, ST->getBasePtr(),
ST->getOffset(), Mask, ST->getMemoryVT(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 95f53fe0bfdba..d976c0ce1b901 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2112,6 +2112,14 @@ SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
return getConstant(EC.getKnownMinValue(), DL, VT);
}
+SDValue SelectionDAG::getMaskFromElementCount(const SDLoc &DL, EVT DataVT,
+ ElementCount EC) {
+ EVT IdxVT = TLI->getVectorIdxTy(getDataLayout());
+ EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), *getContext(), DataVT);
+ return getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT,
+ getConstant(0, DL, IdxVT), getElementCount(DL, IdxVT, EC));
+}
+
SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) {
APInt One(ResVT.getScalarSizeInBits(), 1);
return getStepVector(DL, ResVT, One);
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
index 4aaa25e5e66c5..8d0c71502f1e3 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
@@ -3,12 +3,199 @@
; Extracting illegal subvectors
-define <vscale x 1 x i32> @extract_nxv1i32_nxv4i32(<vscale x 4 x i32> %vec) nounwind {
-; CHECK-LABEL: extract_nxv1i32_nxv4i32:
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_0(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_0:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
- %retval = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
- ret <vscale x 1 x i32> %retval
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_1(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 1)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_2(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 2)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_3(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 3)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 2 x float> @extract_nxv1f32_nxv2f32_0(<vscale x 2 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv2f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %vec, i64 0)
+ %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 2 x float> @extract_nxv1f32_nxv2f32_1(<vscale x 2 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv2f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: addpl x9, sp, #4
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %vec, i64 1)
+ %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_0(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_1(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 1)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_2(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 2)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_3(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 3)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
}
define <vscale x 1 x i16> @extract_nxv1i16_nxv6i16(<vscale x 6 x i16> %vec) nounwind {
@@ -19,9 +206,6 @@ define <vscale x 1 x i16> @extract_nxv1i16_nxv6i16(<vscale x 6 x i16> %vec) noun
ret <vscale x 1 x i16> %retval
}
-declare <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32>, i64)
-declare <vscale x 1 x i16> @llvm.vector.extract.nxv1i16.nxv6i16(<vscale x 6 x i16>, i64)
-
;
; Extract half i1 vector that needs promotion from legal type.
;
@@ -43,8 +227,6 @@ define <vscale x 8 x i1> @extract_nxv8i1_nxv16i1_8(<vscale x 16 x i1> %in) {
ret <vscale x 8 x i1> %res
}
-declare <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1>, i64)
-
;
; Extract i1 vector that needs widening from one that needs widening.
;
@@ -99,8 +281,6 @@ define <vscale x 14 x i1> @extract_nxv14i1_nxv28i1_14(<vscale x 28 x i1> %in) uw
ret <vscale x 14 x i1> %res
}
-declare <vscale x 14 x i1> @llvm.vector.extract.nxv14i1.nxv28i1(<vscale x 28 x i1>, i64)
-
;
; Extract half i1 vector that needs promotion from one that needs splitting.
;
@@ -140,8 +320,6 @@ define <vscale x 8 x i1> @extract_nxv8i1_nxv32i1_24(<vscale x 32 x i1> %in) {
ret <vscale x 8 x i1> %res
}
-declare <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv32i1(<vscale x 32 x i1>, i64)
-
;
; Extract 1/4th i1 vector that needs promotion from legal type.
;
@@ -185,8 +363,6 @@ define <vscale x 4 x i1> @extract_nxv4i1_nxv16i1_12(<vscale x 16 x i1> %in) {
ret <vscale x 4 x i1> %res
}
-declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1>, i64)
-
;
; Extract 1/8th i1 vector that needs promotion from legal type.
;
@@ -278,8 +454,6 @@ define <vscale x 2 x i1> @extract_nxv2i1_nxv16i1_14(<vscale x 16 x i1> %in) {
ret <vscale x 2 x i1> %res
}
-declare <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv16i1(<vscale x 16 x i1>, i64)
-
;
; Extract i1 vector that needs promotion from one that needs widening.
;
@@ -313,8 +487,6 @@ define <vscale x 4 x i1> @extract_nxv4i1_nxv12i1_8(<vscale x 12 x i1> %in) {
ret <vscale x 4 x i1> %res
}
-declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv12i1(<vscale x 12 x i1>, i64)
-
;
; Extract 1/8th i8 vector that needs promotion from legal type.
;
@@ -406,8 +578,6 @@ define <vscale x 2 x i8> @extract_nxv2i8_nxv16i8_14(<vscale x 16 x i8> %in) {
ret <vscale x 2 x i8> %res
}
-declare <vscale x 2 x i8> @llvm.vector.extract.nxv2i8.nxv16i8(<vscale x 16 x i8>, i64)
-
;
; Extract i8 vector that needs promotion from one that needs widening.
;
@@ -441,8 +611,6 @@ define <vscale x 4 x i8> @extract_nxv4i8_nxv12i8_8(<vscale x 12 x i8> %in) {
ret <vscale x 4 x i8> %res
}
-declare <vscale x 4 x i8> @llvm.vector.extract.nxv4i8.nxv12i8(<vscale x 12 x i8>, i64)
-
;
; Extract i8 vector that needs both widening + promotion from one that needs widening.
; (nxv6i8 -> nxv8i8 -> nxv8i16)
@@ -474,8 +642,6 @@ define <vscale x 6 x i8> @extract_nxv6i8_nxv12i8_6(<vscale x 12 x i8> %in) {
ret <vscale x 6 x i8> %res
}
-declare <vscale x 6 x i8> @llvm.vector.extract.nxv6i8.nxv12i8(<vscale x 12 x i8>, i64)
-
;
; Extract half i8 vector that needs promotion from one that needs splitting.
;
@@ -515,8 +681,6 @@ define <vscale x 8 x i8> @extract_nxv8i8_nxv32i8_24(<vscale x 32 x i8> %in) {
ret <vscale x 8 x i8> %res
}
-declare <vscale x 8 x i8> @llvm.vector.extract.nxv8i8.nxv32i8(<vscale x 32 x i8>, i64)
-
;
; Extract half i8 vector that needs promotion from legal type.
;
@@ -538,8 +702,6 @@ define <vscale x 8 x i8> @extract_nxv8i8_nxv16i8_8(<vscale x 16 x i8> %in) {
ret <vscale x 8 x i8> %res
}
-declare <vscale x 8 x i8> @llvm.vector.extract.nxv8i8.nxv16i8(<vscale x 16 x i8>, i64)
-
;
; Extract i8 vector that needs widening from one that needs widening.
;
@@ -625,8 +787,6 @@ define <vscale x 14 x i8> @extract_nxv14i8_nxv28i8_14(<vscale x 28 x i8> %in) {
ret <vscale x 14 x i8> %res
}
-declare <vscale x 14 x i8> @llvm.vector.extract.nxv14i8.nxv28i8(<vscale x 28 x i8>, i64)
-
;
; Extract 1/4th i8 vector that needs promotion from legal type.
;
@@ -670,8 +830,6 @@ define <vscale x 4 x i8> @extract_nxv4i8_nxv16i8_12(<vscale x 16 x i8> %in) {
ret <vscale x 4 x i8> %res
}
-declare <vscale x 4 x i8> @llvm.vector.extract.nxv4i8.nxv16i8(<vscale x 16 x i8>, i64)
-
;
; Extract f16 vector that needs promotion from one that needs widening.
;
@@ -705,8 +863,6 @@ define <vscale x 2 x half> @extract_nxv2f16_nxv6f16_4(<vscale x 6 x half> %in) {
ret <vscale x 2 x half> %res
}
-declare <vscale x 2 x half> @llvm.vector.extract.nxv2f16.nxv6f16(<vscale x 6 x half>, i64)
-
;
; Extract half f16 vector that needs promotion from legal type.
;
@@ -728,8 +884,6 @@ define <vscale x 4 x half> @extract_nxv4f16_nxv8f16_4(<vscale x 8 x half> %in) {
ret <vscale x 4 x half> %res
}
-declare <vscale x 4 x half> @llvm.vector.extract.nxv4f16.nxv8f16(<vscale x 8 x half>, i64)
-
;
; Extract f16 vector that needs widening from one that needs widening.
;
@@ -757,8 +911,6 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
ret <vscale x 6 x half> %res
}
-declare <vscale x 6 x half> @llvm.vector.extract.nxv6f16.nxv12f16(<vscale x 12 x half>, i64)
-
;
; Extract half f16 vector that needs promotion from one that needs splitting.
;
@@ -798,8 +950,6 @@ define <vscale x 4 x half> @extract_nxv4f16_nxv16f16_12(<vscale x 16 x half> %in
ret <vscale x 4 x half> %res
}
-declare <vscale x 4 x half> @llvm.vector.extract.nxv4f16.nxv16f16(<vscale x 16 x half>, i64)
-
;
; Extract 1/4th f16 vector that needs promotion from legal type.
;
@@ -843,8 +993,6 @@ define <vscale x 2 x half> @extract_nxv2f16_nxv8f16_6(<vscale x 8 x half> %in) {
ret <vscale x 2 x half> %res
}
-declare <vscale x 2 x half> @llvm.vector.extract.nxv2f16.nxv8f16(<vscale x 8 x half>, i64)
-
;
; Extract half bf16 vector that needs promotion from legal type.
;
@@ -866,8 +1014,6 @@ define <vscale x 4 x bfloat> @extract_nxv4bf16_nxv8bf16_4(<vscale x 8 x bfloat>
ret <vscale x 4 x bfloat> %res
}
-declare <vscale x 4 x bfloat> @llvm.vector.extract.nxv4bf16.nxv8bf16(<vscale x 8 x bfloat>, i64)
-
;
; Extract bf16 vector that needs widening from one that needs widening.
;
@@ -895,8 +1041,6 @@ define <vscale x 6 x bfloat> @extract_nxv6bf16_nxv12bf16_6(<vscale x 12 x bfloat
ret <vscale x 6 x bfloat> %res
}
-declare <vscale x 6 x bfloat> @llvm.vector.extract.nxv6bf16.nxv12bf16(<vscale x 12 x bfloat>, i64)
-
;
; Extract bf16 vector that needs promotion from one that needs widening.
;
@@ -930,8 +1074,6 @@ define <vscale x 2 x bfloat> @extract_nxv2bf16_nxv6bf16_4(<vscale x 6 x bfloat>
ret <vscale x 2 x bfloat> %res
}
-declare <vscale x 2 x bfloat> @llvm.vector.extract.nxv2bf16.nxv6bf16(<vscale x 6 x bfloat>, i64)
-
;
; Extract 1/4th bf16 vector that needs promotion from legal type.
;
@@ -975,8 +1117,6 @@ define <vscale x 2 x bfloat> @extract_nxv2bf16_nxv8bf16_6(<vscale x 8 x bfloat>
ret <vscale x 2 x bfloat> %res
}
-declare <vscale x 2 x bfloat> @llvm.vector.extract.nxv2bf16.nxv8bf16(<vscale x 8 x bfloat>, i64)
-
;
; Extract half bf16 vector that needs promotion from one that needs splitting.
;
@@ -1016,9 +1156,6 @@ define <vscale x 4 x bfloat> @extract_nxv4bf16_nxv16bf16_12(<vscale x 16 x bfloa
ret <vscale x 4 x bfloat> %res
}
-declare <vscale x 4 x bfloat> @llvm.vector.extract.nxv4bf16.nxv16bf16(<vscale x 16 x bfloat>, i64)
-
-
;
; Extract from a splat
;
@@ -1070,9 +1207,6 @@ define <vscale x 2 x i1> @extract_nxv2i1_nxv16i1_all_zero() {
ret <vscale x 2 x i1> %ext
}
-declare <vscale x 2 x float> @llvm.vector.extract.nxv2f32.nxv4f32(<vscale x 4 x float>, i64)
-declare <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32>, i64)
-
;
; Extract nxv1i1 type from: nxv2i1
;
@@ -1427,8 +1561,3 @@ define <vscale x 1 x i1> @extract_nxv1i1_nxv16i1_15(<vscale x 16 x i1> %in) {
%res = call <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv16i1(<vscale x 16 x i1> %in, i64 15)
ret <vscale x 1 x i1> %res
}
-
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv2i1(<vscale x 2 x i1>, i64)
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv4i1(<vscale x 4 x i1>, i64)
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv8i1(<vscale x 8 x i1>, i64)
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv16i1(<vscale x 16 x i1>, i64)
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
index 73c783d4735f8..26b4739ad4e61 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -1322,49 +1322,246 @@ define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_15(<vscale x 16 x i1> %vec, <vs
ret <vscale x 16 x i1> %res
}
-attributes #0 = { vscale_range(2,2) }
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_0(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_1(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 1)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_2(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 2)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_3(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 3)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 2 x float> @insert_nxv1f32_nxv2f32_0(<vscale x 2 x float> %vec, <vscale x 2 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv2f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: st1w { z1.d }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv2f32(<vscale x 2 x float> %subvec, i64 0)
+ %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> %vec, <vscale x 1 x float> %i, i64 0)
+ ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 2 x float> @insert_nxv1f32_nxv2f32_1(<vscale x 2 x float> %vec, <vscale x 2 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv2f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: addpl x9, sp, #4
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: st1w { z1.d }, p1, [x8]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv2f32(<vscale x 2 x float> %subvec, i64 0)
+ %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> %vec, <vscale x 1 x float> %i, i64 1)
+ ret <vscale x 2 x float> %retval
+}
-declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
-
-declare <vscale x 6 x i16> @llvm.vector.insert.nxv6i16.nxv1i16(<vscale x 6 x i16>, <vscale x 1 x i16>, i64)
-declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv2i16(<vscale x 8 x i16>, <vscale x 2 x i16>, i64)
-declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
-
-declare <vscale x 3 x i32> @llvm.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32>, <vscale x 2 x i32>, i64)
-declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32>, <vscale x 1 x i32>, i64)
-declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
-declare <vscale x 12 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32>, <vscale x 4 x i32>, i64)
-declare <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32>, <vscale x 2 x i32>, i64)
-declare <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32>, <vscale x 3 x i32>, i64)
-
-declare <vscale x 2 x bfloat> @llvm.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i64)
-declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv2bf16(<vscale x 4 x bfloat>, <vscale x 2 x bfloat>, i64)
-declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i64)
-declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat>, <4 x bfloat>, i64)
-declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
-declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv4bf16(<vscale x 8 x bfloat>, <vscale x 4 x bfloat>, i64)
-declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
-
-declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
-declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64>, <4 x i64>, i64)
-declare <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64>, <vscale x 8 x i64>, i64)
-declare <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)
-
-declare <vscale x 4 x half> @llvm.vector.insert.nxv4f16.nxv2f16(<vscale x 4 x half>, <vscale x 2 x half>, i64)
-declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv2f16(<vscale x 8 x half>, <vscale x 2 x half>, i64)
-declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv4f16(<vscale x 8 x half>, <vscale x 4 x half>, i64)
-
-declare <vscale x 3 x float> @llvm.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float>, <vscale x 2 x float>, i64)
-declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float>, <vscale x 1 x float>, i64)
-declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float>, <vscale x 2 x float>, i64)
-
-declare <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.v8i1(<vscale x 2 x i1>, <8 x i1>, i64)
-declare <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.v16i1(<vscale x 4 x i1>, <16 x i1>, i64)
-declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.v32i1(<vscale x 8 x i1>, <32 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.nxv1i1(<vscale x 2 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1>, <vscale x 4 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1>, <vscale x 8 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v64i1(<vscale x 16 x i1>, <64 x i1>, i64)
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_0(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1w { z1.d }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 0)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_1(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.d }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 1)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_2(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: st1w { z1.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 2)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_3(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.d }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 3)
+ ret <vscale x 4 x float> %retval
+}
+
+attributes #0 = { vscale_range(2,2) }