[llvm] [LLVM][CodeGen] Remove failure cases when widening EXTRACT/INSERT_SUBVECTOR. (PR #162308)

via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 7 08:48:44 PDT 2025


llvmbot wrote:



@llvm/pr-subscribers-backend-aarch64

Author: Paul Walker (paulwalker-arm)

Changes:

This PR implements catch-all handling for widening a scalable subvector operand (INSERT_SUBVECTOR) or result (EXTRACT_SUBVECTOR). It does this by going through the stack using masked memory operations. With general handling in place, we can then add optimizations for specific cases.
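For illustration, here is a minimal IR sketch of a case this enables (it mirrors the new tests below; the function name is hypothetical):

```llvm
; The result type <vscale x 1 x i32> must be widened to <vscale x 4 x i32>.
; Before this patch, legalization hit "Don't know how to widen the result of
; EXTRACT_SUBVECTOR for scalable vectors". After it, the input is stored to a
; stack temporary and the sub-vector is read back with a masked load whose
; mask covers only the original element count.
define <vscale x 4 x i32> @extract_example(<vscale x 4 x i32> %vec) {
  %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 1)
  %r = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
  ret <vscale x 4 x i32> %r
}
```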

---

Patch is 41.08 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/162308.diff


5 Files Affected:

- (modified) llvm/include/llvm/CodeGen/SelectionDAG.h (+6) 
- (modified) llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp (+71-19) 
- (modified) llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp (+8) 
- (modified) llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll (+191-62) 
- (modified) llvm/test/CodeGen/AArch64/sve-insert-vector.ll (+242-45) 


``````````diff
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index d9d6f0bcdcb84..f935442404cde 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1186,6 +1186,12 @@ class SelectionDAG {
   LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
                                    bool ConstantFold = true);
 
+  /// Return a vector with the first 'Len' lanes set to true and remaining lanes
+  /// set to false. The mask's ValueType is the same as when comparing vectors
+  /// of type VT.
+  LLVM_ABI SDValue getMaskFromElementCount(const SDLoc &DL, EVT VT,
+                                           ElementCount Len);
+
   /// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc.
   SDValue getGLOBAL_OFFSET_TABLE(EVT VT) {
     return getNode(ISD::GLOBAL_OFFSET_TABLE, SDLoc(), VT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 87d5453cd98cf..bcaac40de5459 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -6201,8 +6201,33 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
       return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Parts);
     }
 
-    report_fatal_error("Don't know how to widen the result of "
-                       "EXTRACT_SUBVECTOR for scalable vectors");
+    // Fallback to extracting through memory.
+
+    Align Alignment = DAG.getReducedAlign(InVT, /*UseABI=*/false);
+    SDValue StackPtr = DAG.CreateStackTemporary(InVT.getStoreSize(), Alignment);
+    auto &MF = DAG.getMachineFunction();
+    auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+    auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+    MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+        PtrInfo, MachineMemOperand::MOStore,
+        LocationSize::beforeOrAfterPointer(), Alignment);
+    MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+        PtrInfo, MachineMemOperand::MOLoad,
+        LocationSize::beforeOrAfterPointer(), Alignment);
+
+    // Write out the input vector.
+    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, StoreMMO);
+
+    // Build a mask to match the length of the non-widened result.
+    SDValue Mask =
+        DAG.getMaskFromElementCount(dl, WidenVT, VT.getVectorElementCount());
+
+    // Read back the sub-vector, setting the remaining lanes to poison.
+    StackPtr = TLI.getVectorSubVecPointer(DAG, StackPtr, InVT, VT, Idx);
+    return DAG.getMaskedLoad(
+        WidenVT, dl, Ch, StackPtr, DAG.getUNDEF(StackPtr.getValueType()), Mask,
+        DAG.getPOISON(WidenVT), VT, LoadMMO, ISD::UNINDEXED, ISD::NON_EXTLOAD);
   }
 
   // We could try widening the input to the right length but for now, extract
@@ -6306,11 +6331,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
   if (VT.isVector()) {
     // If all else fails replace the load with a wide masked load.
     SDLoc DL(N);
-    EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-
-    SDValue Len = DAG.getElementCount(DL, IdxVT, VT.getVectorElementCount());
-    SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
-                               DAG.getConstant(0, DL, IdxVT), Len);
+    SDValue Mask =
+        DAG.getMaskFromElementCount(DL, WideVT, VT.getVectorElementCount());
 
     SDValue NewLoad = DAG.getMaskedLoad(
         WideVT, DL, LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask,
@@ -7447,9 +7469,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) {
   SDValue InVec = N->getOperand(0);
 
   EVT OrigVT = SubVec.getValueType();
-  if (getTypeAction(SubVec.getValueType()) == TargetLowering::TypeWidenVector)
-    SubVec = GetWidenedVector(SubVec);
-
+  SubVec = GetWidenedVector(SubVec);
   EVT SubVT = SubVec.getValueType();
 
   // Whether or not all the elements of the widened SubVec will be inserted into
@@ -7471,17 +7491,52 @@ SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) {
     }
   }
 
+  if (!IndicesValid)
+    report_fatal_error(
+        "Don't know how to widen the operands for INSERT_SUBVECTOR");
+
   SDLoc DL(N);
 
   // We need to make sure that the indices are still valid, otherwise we might
   // widen what was previously well-defined to something undefined.
-  if (IndicesValid && InVec.isUndef() && N->getConstantOperandVal(2) == 0)
+  if (InVec.isUndef() && N->getConstantOperandVal(2) == 0)
     return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, InVec, SubVec,
                        N->getOperand(2));
 
-  if (!IndicesValid || OrigVT.isScalableVector())
-    report_fatal_error(
-        "Don't know how to widen the operands for INSERT_SUBVECTOR");
+  if (OrigVT.isScalableVector()) {
+    // Fallback to inserting through memory.
+
+    Align Alignment = DAG.getReducedAlign(VT, /*UseABI=*/false);
+    SDValue StackPtr = DAG.CreateStackTemporary(VT.getStoreSize(), Alignment);
+    auto &MF = DAG.getMachineFunction();
+    auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+    auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+    MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+        PtrInfo, MachineMemOperand::MOStore,
+        LocationSize::beforeOrAfterPointer(), Alignment);
+    MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+        PtrInfo, MachineMemOperand::MOLoad,
+        LocationSize::beforeOrAfterPointer(), Alignment);
+
+    // Write out the vector being inserted into.
+    SDValue Ch =
+        DAG.getStore(DAG.getEntryNode(), DL, InVec, StackPtr, StoreMMO);
+
+    // Build a mask to match the length of the sub-vector.
+    SDValue Mask =
+        DAG.getMaskFromElementCount(DL, SubVT, OrigVT.getVectorElementCount());
+
+    // Overwrite the sub-vector at the required offset.
+    StackPtr =
+        TLI.getVectorSubVecPointer(DAG, StackPtr, VT, OrigVT, N->getOperand(2));
+    Ch = DAG.getMaskedStore(Ch, DL, SubVec, StackPtr,
+                            DAG.getUNDEF(StackPtr.getValueType()), Mask, VT,
+                            StoreMMO, ISD::UNINDEXED, ISD::NON_EXTLOAD);
+
+    // Read back the result.
+    return DAG.getLoad(VT, DL, Ch, StackPtr, LoadMMO);
+  }
 
   // If the operands can't be widened legally, just replace the INSERT_SUBVECTOR
   // with a series of INSERT_VECTOR_ELT
@@ -7560,12 +7615,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
   if (StVT.isVector()) {
     // If all else fails replace the store with a wide masked store.
     SDLoc DL(N);
-    EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-
     SDValue WideStVal = GetWidenedVector(StVal);
-    SDValue Len = DAG.getElementCount(DL, IdxVT, StVT.getVectorElementCount());
-    SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
-                               DAG.getConstant(0, DL, IdxVT), Len);
+    SDValue Mask =
+        DAG.getMaskFromElementCount(DL, WideVT, StVT.getVectorElementCount());
 
     return DAG.getMaskedStore(ST->getChain(), DL, WideStVal, ST->getBasePtr(),
                               ST->getOffset(), Mask, ST->getMemoryVT(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 95f53fe0bfdba..d976c0ce1b901 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2112,6 +2112,14 @@ SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
   return getConstant(EC.getKnownMinValue(), DL, VT);
 }
 
+SDValue SelectionDAG::getMaskFromElementCount(const SDLoc &DL, EVT DataVT,
+                                              ElementCount EC) {
+  EVT IdxVT = TLI->getVectorIdxTy(getDataLayout());
+  EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), *getContext(), DataVT);
+  return getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT,
+                 getConstant(0, DL, IdxVT), getElementCount(DL, IdxVT, EC));
+}
+
 SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) {
   APInt One(ResVT.getScalarSizeInBits(), 1);
   return getStepVector(DL, ResVT, One);
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
index 4aaa25e5e66c5..8d0c71502f1e3 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
@@ -3,12 +3,199 @@
 
 ; Extracting illegal subvectors
 
-define <vscale x 1 x i32> @extract_nxv1i32_nxv4i32(<vscale x 4 x i32> %vec) nounwind {
-; CHECK-LABEL: extract_nxv1i32_nxv4i32:
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_0(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_0:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    ret
-  %retval = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
-  ret <vscale x 1 x i32> %retval
+  %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
+  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+  ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_1(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    lsr x8, x8, #4
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    add x8, x9, x8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 1)
+  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+  ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_2(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    lsr x8, x8, #4
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    cnth x8
+; CHECK-NEXT:    add x8, x9, x8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 2)
+  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+  ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_3(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    lsr x8, x8, #4
+; CHECK-NEXT:    whilelo p0.s, xzr, x8
+; CHECK-NEXT:    cntw x8, all, mul #3
+; CHECK-NEXT:    add x8, x9, x8
+; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 3)
+  %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+  ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 2 x float> @extract_nxv1f32_nxv2f32_0(<vscale x 2 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv2f32_0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %vec, i64 0)
+  %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> poison, <vscale x 1 x float> %e, i64 0)
+  ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 2 x float> @extract_nxv1f32_nxv2f32_1(<vscale x 2 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv2f32_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    addpl x9, sp, #4
+; CHECK-NEXT:    lsr x8, x8, #4
+; CHECK-NEXT:    st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT:    whilelo p1.d, xzr, x8
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    add x8, x9, x8
+; CHECK-NEXT:    ld1w { z0.d }, p1/z, [x8]
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %vec, i64 1)
+  %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> poison, <vscale x 1 x float> %e, i64 0)
+  ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_0(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ret
+  %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 0)
+  %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+  ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_1(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    lsr x8, x8, #4
+; CHECK-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:    add x8, x9, x8
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 1)
+  %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+  ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_2(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    lsr x8, x8, #4
+; CHECK-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 2)
+  %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+  ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_3(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_3:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    rdvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    str z0, [sp]
+; CHECK-NEXT:    lsr x8, x8, #4
+; CHECK-NEXT:    whilelo p0.d, xzr, x8
+; CHECK-NEXT:    cntw x8, all, mul #3
+; CHECK-NEXT:    add x8, x9, x8
+; CHECK-NEXT:    ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 3)
+  %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+  ret <vscale x 4 x float> %retval
 }
 
 define <vscale x 1 x i16> @extract_nxv1i16_nxv6i16(<vscale x 6 x i16> %vec) nounwind {
@@ -19,9 +206,6 @@ define <vscale x 1 x i16> @extract_nxv1i16_nxv6i16(<vscale x 6 x i16> %vec) noun
   ret <vscale x 1 x i16> %retval
 }
 
-declare <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32>, i64)
-declare <vscale x 1 x i16> @llvm.vector.extract.nxv1i16.nxv6i16(<vscale x 6 x i16>, i64)
-
 ;
 ; Extract half i1 vector that needs promotion from legal type.
 ;
@@ -43,8 +227,6 @@ define <vscale x 8 x i1> @extract_nxv8i1_nxv16i1_8(<vscale x 16 x i1> %in) {
   ret <vscale x 8 x i1> %res
 }
 
-declare <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1>, i64)
-
 ;
 ; Extract i1 vector that needs widening from one that needs widening.
 ;
@@ -99,8 +281,6 @@ define <vscale x 14 x i1> @extract_nxv14i1_nxv28i1_14(<vscale x 28 x i1> %in) uw
   ret <vscale x 14 x i1> %res
 }
 
-declare <vscale x 14 x i1> @llvm.vector.extract.nxv14i1.nxv28i1(<vscale x 28 x i1>, i64)
-
 ;
 ; Extract half i1 vector that needs promotion from one that needs splitting.
 ;
@@ -140,8 +320,6 @@ define <vscale x 8 x i1> @extract_nxv8i1_nxv32i1_24(<vscale x 32 x i1> %in) {
   ret <vscale x 8 x i1> %res
 }
 
-declare <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv32i1(<vscale x 32 x i1>, i64)
-
 ;
 ; Extract 1/4th i1 vector that needs promotion from legal type.
 ;
@@ -185,8 +363,6 @@ define <vscale x 4 x i1> @extract_nxv4i1_nxv16i1_12(<vscale x 16 x i1> %in) {
   ret <vscale x 4 x i1> %res
 }
 
-declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1>, i64)
-
 ;
 ; Extract 1/8th i1 vector that needs promotion from legal type.
 ;
@@ -278,8 +454,6 @@ define <vscale x 2 x i1> @extract_nxv2i1_nxv16i1_14(<vscale x 16 x i1> %in) {
   ret <vscale x 2 x i1> %res
 }
 
-declare <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv16i1(<vscale x 16 x i1>, i64)
-
 ;
 ; Extract i1 vector that needs promotion from one that needs widening.
 ;
@@ -313,8 +487,6 @@ define <vscale x 4 x i1> @extract_nxv4i1_nxv12i1_8(<vscale x 12 x i1> %in) {
   ret <vscale x 4 x i1> %res
 }
 
-declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv12i1(<vscale x 12 x i1>, i64)
-
 ;
 ; Extract 1/8th i8 vector that needs promotion from legal type.
 ;
@@ -406,8 +578,6 @@ define <vscale x 2 x i8> @extract_nxv2i8_nxv16i8_14(<vscale x 16 x i8> %in) {
   ret <vscale x 2 x i8> %res
 }
 
-declare <vscale x 2 x i8> @llvm.vector.extract.nxv2i8.nxv16i8(<vscale x 16 x i8>, i64)
-
 ;
 ; Extract i8 vector that...
[truncated]

``````````



https://github.com/llvm/llvm-project/pull/162308

