[llvm] [LLVM][CodeGen] Remove failure cases when widening EXTRACT/INSERT_SUBVECTOR. (PR #162308)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 26 04:07:24 PST 2025
https://github.com/paulwalker-arm updated https://github.com/llvm/llvm-project/pull/162308
>From 6c8f10038847d4e0974aa9232bc1af9a372d20c2 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Fri, 26 Sep 2025 12:00:46 +0100
Subject: [PATCH 1/4] [LLVM][CodeGen] Remove failure cases when widening
EXTRACT/INSERT_SUBVECTOR.
This PR implements catch-all handling for widening the scalable
subvector operand (INSERT_SUBVECTOR) or result (EXTRACT_SUBVECTOR). It
does this via the stack using masked memory operations. With general
handling available, we can add optimisations for specific cases.
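For reference, a condensed sketch of the EXTRACT_SUBVECTOR fallback added
below (the INSERT_SUBVECTOR path is the mirror image: store the widened
destination, masked-store the sub-vector at its offset, then reload the
whole vector). MachineMemOperand/PtrInfo setup is elided and the variable
names follow the LegalizeVectorTypes.cpp change in this patch:

    // Spill the legal input vector to a stack temporary.
    Align Alignment = DAG.getReducedAlign(InVT, /*UseABI=*/false);
    SDValue StackPtr = DAG.CreateStackTemporary(InVT.getStoreSize(), Alignment);
    SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, StoreMMO);

    // Build a predicate covering only the original (non-widened) element count.
    SDValue Mask =
        DAG.getMaskFromElementCount(dl, WidenVT, VT.getVectorElementCount());

    // Masked-load the sub-vector from its offset; inactive lanes become poison.
    StackPtr = TLI.getVectorSubVecPointer(DAG, StackPtr, InVT, VT, Idx);
    SDValue Res = DAG.getMaskedLoad(
        WidenVT, dl, Ch, StackPtr, DAG.getUNDEF(StackPtr.getValueType()), Mask,
        DAG.getPOISON(WidenVT), VT, LoadMMO, ISD::UNINDEXED, ISD::NON_EXTLOAD);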
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 6 +
.../SelectionDAG/LegalizeVectorTypes.cpp | 90 ++++--
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 8 +
.../AArch64/sve-extract-scalable-vector.ll | 253 +++++++++++----
.../test/CodeGen/AArch64/sve-insert-vector.ll | 287 +++++++++++++++---
5 files changed, 518 insertions(+), 126 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index b024e8a68bd6e..216161e5afbca 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1190,6 +1190,12 @@ class SelectionDAG {
LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
bool ConstantFold = true);
+ /// Return a vector with the first 'Len' lanes set to true and remaining lanes
+ /// set to false. The mask's ValueType is the same as when comparing vectors
+ /// of type VT.
+ LLVM_ABI SDValue getMaskFromElementCount(const SDLoc &DL, EVT VT,
+ ElementCount Len);
+
/// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc.
SDValue getGLOBAL_OFFSET_TABLE(EVT VT) {
return getNode(ISD::GLOBAL_OFFSET_TABLE, SDLoc(), VT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 71eeee78bd868..69f005c182ba0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -6218,8 +6218,33 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Parts);
}
- report_fatal_error("Don't know how to widen the result of "
- "EXTRACT_SUBVECTOR for scalable vectors");
+ // Fallback to extracting through memory.
+
+ Align Alignment = DAG.getReducedAlign(InVT, /*UseABI=*/false);
+ SDValue StackPtr = DAG.CreateStackTemporary(InVT.getStoreSize(), Alignment);
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+ MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+ MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+
+ // Write out the input vector.
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, StoreMMO);
+
+ // Build a mask to match the length of the non-widened result.
+ SDValue Mask =
+ DAG.getMaskFromElementCount(dl, WidenVT, VT.getVectorElementCount());
+
+ // Read back the sub-vector setting the remaining lanes to poison.
+ StackPtr = TLI.getVectorSubVecPointer(DAG, StackPtr, InVT, VT, Idx);
+ return DAG.getMaskedLoad(
+ WidenVT, dl, Ch, StackPtr, DAG.getUNDEF(StackPtr.getValueType()), Mask,
+ DAG.getPOISON(WidenVT), VT, LoadMMO, ISD::UNINDEXED, ISD::NON_EXTLOAD);
}
// We could try widening the input to the right length but for now, extract
@@ -6323,11 +6348,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
if (VT.isVector()) {
// If all else fails replace the load with a wide masked load.
SDLoc DL(N);
- EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-
- SDValue Len = DAG.getElementCount(DL, IdxVT, VT.getVectorElementCount());
- SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
- DAG.getConstant(0, DL, IdxVT), Len);
+ SDValue Mask =
+ DAG.getMaskFromElementCount(DL, WideVT, VT.getVectorElementCount());
SDValue NewLoad = DAG.getMaskedLoad(
WideVT, DL, LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask,
@@ -7464,9 +7486,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) {
SDValue InVec = N->getOperand(0);
EVT OrigVT = SubVec.getValueType();
- if (getTypeAction(SubVec.getValueType()) == TargetLowering::TypeWidenVector)
- SubVec = GetWidenedVector(SubVec);
-
+ SubVec = GetWidenedVector(SubVec);
EVT SubVT = SubVec.getValueType();
// Whether or not all the elements of the widened SubVec will be inserted into
@@ -7488,17 +7508,52 @@ SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) {
}
}
+ if (!IndicesValid)
+ report_fatal_error(
+ "Don't know how to widen the operands for INSERT_SUBVECTOR");
+
SDLoc DL(N);
// We need to make sure that the indices are still valid, otherwise we might
// widen what was previously well-defined to something undefined.
- if (IndicesValid && InVec.isUndef() && N->getConstantOperandVal(2) == 0)
+ if (InVec.isUndef() && N->getConstantOperandVal(2) == 0)
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, InVec, SubVec,
N->getOperand(2));
- if (!IndicesValid || OrigVT.isScalableVector())
- report_fatal_error(
- "Don't know how to widen the operands for INSERT_SUBVECTOR");
+ if (OrigVT.isScalableVector()) {
+ // Fallback to inserting through memory.
+
+ Align Alignment = DAG.getReducedAlign(VT, /*UseABI=*/false);
+ SDValue StackPtr = DAG.CreateStackTemporary(VT.getStoreSize(), Alignment);
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+ MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+ MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+
+ // Write out the vector being inserted into.
+ SDValue Ch =
+ DAG.getStore(DAG.getEntryNode(), DL, InVec, StackPtr, StoreMMO);
+
+ // Build a mask to match the length of the sub-vector.
+ SDValue Mask =
+ DAG.getMaskFromElementCount(DL, SubVT, OrigVT.getVectorElementCount());
+
+ // Overwrite the sub-vector at the required offset.
+ StackPtr =
+ TLI.getVectorSubVecPointer(DAG, StackPtr, VT, OrigVT, N->getOperand(2));
+ Ch = DAG.getMaskedStore(Ch, DL, SubVec, StackPtr,
+ DAG.getUNDEF(StackPtr.getValueType()), Mask, VT,
+ StoreMMO, ISD::UNINDEXED, ISD::NON_EXTLOAD);
+
+ // Read back the result.
+ return DAG.getLoad(VT, DL, Ch, StackPtr, LoadMMO);
+ }
// If the operands can't be widened legally, just replace the INSERT_SUBVECTOR
// with a series of INSERT_VECTOR_ELT
@@ -7577,12 +7632,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
if (StVT.isVector()) {
// If all else fails replace the store with a wide masked store.
SDLoc DL(N);
- EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-
SDValue WideStVal = GetWidenedVector(StVal);
- SDValue Len = DAG.getElementCount(DL, IdxVT, StVT.getVectorElementCount());
- SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
- DAG.getConstant(0, DL, IdxVT), Len);
+ SDValue Mask =
+ DAG.getMaskFromElementCount(DL, WideVT, StVT.getVectorElementCount());
return DAG.getMaskedStore(ST->getChain(), DL, WideStVal, ST->getBasePtr(),
ST->getOffset(), Mask, ST->getMemoryVT(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index c2b4c19846316..1692d07607c97 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2111,6 +2111,14 @@ SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
return getConstant(EC.getKnownMinValue(), DL, VT);
}
+SDValue SelectionDAG::getMaskFromElementCount(const SDLoc &DL, EVT DataVT,
+ ElementCount EC) {
+ EVT IdxVT = TLI->getVectorIdxTy(getDataLayout());
+ EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), *getContext(), DataVT);
+ return getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT,
+ getConstant(0, DL, IdxVT), getElementCount(DL, IdxVT, EC));
+}
+
SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) {
APInt One(ResVT.getScalarSizeInBits(), 1);
return getStepVector(DL, ResVT, One);
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
index f6251ff66299e..fbd7c6877968c 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
@@ -3,12 +3,199 @@
; Extracting illegal subvectors
-define <vscale x 1 x i32> @extract_nxv1i32_nxv4i32(<vscale x 4 x i32> %vec) nounwind {
-; CHECK-LABEL: extract_nxv1i32_nxv4i32:
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_0(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_0:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
- %retval = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
- ret <vscale x 1 x i32> %retval
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_1(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 1)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_2(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 2)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_3(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 3)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 2 x float> @extract_nxv1f32_nxv2f32_0(<vscale x 2 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv2f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %vec, i64 0)
+ %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 2 x float> @extract_nxv1f32_nxv2f32_1(<vscale x 2 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv2f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: addpl x9, sp, #4
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %vec, i64 1)
+ %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_0(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_1(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 1)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_2(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 2)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_3(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 3)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
}
define <vscale x 1 x i16> @extract_nxv1i16_nxv6i16(<vscale x 6 x i16> %vec) nounwind {
@@ -19,9 +206,6 @@ define <vscale x 1 x i16> @extract_nxv1i16_nxv6i16(<vscale x 6 x i16> %vec) noun
ret <vscale x 1 x i16> %retval
}
-declare <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32>, i64)
-declare <vscale x 1 x i16> @llvm.vector.extract.nxv1i16.nxv6i16(<vscale x 6 x i16>, i64)
-
;
; Extract half i1 vector that needs promotion from legal type.
;
@@ -43,8 +227,6 @@ define <vscale x 8 x i1> @extract_nxv8i1_nxv16i1_8(<vscale x 16 x i1> %in) {
ret <vscale x 8 x i1> %res
}
-declare <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1>, i64)
-
;
; Extract i1 vector that needs widening from one that needs widening.
;
@@ -99,8 +281,6 @@ define <vscale x 14 x i1> @extract_nxv14i1_nxv28i1_14(<vscale x 28 x i1> %in) uw
ret <vscale x 14 x i1> %res
}
-declare <vscale x 14 x i1> @llvm.vector.extract.nxv14i1.nxv28i1(<vscale x 28 x i1>, i64)
-
;
; Extract half i1 vector that needs promotion from one that needs splitting.
;
@@ -140,8 +320,6 @@ define <vscale x 8 x i1> @extract_nxv8i1_nxv32i1_24(<vscale x 32 x i1> %in) {
ret <vscale x 8 x i1> %res
}
-declare <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv32i1(<vscale x 32 x i1>, i64)
-
;
; Extract 1/4th i1 vector that needs promotion from legal type.
;
@@ -185,8 +363,6 @@ define <vscale x 4 x i1> @extract_nxv4i1_nxv16i1_12(<vscale x 16 x i1> %in) {
ret <vscale x 4 x i1> %res
}
-declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1>, i64)
-
;
; Extract 1/8th i1 vector that needs promotion from legal type.
;
@@ -278,8 +454,6 @@ define <vscale x 2 x i1> @extract_nxv2i1_nxv16i1_14(<vscale x 16 x i1> %in) {
ret <vscale x 2 x i1> %res
}
-declare <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv16i1(<vscale x 16 x i1>, i64)
-
;
; Extract i1 vector that needs promotion from one that needs widening.
;
@@ -313,8 +487,6 @@ define <vscale x 4 x i1> @extract_nxv4i1_nxv12i1_8(<vscale x 12 x i1> %in) {
ret <vscale x 4 x i1> %res
}
-declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv12i1(<vscale x 12 x i1>, i64)
-
;
; Extract 1/8th i8 vector that needs promotion from legal type.
;
@@ -406,8 +578,6 @@ define <vscale x 2 x i8> @extract_nxv2i8_nxv16i8_14(<vscale x 16 x i8> %in) {
ret <vscale x 2 x i8> %res
}
-declare <vscale x 2 x i8> @llvm.vector.extract.nxv2i8.nxv16i8(<vscale x 16 x i8>, i64)
-
;
; Extract i8 vector that needs promotion from one that needs widening.
;
@@ -441,8 +611,6 @@ define <vscale x 4 x i8> @extract_nxv4i8_nxv12i8_8(<vscale x 12 x i8> %in) {
ret <vscale x 4 x i8> %res
}
-declare <vscale x 4 x i8> @llvm.vector.extract.nxv4i8.nxv12i8(<vscale x 12 x i8>, i64)
-
;
; Extract i8 vector that needs both widening + promotion from one that needs widening.
; (nxv6i8 -> nxv8i8 -> nxv8i16)
@@ -474,8 +642,6 @@ define <vscale x 6 x i8> @extract_nxv6i8_nxv12i8_6(<vscale x 12 x i8> %in) {
ret <vscale x 6 x i8> %res
}
-declare <vscale x 6 x i8> @llvm.vector.extract.nxv6i8.nxv12i8(<vscale x 12 x i8>, i64)
-
;
; Extract half i8 vector that needs promotion from one that needs splitting.
;
@@ -515,8 +681,6 @@ define <vscale x 8 x i8> @extract_nxv8i8_nxv32i8_24(<vscale x 32 x i8> %in) {
ret <vscale x 8 x i8> %res
}
-declare <vscale x 8 x i8> @llvm.vector.extract.nxv8i8.nxv32i8(<vscale x 32 x i8>, i64)
-
;
; Extract half i8 vector that needs promotion from legal type.
;
@@ -538,8 +702,6 @@ define <vscale x 8 x i8> @extract_nxv8i8_nxv16i8_8(<vscale x 16 x i8> %in) {
ret <vscale x 8 x i8> %res
}
-declare <vscale x 8 x i8> @llvm.vector.extract.nxv8i8.nxv16i8(<vscale x 16 x i8>, i64)
-
;
; Extract i8 vector that needs widening from one that needs widening.
;
@@ -625,8 +787,6 @@ define <vscale x 14 x i8> @extract_nxv14i8_nxv28i8_14(<vscale x 28 x i8> %in) {
ret <vscale x 14 x i8> %res
}
-declare <vscale x 14 x i8> @llvm.vector.extract.nxv14i8.nxv28i8(<vscale x 28 x i8>, i64)
-
;
; Extract 1/4th i8 vector that needs promotion from legal type.
;
@@ -670,8 +830,6 @@ define <vscale x 4 x i8> @extract_nxv4i8_nxv16i8_12(<vscale x 16 x i8> %in) {
ret <vscale x 4 x i8> %res
}
-declare <vscale x 4 x i8> @llvm.vector.extract.nxv4i8.nxv16i8(<vscale x 16 x i8>, i64)
-
;
; Extract f16 vector that needs promotion from one that needs widening.
;
@@ -705,8 +863,6 @@ define <vscale x 2 x half> @extract_nxv2f16_nxv6f16_4(<vscale x 6 x half> %in) {
ret <vscale x 2 x half> %res
}
-declare <vscale x 2 x half> @llvm.vector.extract.nxv2f16.nxv6f16(<vscale x 6 x half>, i64)
-
;
; Extract half f16 vector that needs promotion from legal type.
;
@@ -728,8 +884,6 @@ define <vscale x 4 x half> @extract_nxv4f16_nxv8f16_4(<vscale x 8 x half> %in) {
ret <vscale x 4 x half> %res
}
-declare <vscale x 4 x half> @llvm.vector.extract.nxv4f16.nxv8f16(<vscale x 8 x half>, i64)
-
;
; Extract f16 vector that needs widening from one that needs widening.
;
@@ -757,8 +911,6 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
ret <vscale x 6 x half> %res
}
-declare <vscale x 6 x half> @llvm.vector.extract.nxv6f16.nxv12f16(<vscale x 12 x half>, i64)
-
;
; Extract half f16 vector that needs promotion from one that needs splitting.
;
@@ -798,8 +950,6 @@ define <vscale x 4 x half> @extract_nxv4f16_nxv16f16_12(<vscale x 16 x half> %in
ret <vscale x 4 x half> %res
}
-declare <vscale x 4 x half> @llvm.vector.extract.nxv4f16.nxv16f16(<vscale x 16 x half>, i64)
-
;
; Extract 1/4th f16 vector that needs promotion from legal type.
;
@@ -843,8 +993,6 @@ define <vscale x 2 x half> @extract_nxv2f16_nxv8f16_6(<vscale x 8 x half> %in) {
ret <vscale x 2 x half> %res
}
-declare <vscale x 2 x half> @llvm.vector.extract.nxv2f16.nxv8f16(<vscale x 8 x half>, i64)
-
;
; Extract half bf16 vector that needs promotion from legal type.
;
@@ -866,8 +1014,6 @@ define <vscale x 4 x bfloat> @extract_nxv4bf16_nxv8bf16_4(<vscale x 8 x bfloat>
ret <vscale x 4 x bfloat> %res
}
-declare <vscale x 4 x bfloat> @llvm.vector.extract.nxv4bf16.nxv8bf16(<vscale x 8 x bfloat>, i64)
-
;
; Extract bf16 vector that needs widening from one that needs widening.
;
@@ -895,8 +1041,6 @@ define <vscale x 6 x bfloat> @extract_nxv6bf16_nxv12bf16_6(<vscale x 12 x bfloat
ret <vscale x 6 x bfloat> %res
}
-declare <vscale x 6 x bfloat> @llvm.vector.extract.nxv6bf16.nxv12bf16(<vscale x 12 x bfloat>, i64)
-
;
; Extract bf16 vector that needs promotion from one that needs widening.
;
@@ -930,8 +1074,6 @@ define <vscale x 2 x bfloat> @extract_nxv2bf16_nxv6bf16_4(<vscale x 6 x bfloat>
ret <vscale x 2 x bfloat> %res
}
-declare <vscale x 2 x bfloat> @llvm.vector.extract.nxv2bf16.nxv6bf16(<vscale x 6 x bfloat>, i64)
-
;
; Extract 1/4th bf16 vector that needs promotion from legal type.
;
@@ -975,8 +1117,6 @@ define <vscale x 2 x bfloat> @extract_nxv2bf16_nxv8bf16_6(<vscale x 8 x bfloat>
ret <vscale x 2 x bfloat> %res
}
-declare <vscale x 2 x bfloat> @llvm.vector.extract.nxv2bf16.nxv8bf16(<vscale x 8 x bfloat>, i64)
-
;
; Extract half bf16 vector that needs promotion from one that needs splitting.
;
@@ -1016,9 +1156,6 @@ define <vscale x 4 x bfloat> @extract_nxv4bf16_nxv16bf16_12(<vscale x 16 x bfloa
ret <vscale x 4 x bfloat> %res
}
-declare <vscale x 4 x bfloat> @llvm.vector.extract.nxv4bf16.nxv16bf16(<vscale x 16 x bfloat>, i64)
-
-
;
; Extract from a splat
;
@@ -1070,9 +1207,6 @@ define <vscale x 2 x i1> @extract_nxv2i1_nxv16i1_all_zero() {
ret <vscale x 2 x i1> %ext
}
-declare <vscale x 2 x float> @llvm.vector.extract.nxv2f32.nxv4f32(<vscale x 4 x float>, i64)
-declare <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32>, i64)
-
;
; Extract nxv1i1 type from: nxv2i1
;
@@ -1427,8 +1561,3 @@ define <vscale x 1 x i1> @extract_nxv1i1_nxv16i1_15(<vscale x 16 x i1> %in) {
%res = call <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv16i1(<vscale x 16 x i1> %in, i64 15)
ret <vscale x 1 x i1> %res
}
-
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv2i1(<vscale x 2 x i1>, i64)
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv4i1(<vscale x 4 x i1>, i64)
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv8i1(<vscale x 8 x i1>, i64)
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv16i1(<vscale x 16 x i1>, i64)
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
index 00a08e505b943..4b3c322ad7e4e 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -1322,49 +1322,246 @@ define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_15(<vscale x 16 x i1> %vec, <vs
ret <vscale x 16 x i1> %res
}
-attributes #0 = { vscale_range(2,2) }
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_0(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_1(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 1)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_2(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 2)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_3(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 3)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 2 x float> @insert_nxv1f32_nxv2f32_0(<vscale x 2 x float> %vec, <vscale x 2 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv2f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: st1w { z1.d }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv2f32(<vscale x 2 x float> %subvec, i64 0)
+ %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> %vec, <vscale x 1 x float> %i, i64 0)
+ ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 2 x float> @insert_nxv1f32_nxv2f32_1(<vscale x 2 x float> %vec, <vscale x 2 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv2f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: addpl x9, sp, #4
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: st1w { z1.d }, p1, [x8]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv2f32(<vscale x 2 x float> %subvec, i64 0)
+ %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> %vec, <vscale x 1 x float> %i, i64 1)
+ ret <vscale x 2 x float> %retval
+}
-declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
-
-declare <vscale x 6 x i16> @llvm.vector.insert.nxv6i16.nxv1i16(<vscale x 6 x i16>, <vscale x 1 x i16>, i64)
-declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv2i16(<vscale x 8 x i16>, <vscale x 2 x i16>, i64)
-declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
-
-declare <vscale x 3 x i32> @llvm.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32>, <vscale x 2 x i32>, i64)
-declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32>, <vscale x 1 x i32>, i64)
-declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
-declare <vscale x 12 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32>, <vscale x 4 x i32>, i64)
-declare <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32>, <vscale x 2 x i32>, i64)
-declare <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32>, <vscale x 3 x i32>, i64)
-
-declare <vscale x 2 x bfloat> @llvm.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i64)
-declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv2bf16(<vscale x 4 x bfloat>, <vscale x 2 x bfloat>, i64)
-declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i64)
-declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat>, <4 x bfloat>, i64)
-declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
-declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv4bf16(<vscale x 8 x bfloat>, <vscale x 4 x bfloat>, i64)
-declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
-
-declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
-declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64>, <4 x i64>, i64)
-declare <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64>, <vscale x 8 x i64>, i64)
-declare <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)
-
-declare <vscale x 4 x half> @llvm.vector.insert.nxv4f16.nxv2f16(<vscale x 4 x half>, <vscale x 2 x half>, i64)
-declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv2f16(<vscale x 8 x half>, <vscale x 2 x half>, i64)
-declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv4f16(<vscale x 8 x half>, <vscale x 4 x half>, i64)
-
-declare <vscale x 3 x float> @llvm.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float>, <vscale x 2 x float>, i64)
-declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float>, <vscale x 1 x float>, i64)
-declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float>, <vscale x 2 x float>, i64)
-
-declare <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.v8i1(<vscale x 2 x i1>, <8 x i1>, i64)
-declare <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.v16i1(<vscale x 4 x i1>, <16 x i1>, i64)
-declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.v32i1(<vscale x 8 x i1>, <32 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.nxv1i1(<vscale x 2 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1>, <vscale x 4 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1>, <vscale x 8 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v64i1(<vscale x 16 x i1>, <64 x i1>, i64)
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_0(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1w { z1.d }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 0)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_1(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.d }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 1)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_2(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: st1w { z1.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 2)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_3(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.d }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 3)
+ ret <vscale x 4 x float> %retval
+}
+
+attributes #0 = { vscale_range(2,2) }
>From 2d9f32a961cbf2c065320b4b487a015417fd3f62 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Wed, 26 Nov 2025 11:19:16 +0000
Subject: [PATCH 2/4] Reduce use of auto and getMachineFunction calls.
---
.../CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 69f005c182ba0..645ee0e187c17 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -6222,14 +6222,14 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
Align Alignment = DAG.getReducedAlign(InVT, /*UseABI=*/false);
SDValue StackPtr = DAG.CreateStackTemporary(InVT.getStoreSize(), Alignment);
- auto &MF = DAG.getMachineFunction();
- auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachineFunction &MF = DAG.getMachineFunction();
+ int FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
- MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
PtrInfo, MachineMemOperand::MOStore,
LocationSize::beforeOrAfterPointer(), Alignment);
- MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
PtrInfo, MachineMemOperand::MOLoad,
LocationSize::beforeOrAfterPointer(), Alignment);
@@ -7525,14 +7525,14 @@ SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) {
Align Alignment = DAG.getReducedAlign(VT, /*UseABI=*/false);
SDValue StackPtr = DAG.CreateStackTemporary(VT.getStoreSize(), Alignment);
- auto &MF = DAG.getMachineFunction();
- auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachineFunction &MF = DAG.getMachineFunction();
+ int FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
- MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
PtrInfo, MachineMemOperand::MOStore,
LocationSize::beforeOrAfterPointer(), Alignment);
- MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
PtrInfo, MachineMemOperand::MOLoad,
LocationSize::beforeOrAfterPointer(), Alignment);
>From 9a12fd486ab0a9ca8ae26051c870b1fbd072fd38 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Wed, 26 Nov 2025 11:51:05 +0000
Subject: [PATCH 3/4] Fix address calculation when loading the result of an
expanded vector.insert.
---
.../SelectionDAG/LegalizeVectorTypes.cpp | 6 +--
.../test/CodeGen/AArch64/sve-insert-vector.ll | 50 ++++++++-----------
2 files changed, 24 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 645ee0e187c17..ce23bb4fa202f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -7545,10 +7545,10 @@ SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) {
DAG.getMaskFromElementCount(DL, SubVT, OrigVT.getVectorElementCount());
// Overwrite the sub-vector at the required offset.
- StackPtr =
+ SDValue SubVecPtr =
TLI.getVectorSubVecPointer(DAG, StackPtr, VT, OrigVT, N->getOperand(2));
- Ch = DAG.getMaskedStore(Ch, DL, SubVec, StackPtr,
- DAG.getUNDEF(StackPtr.getValueType()), Mask, VT,
+ Ch = DAG.getMaskedStore(Ch, DL, SubVec, SubVecPtr,
+ DAG.getUNDEF(SubVecPtr.getValueType()), Mask, VT,
StoreMMO, ISD::UNINDEXED, ISD::NON_EXTLOAD);
// Read back the result.
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
index 4b3c322ad7e4e..dc113acb35a5b 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -1352,14 +1352,13 @@ define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_1(<vscale x 4 x i32> %vec, <vs
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: rdvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: cntw x8
-; CHECK-NEXT: add x10, x9, x8
-; CHECK-NEXT: st1w { z1.s }, p0, [x10]
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [x8]
+; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -1377,14 +1376,13 @@ define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_2(<vscale x 4 x i32> %vec, <vs
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: rdvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: cnth x8
-; CHECK-NEXT: add x10, x9, x8
-; CHECK-NEXT: st1w { z1.s }, p0, [x10]
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [x8]
+; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -1402,14 +1400,13 @@ define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_3(<vscale x 4 x i32> %vec, <vs
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: rdvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: whilelo p0.s, xzr, x8
; CHECK-NEXT: cntw x8, all, mul #3
-; CHECK-NEXT: add x10, x9, x8
-; CHECK-NEXT: st1w { z1.s }, p0, [x10]
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [x8]
+; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -1456,7 +1453,7 @@ define <vscale x 2 x float> @insert_nxv1f32_nxv2f32_1(<vscale x 2 x float> %vec,
; CHECK-NEXT: cntw x8
; CHECK-NEXT: add x8, x9, x8
; CHECK-NEXT: st1w { z1.d }, p1, [x8]
-; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -1498,13 +1495,12 @@ define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_1(<vscale x 4 x float> %vec,
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: lsr x8, x8, #4
-; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: cntw x8
-; CHECK-NEXT: add x10, x9, x8
-; CHECK-NEXT: st1w { z1.d }, p0, [x10]
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: st1w { z1.d }, p0, [x8]
+; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -1522,14 +1518,11 @@ define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_2(<vscale x 4 x float> %vec,
; CHECK-NEXT: addvl sp, sp, #-1
; CHECK-NEXT: rdvl x8, #1
; CHECK-NEXT: uunpklo z1.d, z1.s
-; CHECK-NEXT: ptrue p1.b
-; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: str z0, [sp]
-; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: lsr x8, x8, #4
; CHECK-NEXT: whilelo p0.d, xzr, x8
-; CHECK-NEXT: cnth x8
; CHECK-NEXT: st1w { z1.d }, p0, [sp, #1, mul vl]
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
@@ -1549,13 +1542,12 @@ define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_3(<vscale x 4 x float> %vec,
; CHECK-NEXT: uunpklo z1.d, z1.s
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: lsr x8, x8, #4
-; CHECK-NEXT: ptrue p1.b
; CHECK-NEXT: str z0, [sp]
; CHECK-NEXT: whilelo p0.d, xzr, x8
; CHECK-NEXT: cntw x8, all, mul #3
-; CHECK-NEXT: add x10, x9, x8
-; CHECK-NEXT: st1w { z1.d }, p0, [x10]
-; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: st1w { z1.d }, p0, [x8]
+; CHECK-NEXT: ldr z0, [sp]
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
>From 66a45563f40d6a68ec117907b9dacf81570cc9a2 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Wed, 26 Nov 2025 11:54:49 +0000
Subject: [PATCH 4/4] Corrected naming of vector.extract intrinsics.
---
.../test/CodeGen/AArch64/sve-insert-vector.ll | 20 +++++++++----------
1 file changed, 10 insertions(+), 10 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
index dc113acb35a5b..19827e24184ac 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -1338,7 +1338,7 @@ define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_0(<vscale x 4 x i32> %vec, <vs
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
%retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 0)
ret <vscale x 4 x i32> %retval
}
@@ -1362,7 +1362,7 @@ define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_1(<vscale x 4 x i32> %vec, <vs
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
%retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 1)
ret <vscale x 4 x i32> %retval
}
@@ -1386,7 +1386,7 @@ define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_2(<vscale x 4 x i32> %vec, <vs
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
%retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 2)
ret <vscale x 4 x i32> %retval
}
@@ -1410,7 +1410,7 @@ define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_3(<vscale x 4 x i32> %vec, <vs
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
%retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 3)
ret <vscale x 4 x i32> %retval
}
@@ -1432,7 +1432,7 @@ define <vscale x 2 x float> @insert_nxv1f32_nxv2f32_0(<vscale x 2 x float> %vec,
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv2f32(<vscale x 2 x float> %subvec, i64 0)
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %subvec, i64 0)
%retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> %vec, <vscale x 1 x float> %i, i64 0)
ret <vscale x 2 x float> %retval
}
@@ -1457,7 +1457,7 @@ define <vscale x 2 x float> @insert_nxv1f32_nxv2f32_1(<vscale x 2 x float> %vec,
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv2f32(<vscale x 2 x float> %subvec, i64 0)
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %subvec, i64 0)
%retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> %vec, <vscale x 1 x float> %i, i64 1)
ret <vscale x 2 x float> %retval
}
@@ -1479,7 +1479,7 @@ define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_0(<vscale x 4 x float> %vec,
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
%retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 0)
ret <vscale x 4 x float> %retval
}
@@ -1504,7 +1504,7 @@ define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_1(<vscale x 4 x float> %vec,
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
%retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 1)
ret <vscale x 4 x float> %retval
}
@@ -1526,7 +1526,7 @@ define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_2(<vscale x 4 x float> %vec,
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
%retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 2)
ret <vscale x 4 x float> %retval
}
@@ -1551,7 +1551,7 @@ define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_3(<vscale x 4 x float> %vec,
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
- %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
%retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 3)
ret <vscale x 4 x float> %retval
}