[llvm] [LLVM][CodeGen] Remove failure cases when widening EXTRACT/INSERT_SUBVECTOR. (PR #162308)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 7 08:48:10 PDT 2025
https://github.com/paulwalker-arm created https://github.com/llvm/llvm-project/pull/162308
This PR implements catch-all handling for widening the scalable subvector operand of INSERT_SUBVECTOR and the scalable result of EXTRACT_SUBVECTOR. It does this via the stack, using masked memory operations. With general handling available, we can add optimizations for specific cases.
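Conceptually, the EXTRACT_SUBVECTOR fallback is equivalent to the following hand-written IR sketch (illustrative only, not taken from the patch; the function name, the fixed index of 1, and the alignment are invented):

define <vscale x 4 x i32> @extract_nxv1i32_sketch(<vscale x 4 x i32> %vec) {
  ; Spill the wide input vector to a stack slot.
  %slot = alloca <vscale x 4 x i32>
  store <vscale x 4 x i32> %vec, ptr %slot
  ; The nxv1i32 sub-vector at index 1 starts at element vscale * 1.
  %vscale = call i64 @llvm.vscale.i64()
  %addr = getelementptr i32, ptr %slot, i64 %vscale
  ; Mask only the vscale lanes belonging to the non-widened nxv1i32 result.
  %mask = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 %vscale)
  ; Masked-load the widened result, leaving the inactive lanes as poison.
  %res = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %addr, i32 4, <vscale x 4 x i1> %mask, <vscale x 4 x i32> poison)
  ret <vscale x 4 x i32> %res
}

The INSERT_SUBVECTOR path is the mirror image: store the wide vector, masked-store the sub-vector at the required offset, then load the whole vector back.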
From 4b4fd331fc3ca88a21e3307099b282ed4c3f04d2 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Fri, 26 Sep 2025 12:00:46 +0100
Subject: [PATCH] [LLVM][CodeGen] Remove failure cases when widening
EXTRACT/INSERT_SUBVECTOR.
This PR implements catch-all handling for widening the scalable
subvector operand of INSERT_SUBVECTOR and the scalable result of
EXTRACT_SUBVECTOR. It does this via the stack, using masked memory
operations. With general handling available, we can add optimizations
for specific cases.
---
llvm/include/llvm/CodeGen/SelectionDAG.h | 6 +
.../SelectionDAG/LegalizeVectorTypes.cpp | 90 ++++--
.../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 8 +
.../AArch64/sve-extract-scalable-vector.ll | 253 +++++++++++----
.../test/CodeGen/AArch64/sve-insert-vector.ll | 287 +++++++++++++++---
5 files changed, 518 insertions(+), 126 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index d9d6f0bcdcb84..f935442404cde 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1186,6 +1186,12 @@ class SelectionDAG {
LLVM_ABI SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
bool ConstantFold = true);
+ /// Return a vector with the first 'Len' lanes set to true and the remaining
+ /// lanes set to false. The mask's ValueType is the same as when comparing
+ /// vectors of type VT.
+ LLVM_ABI SDValue getMaskFromElementCount(const SDLoc &DL, EVT VT,
+ ElementCount Len);
+
/// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc.
SDValue getGLOBAL_OFFSET_TABLE(EVT VT) {
return getNode(ISD::GLOBAL_OFFSET_TABLE, SDLoc(), VT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 87d5453cd98cf..bcaac40de5459 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -6201,8 +6201,33 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Parts);
}
- report_fatal_error("Don't know how to widen the result of "
- "EXTRACT_SUBVECTOR for scalable vectors");
+ // Fall back to extracting through memory.
+
+ Align Alignment = DAG.getReducedAlign(InVT, /*UseABI=*/false);
+ SDValue StackPtr = DAG.CreateStackTemporary(InVT.getStoreSize(), Alignment);
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+ MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+ MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+
+ // Write out the input vector.
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, StoreMMO);
+
+ // Build a mask to match the length of the non-widened result.
+ SDValue Mask =
+ DAG.getMaskFromElementCount(dl, WidenVT, VT.getVectorElementCount());
+
+ // Read back the sub-vector, setting the remaining lanes to poison.
+ StackPtr = TLI.getVectorSubVecPointer(DAG, StackPtr, InVT, VT, Idx);
+ return DAG.getMaskedLoad(
+ WidenVT, dl, Ch, StackPtr, DAG.getUNDEF(StackPtr.getValueType()), Mask,
+ DAG.getPOISON(WidenVT), VT, LoadMMO, ISD::UNINDEXED, ISD::NON_EXTLOAD);
}
// We could try widening the input to the right length but for now, extract
@@ -6306,11 +6331,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
if (VT.isVector()) {
// If all else fails replace the load with a wide masked load.
SDLoc DL(N);
- EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-
- SDValue Len = DAG.getElementCount(DL, IdxVT, VT.getVectorElementCount());
- SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
- DAG.getConstant(0, DL, IdxVT), Len);
+ SDValue Mask =
+ DAG.getMaskFromElementCount(DL, WideVT, VT.getVectorElementCount());
SDValue NewLoad = DAG.getMaskedLoad(
WideVT, DL, LD->getChain(), LD->getBasePtr(), LD->getOffset(), Mask,
@@ -7447,9 +7469,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) {
SDValue InVec = N->getOperand(0);
EVT OrigVT = SubVec.getValueType();
- if (getTypeAction(SubVec.getValueType()) == TargetLowering::TypeWidenVector)
- SubVec = GetWidenedVector(SubVec);
-
+ SubVec = GetWidenedVector(SubVec);
EVT SubVT = SubVec.getValueType();
// Whether or not all the elements of the widened SubVec will be inserted into
@@ -7471,17 +7491,52 @@ SDValue DAGTypeLegalizer::WidenVecOp_INSERT_SUBVECTOR(SDNode *N) {
}
}
+ if (!IndicesValid)
+ report_fatal_error(
+ "Don't know how to widen the operands for INSERT_SUBVECTOR");
+
SDLoc DL(N);
// We need to make sure that the indices are still valid, otherwise we might
// widen what was previously well-defined to something undefined.
- if (IndicesValid && InVec.isUndef() && N->getConstantOperandVal(2) == 0)
+ if (InVec.isUndef() && N->getConstantOperandVal(2) == 0)
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, InVec, SubVec,
N->getOperand(2));
- if (!IndicesValid || OrigVT.isScalableVector())
- report_fatal_error(
- "Don't know how to widen the operands for INSERT_SUBVECTOR");
+ if (OrigVT.isScalableVector()) {
+ // Fall back to inserting through memory.
+
+ Align Alignment = DAG.getReducedAlign(VT, /*UseABI=*/false);
+ SDValue StackPtr = DAG.CreateStackTemporary(VT.getStoreSize(), Alignment);
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+ MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+ MachineMemOperand *LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOLoad,
+ LocationSize::beforeOrAfterPointer(), Alignment);
+
+ // Write out the vector being inserted into.
+ SDValue Ch =
+ DAG.getStore(DAG.getEntryNode(), DL, InVec, StackPtr, StoreMMO);
+
+ // Build a mask to match the length of the sub-vector.
+ SDValue Mask =
+ DAG.getMaskFromElementCount(DL, SubVT, OrigVT.getVectorElementCount());
+
+ // Overwrite the sub-vector at the required offset.
+ StackPtr =
+ TLI.getVectorSubVecPointer(DAG, StackPtr, VT, OrigVT, N->getOperand(2));
+ Ch = DAG.getMaskedStore(Ch, DL, SubVec, StackPtr,
+ DAG.getUNDEF(StackPtr.getValueType()), Mask, VT,
+ StoreMMO, ISD::UNINDEXED, ISD::NON_EXTLOAD);
+
+ // Read back the result.
+ return DAG.getLoad(VT, DL, Ch, StackPtr, LoadMMO);
+ }
// If the operands can't be widened legally, just replace the INSERT_SUBVECTOR
// with a series of INSERT_VECTOR_ELT
@@ -7560,12 +7615,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
if (StVT.isVector()) {
// If all else fails replace the store with a wide masked store.
SDLoc DL(N);
- EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
-
SDValue WideStVal = GetWidenedVector(StVal);
- SDValue Len = DAG.getElementCount(DL, IdxVT, StVT.getVectorElementCount());
- SDValue Mask = DAG.getNode(ISD::GET_ACTIVE_LANE_MASK, DL, WideMaskVT,
- DAG.getConstant(0, DL, IdxVT), Len);
+ SDValue Mask =
+ DAG.getMaskFromElementCount(DL, WideVT, StVT.getVectorElementCount());
return DAG.getMaskedStore(ST->getChain(), DL, WideStVal, ST->getBasePtr(),
ST->getOffset(), Mask, ST->getMemoryVT(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 95f53fe0bfdba..d976c0ce1b901 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -2112,6 +2112,14 @@ SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, ElementCount EC,
return getConstant(EC.getKnownMinValue(), DL, VT);
}
+SDValue SelectionDAG::getMaskFromElementCount(const SDLoc &DL, EVT DataVT,
+ ElementCount EC) {
+ EVT IdxVT = TLI->getVectorIdxTy(getDataLayout());
+ EVT MaskVT = TLI->getSetCCResultType(getDataLayout(), *getContext(), DataVT);
+ return getNode(ISD::GET_ACTIVE_LANE_MASK, DL, MaskVT,
+ getConstant(0, DL, IdxVT), getElementCount(DL, IdxVT, EC));
+}
+
SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) {
APInt One(ResVT.getScalarSizeInBits(), 1);
return getStepVector(DL, ResVT, One);
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
index 4aaa25e5e66c5..8d0c71502f1e3 100644
--- a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll
@@ -3,12 +3,199 @@
; Extracting illegal subvectors
-define <vscale x 1 x i32> @extract_nxv1i32_nxv4i32(<vscale x 4 x i32> %vec) nounwind {
-; CHECK-LABEL: extract_nxv1i32_nxv4i32:
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_0(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_0:
; CHECK: // %bb.0:
; CHECK-NEXT: ret
- %retval = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
- ret <vscale x 1 x i32> %retval
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_1(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 1)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_2(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 2)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x i32> @extract_nxv1i32_nxv4i32_3(<vscale x 4 x i32> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1i32_nxv4i32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32> %vec, i64 3)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> poison, <vscale x 1 x i32> %e, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 2 x float> @extract_nxv1f32_nxv2f32_0(<vscale x 2 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv2f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %vec, i64 0)
+ %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 2 x float> @extract_nxv1f32_nxv2f32_1(<vscale x 2 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv2f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: addpl x9, sp, #4
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv2f32(<vscale x 2 x float> %vec, i64 1)
+ %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_0(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_1(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 1)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_2(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 2)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Insert sub-vector into a legal type to avoid relying on an undefined
+; calling convention.
+define <vscale x 4 x float> @extract_nxv1f32_nxv4f32_3(<vscale x 4 x float> %vec) nounwind {
+; CHECK-LABEL: extract_nxv1f32_nxv4f32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %e = call <vscale x 1 x float> @llvm.vector.extract.nxv1f32.nxv4f32(<vscale x 4 x float> %vec, i64 3)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> poison, <vscale x 1 x float> %e, i64 0)
+ ret <vscale x 4 x float> %retval
}
define <vscale x 1 x i16> @extract_nxv1i16_nxv6i16(<vscale x 6 x i16> %vec) nounwind {
@@ -19,9 +206,6 @@ define <vscale x 1 x i16> @extract_nxv1i16_nxv6i16(<vscale x 6 x i16> %vec) noun
ret <vscale x 1 x i16> %retval
}
-declare <vscale x 1 x i32> @llvm.vector.extract.nxv1i32.nxv4i32(<vscale x 4 x i32>, i64)
-declare <vscale x 1 x i16> @llvm.vector.extract.nxv1i16.nxv6i16(<vscale x 6 x i16>, i64)
-
;
; Extract half i1 vector that needs promotion from legal type.
;
@@ -43,8 +227,6 @@ define <vscale x 8 x i1> @extract_nxv8i1_nxv16i1_8(<vscale x 16 x i1> %in) {
ret <vscale x 8 x i1> %res
}
-declare <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv16i1(<vscale x 16 x i1>, i64)
-
;
; Extract i1 vector that needs widening from one that needs widening.
;
@@ -99,8 +281,6 @@ define <vscale x 14 x i1> @extract_nxv14i1_nxv28i1_14(<vscale x 28 x i1> %in) uw
ret <vscale x 14 x i1> %res
}
-declare <vscale x 14 x i1> @llvm.vector.extract.nxv14i1.nxv28i1(<vscale x 28 x i1>, i64)
-
;
; Extract half i1 vector that needs promotion from one that needs splitting.
;
@@ -140,8 +320,6 @@ define <vscale x 8 x i1> @extract_nxv8i1_nxv32i1_24(<vscale x 32 x i1> %in) {
ret <vscale x 8 x i1> %res
}
-declare <vscale x 8 x i1> @llvm.vector.extract.nxv8i1.nxv32i1(<vscale x 32 x i1>, i64)
-
;
; Extract 1/4th i1 vector that needs promotion from legal type.
;
@@ -185,8 +363,6 @@ define <vscale x 4 x i1> @extract_nxv4i1_nxv16i1_12(<vscale x 16 x i1> %in) {
ret <vscale x 4 x i1> %res
}
-declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv16i1(<vscale x 16 x i1>, i64)
-
;
; Extract 1/8th i1 vector that needs promotion from legal type.
;
@@ -278,8 +454,6 @@ define <vscale x 2 x i1> @extract_nxv2i1_nxv16i1_14(<vscale x 16 x i1> %in) {
ret <vscale x 2 x i1> %res
}
-declare <vscale x 2 x i1> @llvm.vector.extract.nxv2i1.nxv16i1(<vscale x 16 x i1>, i64)
-
;
; Extract i1 vector that needs promotion from one that needs widening.
;
@@ -313,8 +487,6 @@ define <vscale x 4 x i1> @extract_nxv4i1_nxv12i1_8(<vscale x 12 x i1> %in) {
ret <vscale x 4 x i1> %res
}
-declare <vscale x 4 x i1> @llvm.vector.extract.nxv4i1.nxv12i1(<vscale x 12 x i1>, i64)
-
;
; Extract 1/8th i8 vector that needs promotion from legal type.
;
@@ -406,8 +578,6 @@ define <vscale x 2 x i8> @extract_nxv2i8_nxv16i8_14(<vscale x 16 x i8> %in) {
ret <vscale x 2 x i8> %res
}
-declare <vscale x 2 x i8> @llvm.vector.extract.nxv2i8.nxv16i8(<vscale x 16 x i8>, i64)
-
;
; Extract i8 vector that needs promotion from one that needs widening.
;
@@ -441,8 +611,6 @@ define <vscale x 4 x i8> @extract_nxv4i8_nxv12i8_8(<vscale x 12 x i8> %in) {
ret <vscale x 4 x i8> %res
}
-declare <vscale x 4 x i8> @llvm.vector.extract.nxv4i8.nxv12i8(<vscale x 12 x i8>, i64)
-
;
; Extract i8 vector that needs both widening + promotion from one that needs widening.
; (nxv6i8 -> nxv8i8 -> nxv8i16)
@@ -474,8 +642,6 @@ define <vscale x 6 x i8> @extract_nxv6i8_nxv12i8_6(<vscale x 12 x i8> %in) {
ret <vscale x 6 x i8> %res
}
-declare <vscale x 6 x i8> @llvm.vector.extract.nxv6i8.nxv12i8(<vscale x 12 x i8>, i64)
-
;
; Extract half i8 vector that needs promotion from one that needs splitting.
;
@@ -515,8 +681,6 @@ define <vscale x 8 x i8> @extract_nxv8i8_nxv32i8_24(<vscale x 32 x i8> %in) {
ret <vscale x 8 x i8> %res
}
-declare <vscale x 8 x i8> @llvm.vector.extract.nxv8i8.nxv32i8(<vscale x 32 x i8>, i64)
-
;
; Extract half i8 vector that needs promotion from legal type.
;
@@ -538,8 +702,6 @@ define <vscale x 8 x i8> @extract_nxv8i8_nxv16i8_8(<vscale x 16 x i8> %in) {
ret <vscale x 8 x i8> %res
}
-declare <vscale x 8 x i8> @llvm.vector.extract.nxv8i8.nxv16i8(<vscale x 16 x i8>, i64)
-
;
; Extract i8 vector that needs widening from one that needs widening.
;
@@ -625,8 +787,6 @@ define <vscale x 14 x i8> @extract_nxv14i8_nxv28i8_14(<vscale x 28 x i8> %in) {
ret <vscale x 14 x i8> %res
}
-declare <vscale x 14 x i8> @llvm.vector.extract.nxv14i8.nxv28i8(<vscale x 28 x i8>, i64)
-
;
; Extract 1/4th i8 vector that needs promotion from legal type.
;
@@ -670,8 +830,6 @@ define <vscale x 4 x i8> @extract_nxv4i8_nxv16i8_12(<vscale x 16 x i8> %in) {
ret <vscale x 4 x i8> %res
}
-declare <vscale x 4 x i8> @llvm.vector.extract.nxv4i8.nxv16i8(<vscale x 16 x i8>, i64)
-
;
; Extract f16 vector that needs promotion from one that needs widening.
;
@@ -705,8 +863,6 @@ define <vscale x 2 x half> @extract_nxv2f16_nxv6f16_4(<vscale x 6 x half> %in) {
ret <vscale x 2 x half> %res
}
-declare <vscale x 2 x half> @llvm.vector.extract.nxv2f16.nxv6f16(<vscale x 6 x half>, i64)
-
;
; Extract half f16 vector that needs promotion from legal type.
;
@@ -728,8 +884,6 @@ define <vscale x 4 x half> @extract_nxv4f16_nxv8f16_4(<vscale x 8 x half> %in) {
ret <vscale x 4 x half> %res
}
-declare <vscale x 4 x half> @llvm.vector.extract.nxv4f16.nxv8f16(<vscale x 8 x half>, i64)
-
;
; Extract f16 vector that needs widening from one that needs widening.
;
@@ -757,8 +911,6 @@ define <vscale x 6 x half> @extract_nxv6f16_nxv12f16_6(<vscale x 12 x half> %in)
ret <vscale x 6 x half> %res
}
-declare <vscale x 6 x half> @llvm.vector.extract.nxv6f16.nxv12f16(<vscale x 12 x half>, i64)
-
;
; Extract half f16 vector that needs promotion from one that needs splitting.
;
@@ -798,8 +950,6 @@ define <vscale x 4 x half> @extract_nxv4f16_nxv16f16_12(<vscale x 16 x half> %in
ret <vscale x 4 x half> %res
}
-declare <vscale x 4 x half> @llvm.vector.extract.nxv4f16.nxv16f16(<vscale x 16 x half>, i64)
-
;
; Extract 1/4th f16 vector that needs promotion from legal type.
;
@@ -843,8 +993,6 @@ define <vscale x 2 x half> @extract_nxv2f16_nxv8f16_6(<vscale x 8 x half> %in) {
ret <vscale x 2 x half> %res
}
-declare <vscale x 2 x half> @llvm.vector.extract.nxv2f16.nxv8f16(<vscale x 8 x half>, i64)
-
;
; Extract half bf16 vector that needs promotion from legal type.
;
@@ -866,8 +1014,6 @@ define <vscale x 4 x bfloat> @extract_nxv4bf16_nxv8bf16_4(<vscale x 8 x bfloat>
ret <vscale x 4 x bfloat> %res
}
-declare <vscale x 4 x bfloat> @llvm.vector.extract.nxv4bf16.nxv8bf16(<vscale x 8 x bfloat>, i64)
-
;
; Extract bf16 vector that needs widening from one that needs widening.
;
@@ -895,8 +1041,6 @@ define <vscale x 6 x bfloat> @extract_nxv6bf16_nxv12bf16_6(<vscale x 12 x bfloat
ret <vscale x 6 x bfloat> %res
}
-declare <vscale x 6 x bfloat> @llvm.vector.extract.nxv6bf16.nxv12bf16(<vscale x 12 x bfloat>, i64)
-
;
; Extract bf16 vector that needs promotion from one that needs widening.
;
@@ -930,8 +1074,6 @@ define <vscale x 2 x bfloat> @extract_nxv2bf16_nxv6bf16_4(<vscale x 6 x bfloat>
ret <vscale x 2 x bfloat> %res
}
-declare <vscale x 2 x bfloat> @llvm.vector.extract.nxv2bf16.nxv6bf16(<vscale x 6 x bfloat>, i64)
-
;
; Extract 1/4th bf16 vector that needs promotion from legal type.
;
@@ -975,8 +1117,6 @@ define <vscale x 2 x bfloat> @extract_nxv2bf16_nxv8bf16_6(<vscale x 8 x bfloat>
ret <vscale x 2 x bfloat> %res
}
-declare <vscale x 2 x bfloat> @llvm.vector.extract.nxv2bf16.nxv8bf16(<vscale x 8 x bfloat>, i64)
-
;
; Extract half bf16 vector that needs promotion from one that needs splitting.
;
@@ -1016,9 +1156,6 @@ define <vscale x 4 x bfloat> @extract_nxv4bf16_nxv16bf16_12(<vscale x 16 x bfloa
ret <vscale x 4 x bfloat> %res
}
-declare <vscale x 4 x bfloat> @llvm.vector.extract.nxv4bf16.nxv16bf16(<vscale x 16 x bfloat>, i64)
-
-
;
; Extract from a splat
;
@@ -1070,9 +1207,6 @@ define <vscale x 2 x i1> @extract_nxv2i1_nxv16i1_all_zero() {
ret <vscale x 2 x i1> %ext
}
-declare <vscale x 2 x float> @llvm.vector.extract.nxv2f32.nxv4f32(<vscale x 4 x float>, i64)
-declare <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32>, i64)
-
;
; Extract nxv1i1 type from: nxv2i1
;
@@ -1427,8 +1561,3 @@ define <vscale x 1 x i1> @extract_nxv1i1_nxv16i1_15(<vscale x 16 x i1> %in) {
%res = call <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv16i1(<vscale x 16 x i1> %in, i64 15)
ret <vscale x 1 x i1> %res
}
-
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv2i1(<vscale x 2 x i1>, i64)
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv4i1(<vscale x 4 x i1>, i64)
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv8i1(<vscale x 8 x i1>, i64)
-declare <vscale x 1 x i1> @llvm.vector.extract.nxv1i1.nxv16i1(<vscale x 16 x i1>, i64)
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
index 73c783d4735f8..26b4739ad4e61 100644
--- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll
@@ -1322,49 +1322,246 @@ define <vscale x 16 x i1> @insert_nxv1i1_nxv16i1_15(<vscale x 16 x i1> %vec, <vs
ret <vscale x 16 x i1> %res
}
-attributes #0 = { vscale_range(2,2) }
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_0(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 0)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_1(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 1)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_2(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 2)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x i32> @insert_nxv1i32_nxv4i32_3(<vscale x 4 x i32> %vec, <vscale x 4 x i32> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1i32_nxv4i32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.s }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x i32> @llvm.vector.extract.nxv1i64.nxv4i32(<vscale x 4 x i32> %subvec, i64 0)
+ %retval = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32> %vec, <vscale x 1 x i32> %i, i64 3)
+ ret <vscale x 4 x i32> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 2 x float> @insert_nxv1f32_nxv2f32_0(<vscale x 2 x float> %vec, <vscale x 2 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv2f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: st1w { z1.d }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv2f32(<vscale x 2 x float> %subvec, i64 0)
+ %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> %vec, <vscale x 1 x float> %i, i64 0)
+ ret <vscale x 2 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 2 x float> @insert_nxv1f32_nxv2f32_1(<vscale x 2 x float> %vec, <vscale x 2 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv2f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: addpl x9, sp, #4
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: st1w { z0.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: whilelo p1.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x8, x9, x8
+; CHECK-NEXT: st1w { z1.d }, p1, [x8]
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv2f32(<vscale x 2 x float> %subvec, i64 0)
+ %retval = call <vscale x 2 x float> @llvm.vector.insert.nxv2f32.nxv1f32(<vscale x 2 x float> %vec, <vscale x 1 x float> %i, i64 1)
+ ret <vscale x 2 x float> %retval
+}
-declare <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8>, <16 x i8>, i64)
-
-declare <vscale x 6 x i16> @llvm.vector.insert.nxv6i16.nxv1i16(<vscale x 6 x i16>, <vscale x 1 x i16>, i64)
-declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.nxv2i16(<vscale x 8 x i16>, <vscale x 2 x i16>, i64)
-declare <vscale x 8 x i16> @llvm.vector.insert.nxv8i16.v8i16(<vscale x 8 x i16>, <8 x i16>, i64)
-
-declare <vscale x 3 x i32> @llvm.vector.insert.nxv3i32.nxv2i32(<vscale x 3 x i32>, <vscale x 2 x i32>, i64)
-declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.nxv1i32(<vscale x 4 x i32>, <vscale x 1 x i32>, i64)
-declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32>, <4 x i32>, i64)
-declare <vscale x 12 x i32> @llvm.vector.insert.nxv4i32.nxv12i32(<vscale x 12 x i32>, <vscale x 4 x i32>, i64)
-declare <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv2i32(<vscale x 6 x i32>, <vscale x 2 x i32>, i64)
-declare <vscale x 6 x i32> @llvm.vector.insert.nxv6i32.nxv3i32(<vscale x 6 x i32>, <vscale x 3 x i32>, i64)
-
-declare <vscale x 2 x bfloat> @llvm.vector.insert.nxv2bf16.nxv2bf16(<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, i64)
-declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv2bf16(<vscale x 4 x bfloat>, <vscale x 2 x bfloat>, i64)
-declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.nxv4bf16(<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, i64)
-declare <vscale x 4 x bfloat> @llvm.vector.insert.nxv4bf16.v4bf16(<vscale x 4 x bfloat>, <4 x bfloat>, i64)
-declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, i64)
-declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.nxv4bf16(<vscale x 8 x bfloat>, <vscale x 4 x bfloat>, i64)
-declare <vscale x 8 x bfloat> @llvm.vector.insert.nxv8bf16.v8bf16(<vscale x 8 x bfloat>, <8 x bfloat>, i64)
-
-declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v2i64(<vscale x 2 x i64>, <2 x i64>, i64)
-declare <vscale x 2 x i64> @llvm.vector.insert.nxv2i64.v4i64(<vscale x 2 x i64>, <4 x i64>, i64)
-declare <vscale x 16 x i64> @llvm.vector.insert.nxv8i64.nxv16i64(<vscale x 16 x i64>, <vscale x 8 x i64>, i64)
-declare <vscale x 16 x i64> @llvm.vector.insert.v2i64.nxv16i64(<vscale x 16 x i64>, <2 x i64>, i64)
-
-declare <vscale x 4 x half> @llvm.vector.insert.nxv4f16.nxv2f16(<vscale x 4 x half>, <vscale x 2 x half>, i64)
-declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv2f16(<vscale x 8 x half>, <vscale x 2 x half>, i64)
-declare <vscale x 8 x half> @llvm.vector.insert.nxv8f16.nxv4f16(<vscale x 8 x half>, <vscale x 4 x half>, i64)
-
-declare <vscale x 3 x float> @llvm.vector.insert.nxv3f32.nxv2f32(<vscale x 3 x float>, <vscale x 2 x float>, i64)
-declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float>, <vscale x 1 x float>, i64)
-declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv2f32(<vscale x 4 x float>, <vscale x 2 x float>, i64)
-
-declare <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.v8i1(<vscale x 2 x i1>, <8 x i1>, i64)
-declare <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.v16i1(<vscale x 4 x i1>, <16 x i1>, i64)
-declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.v32i1(<vscale x 8 x i1>, <32 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv1i1(<vscale x 16 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 8 x i1> @llvm.vector.insert.nxv8i1.nxv1i1(<vscale x 8 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 4 x i1> @llvm.vector.insert.nxv4i1.nxv1i1(<vscale x 4 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 2 x i1> @llvm.vector.insert.nxv2i1.nxv1i1(<vscale x 2 x i1>, <vscale x 1 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv4i1(<vscale x 16 x i1>, <vscale x 4 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.nxv8i1(<vscale x 16 x i1>, <vscale x 8 x i1>, i64)
-declare <vscale x 16 x i1> @llvm.vector.insert.nxv16i1.v64i1(<vscale x 16 x i1>, <64 x i1>, i64)
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_0(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_0:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: st1w { z1.d }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 0)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_1(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.d }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 1)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_2(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cnth x8
+; CHECK-NEXT: st1w { z1.d }, p0, [sp, #1, mul vl]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 2)
+ ret <vscale x 4 x float> %retval
+}
+
+; NOTE: Extract input sub-vector from a legal type to avoid relying on an
+; undefined calling convention.
+define <vscale x 4 x float> @insert_nxv1f32_nxv4f32_3(<vscale x 4 x float> %vec, <vscale x 4 x float> %subvec) nounwind {
+; CHECK-LABEL: insert_nxv1f32_nxv4f32_3:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: ptrue p1.b
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: add x10, x9, x8
+; CHECK-NEXT: st1w { z1.d }, p0, [x10]
+; CHECK-NEXT: ld1b { z0.b }, p1/z, [x9, x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %i = call <vscale x 1 x float> @llvm.vector.extract.nxv1i64.nxv4f32(<vscale x 4 x float> %subvec, i64 0)
+ %retval = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %vec, <vscale x 1 x float> %i, i64 3)
+ ret <vscale x 4 x float> %retval
+}
+
+attributes #0 = { vscale_range(2,2) }