[llvm] [LLVM][CodeGen][SVE] Add lowering for 3-way VECTOR_(DE)INTERLEAVE operations. (PR #162502)
Paul Walker via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 8 08:57:38 PDT 2025
https://github.com/paulwalker-arm created https://github.com/llvm/llvm-project/pull/162502
SVE has no in-register instructions to do this, but we can perform the operation through memory by using ld3/st3.
NOTE: I plan to follow up with a combine to spot cases where data is either read from (VECTOR_DEINTERLEAVE) or written to (VECTOR_INTERLEAVE) memory to remove the need to use a stack temporary.
From 6485a845b72440c1c4d026755efad3bde7f0a845 Mon Sep 17 00:00:00 2001
From: Paul Walker <paul.walker at arm.com>
Date: Wed, 8 Oct 2025 13:14:43 +0100
Subject: [PATCH] [LLVM][CodeGen][SVE] Add lowering for 3-way VECTOR_(DE)INTERLEAVE operations.
SVE has no in-register instructions to do this, but we can
perform the operation through memory by using ld3/st3.
NOTE: I plan to follow up with a combine to spot cases where
data is either read from (VECTOR_DEINTERLEAVE) or written to
(VECTOR_INTERLEAVE) memory to remove the need to use a stack
temporary.
---
.../Target/AArch64/AArch64ISelLowering.cpp | 73 ++++
.../AArch64/sve-vector-deinterleave.ll | 296 +++++++++++++++--
.../CodeGen/AArch64/sve-vector-interleave.ll | 312 ++++++++++++++++++
3 files changed, 653 insertions(+), 28 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index dc8e7c84f5e2c..3f338e1f5c282 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -30605,6 +30605,43 @@ AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
assert(OpVT.isScalableVector() &&
"Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
+ if (Op->getNumOperands() == 3) {
+ // aarch64_sve_ld3 only supports packed datatypes.
+ EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
+ Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
+
+ // Write out unmodified operands.
+ SmallVector<SDValue, 3> Chains;
+ for (unsigned I = 0; I < 3; ++I) {
+ SDValue Ptr =
+ DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
+ SDValue V = getSVESafeBitCast(PackedVT, Op.getOperand(I), DAG);
+ Chains.push_back(
+ DAG.getStore(DAG.getEntryNode(), DL, V, Ptr, MachinePointerInfo()));
+ }
+
+ Intrinsic::ID IntID = Intrinsic::aarch64_sve_ld3_sret;
+ EVT PredVT = PackedVT.changeVectorElementType(MVT::i1);
+
+ SmallVector<SDValue, 7> Ops;
+ Ops.push_back(DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains));
+ Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
+ Ops.push_back(DAG.getConstant(1, DL, PredVT));
+ Ops.push_back(StackPtr);
+
+ // Read back and deinterleave data.
+ SDVTList VTs = DAG.getVTList(PackedVT, PackedVT, PackedVT, MVT::Other);
+ SDValue LD3 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops);
+
+ SmallVector<SDValue, 3> Results;
+ Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(0), DAG));
+ Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(1), DAG));
+ Results.push_back(getSVESafeBitCast(OpVT, LD3.getValue(2), DAG));
+ return DAG.getMergeValues(Results, DL);
+ }
+
// Are multi-register uzp instructions available?
if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
OpVT.getVectorElementType() != MVT::i1) {
@@ -30646,6 +30683,42 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
assert(OpVT.isScalableVector() &&
"Expected scalable vector in LowerVECTOR_INTERLEAVE.");
+ if (Op->getNumOperands() == 3) {
+ // aarch64_sve_st3 only supports packed datatypes.
+ EVT PackedVT = getPackedSVEVectorVT(OpVT.getVectorElementCount());
+ SmallVector<SDValue, 3> InVecs;
+ for (SDValue V : Op->ops())
+ InVecs.push_back(getSVESafeBitCast(PackedVT, V, DAG));
+
+ Align Alignment = DAG.getReducedAlign(PackedVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(PackedVT.getStoreSize() * 3, Alignment);
+
+ Intrinsic::ID IntID = Intrinsic::aarch64_sve_st3;
+ EVT PredVT = PackedVT.changeVectorElementType(MVT::i1);
+
+ SmallVector<SDValue, 7> Ops;
+ Ops.push_back(DAG.getEntryNode());
+ Ops.push_back(DAG.getTargetConstant(IntID, DL, MVT::i64));
+ Ops.append(InVecs);
+ Ops.push_back(DAG.getConstant(1, DL, PredVT));
+ Ops.push_back(StackPtr);
+
+ // Interleave operands and store.
+ SDValue Ch = DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops);
+
+ // Read back the interleaved data.
+ SmallVector<SDValue, 3> Results;
+ for (unsigned I = 0; I < 3; ++I) {
+ SDValue Ptr =
+ DAG.getMemBasePlusOffset(StackPtr, PackedVT.getStoreSize() * I, DL);
+ SDValue L = DAG.getLoad(PackedVT, DL, Ch, Ptr, MachinePointerInfo());
+ Results.push_back(getSVESafeBitCast(OpVT, L, DAG));
+ }
+
+ return DAG.getMergeValues(Results, DL);
+ }
+
// Are multi-register zip instructions available?
if (Subtarget->hasSME2() && Subtarget->isStreaming() &&
OpVT.getVectorElementType() != MVT::i1) {
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index 139ecafaff0eb..67197b3fe4e80 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -231,6 +231,274 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv
ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
}
+define {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @vector_deinterleave_nxv2f16_nxv6f16(<vscale x 6 x half> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2f16_nxv6f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} @llvm.vector.deinterleave3.nxv6f16(<vscale x 6 x half> %vec)
+ ret {<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x half>} %retval
+}
+
+define {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @vector_deinterleave_nxv4f16_nxv12f16(<vscale x 12 x half> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4f16_nxv12f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpkhi z2.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave3.nxv12f16(<vscale x 12 x half> %vec)
+ ret {<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x half>} %retval
+}
+
+define {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @vector_deinterleave_nxv8f16_nxv24f16(<vscale x 24 x half> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv8f16_nxv24f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3h { z0.h - z2.h }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave3.nxv24f16(<vscale x 24 x half> %vec)
+ ret {<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>} %retval
+}
+
+define {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @vector_deinterleave_nxv2f32_nxv6f32(<vscale x 6 x float> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2f32_nxv6f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave3.nxv6f32(<vscale x 6 x float> %vec)
+ ret {<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x float>} %retval
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @vector_deinterleave_nxv4f32_nxv12f32(<vscale x 12 x float> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4f32_nxv12f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave3.nxv12f32(<vscale x 12 x float> %vec)
+ ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %retval
+}
+
+define {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @vector_deinterleave_nxv2f64_nxv6f64(<vscale x 6 x double> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2f64_nxv6f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave3.nxv6f64(<vscale x 6 x double> %vec)
+ ret {<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>} %retval
+}
+
+define {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @vector_deinterleave_nxv2bf16_nxv6bf16(<vscale x 6 x bfloat> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2bf16_nxv6bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} @llvm.vector.deinterleave3.nxv6bf16(<vscale x 6 x bfloat> %vec)
+ ret {<vscale x 2 x bfloat>, <vscale x 2 x bfloat>, <vscale x 2 x bfloat>} %retval
+}
+
+define {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @vector_deinterleave_nxv4bf16_nxv12bf16(<vscale x 12 x bfloat> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4bf16_nxv12bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpkhi z2.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: str z1, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} @llvm.vector.deinterleave3.nxv12bf16(<vscale x 12 x bfloat> %vec)
+ ret {<vscale x 4 x bfloat>, <vscale x 4 x bfloat>, <vscale x 4 x bfloat>} %retval
+}
+
+define {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @vector_deinterleave_nxv8bf16_nxv24bf16(<vscale x 24 x bfloat> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv8bf16_nxv24bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3h { z0.h - z2.h }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} @llvm.vector.deinterleave3.nxv24bf16(<vscale x 24 x bfloat> %vec)
+ ret {<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>} %retval
+}
+
+; Integers
+
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv48i8(<vscale x 48 x i8> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv48i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3b { z0.b - z2.b }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave3.nxv48i8(<vscale x 48 x i8> %vec)
+ ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv24i16(<vscale x 24 x i16> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv24i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3h { z0.h - z2.h }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave3.nxv24i16(<vscale x 24 x i16> %vec)
+ ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxvv12i32(<vscale x 12 x i32> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4i32_nxvv12i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3w { z0.s - z2.s }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave3.nxv12i32(<vscale x 12 x i32> %vec)
+ ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv6i64(<vscale x 6 x i64> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv6i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: str z2, [sp, #2, mul vl]
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp]
+; CHECK-NEXT: ld3d { z0.d - z2.d }, p0/z, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave3.nxv6i64(<vscale x 6 x i64> %vec)
+ ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv64i8(<vscale x 64 x i8> %vec) {
; SVE-LABEL: vector_deinterleave_nxv16i8_nxv64i8:
; SVE: // %bb.0:
@@ -599,31 +867,3 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @vector_deinterleave_nxv2i32_nxv
%retval = call {<vscale x 2 x i32>,<vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32> %vec)
ret {<vscale x 2 x i32>, <vscale x 2 x i32>} %retval
}
-
-; Floating declarations
-declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
-declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
-declare {<vscale x 2 x float>, <vscale x 2 x float>} @llvm.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
-declare {<vscale x 8 x half>, <vscale x 8 x half>} @llvm.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
-declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
-declare {<vscale x 2 x double>, <vscale x 2 x double>} @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
-
-; Integer declarations
-declare {<vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
-declare {<vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
-declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
-declare {<vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
-
-; Predicated declarations
-declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.vector.deinterleave2.nxv32i1(<vscale x 32 x i1>)
-declare {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.vector.deinterleave2.nxv16i1(<vscale x 16 x i1>)
-declare {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.vector.deinterleave2.nxv8i1(<vscale x 8 x i1>)
-declare {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.vector.deinterleave2.nxv4i1(<vscale x 4 x i1>)
-
-; Illegal size type
-declare {<vscale x 4 x i64>, <vscale x 4 x i64>} @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64>)
-declare {<vscale x 8 x i64>, <vscale x 8 x i64>} @llvm.vector.deinterleave2.nxv16i64(<vscale x 16 x i64>)
-
-declare {<vscale x 8 x i8>, <vscale x 8 x i8>} @llvm.vector.deinterleave2.nxv16i8(<vscale x 16 x i8>)
-declare {<vscale x 4 x i16>, <vscale x 4 x i16>} @llvm.vector.deinterleave2.nxv8i16(<vscale x 8 x i16>)
-declare {<vscale x 2 x i32>, <vscale x 2 x i32>} @llvm.vector.deinterleave2.nxv4i32(<vscale x 4 x i32>)
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
index c7fb2db53d2a3..49f185c4312a2 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
@@ -221,6 +221,318 @@ define <vscale x 4 x i64> @interleave2_nxv4i64(<vscale x 2 x i64> %vec0, <vscale
ret <vscale x 4 x i64> %retval
}
+define <vscale x 6 x half> @interleave3_nxv6f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1, <vscale x 2 x half> %vec2) {
+; CHECK-LABEL: interleave3_nxv6f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3d { z0.d - z2.d }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z1.s, z2.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 6 x half> @llvm.vector.interleave3.nxv6f16(<vscale x 2 x half> %vec0, <vscale x 2 x half> %vec1, <vscale x 2 x half> %vec2)
+ ret <vscale x 6 x half> %retval
+}
+
+define <vscale x 12 x half> @interleave3_nxv12f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1, <vscale x 4 x half> %vec2) {
+; CHECK-LABEL: interleave3_nxv12f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x28, 0x1e, 0x22 // sp + 16 + 40 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: addpl x8, sp, #4
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3w { z0.s - z2.s }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z1, [sp]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: st1h { z2.s }, p0, [x8, #7, mul vl]
+; CHECK-NEXT: str z0, [sp, #3, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #4, mul vl]
+; CHECK-NEXT: ldr z0, [sp, #3, mul vl]
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 12 x half> @llvm.vector.interleave3.nxv12f16(<vscale x 4 x half> %vec0, <vscale x 4 x half> %vec1, <vscale x 4 x half> %vec2)
+ ret <vscale x 12 x half> %retval
+}
+
+define <vscale x 24 x half> @interleave3_nxv24f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1, <vscale x 8 x half> %vec2) {
+; CHECK-LABEL: interleave3_nxv24f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3h { z0.h - z2.h }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 24 x half> @llvm.vector.interleave3.nxv24f16(<vscale x 8 x half> %vec0, <vscale x 8 x half> %vec1, <vscale x 8 x half> %vec2)
+ ret <vscale x 24 x half> %retval
+}
+
+define <vscale x 6 x float> @interleave3_nxv6f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1, <vscale x 2 x float> %vec2) {
+; CHECK-LABEL: interleave3_nxv6f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x28, 0x1e, 0x22 // sp + 16 + 40 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: addpl x8, sp, #4
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3d { z0.d - z2.d }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z1, [sp]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: st1w { z2.d }, p0, [x8, #7, mul vl]
+; CHECK-NEXT: str z0, [sp, #3, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #4, mul vl]
+; CHECK-NEXT: ldr z0, [sp, #3, mul vl]
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 6 x float> @llvm.vector.interleave3.nxv6f32(<vscale x 2 x float> %vec0, <vscale x 2 x float> %vec1, <vscale x 2 x float> %vec2)
+ ret <vscale x 6 x float> %retval
+}
+
+define <vscale x 12 x float> @interleave3_nxv12f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2) {
+; CHECK-LABEL: interleave3_nxv12f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3w { z0.s - z2.s }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 12 x float> @llvm.vector.interleave3.nxv12f32(<vscale x 4 x float> %vec0, <vscale x 4 x float> %vec1, <vscale x 4 x float> %vec2)
+ ret <vscale x 12 x float> %retval
+}
+
+define <vscale x 6 x double> @interleave3_nxv6f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1, <vscale x 2 x double> %vec2) {
+; CHECK-LABEL: interleave3_nxv6f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3d { z0.d - z2.d }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 6 x double>@llvm.vector.interleave3.nxv6f64(<vscale x 2 x double> %vec0, <vscale x 2 x double> %vec1, <vscale x 2 x double> %vec2)
+ ret <vscale x 6 x double> %retval
+}
+
+define <vscale x 6 x bfloat> @interleave3_nxv6bf16(<vscale x 2 x bfloat> %vec0, <vscale x 2 x bfloat> %vec1, <vscale x 2 x bfloat> %vec2) {
+; CHECK-LABEL: interleave3_nxv6bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3d { z0.d - z2.d }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z1.s, z2.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 6 x bfloat> @llvm.vector.interleave3.nxv6bf16(<vscale x 2 x bfloat> %vec0, <vscale x 2 x bfloat> %vec1, <vscale x 2 x bfloat> %vec2)
+ ret <vscale x 6 x bfloat> %retval
+}
+
+define <vscale x 12 x bfloat> @interleave3_nxv12bf16(<vscale x 4 x bfloat> %vec0, <vscale x 4 x bfloat> %vec1, <vscale x 4 x bfloat> %vec2) {
+; CHECK-LABEL: interleave3_nxv12bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-5
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x28, 0x1e, 0x22 // sp + 16 + 40 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: addpl x8, sp, #4
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3w { z0.s - z2.s }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z1, [sp]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: st1h { z2.s }, p0, [x8, #7, mul vl]
+; CHECK-NEXT: str z0, [sp, #3, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #4, mul vl]
+; CHECK-NEXT: ldr z0, [sp, #3, mul vl]
+; CHECK-NEXT: addvl sp, sp, #5
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 12 x bfloat> @llvm.vector.interleave3.nxv12bf16(<vscale x 4 x bfloat> %vec0, <vscale x 4 x bfloat> %vec1, <vscale x 4 x bfloat> %vec2)
+ ret <vscale x 12 x bfloat> %retval
+}
+
+; Three-way interleave of <vscale x 8 x bfloat> operands into a
+; <vscale x 24 x bfloat> result. The expected code performs the interleave
+; through memory: one st3h into a three-vector stack area (addvl #-3), then
+; three ldr instructions reload z0-z2 as the result.
+define <vscale x 24 x bfloat> @interleave3_nxv24bf16(<vscale x 8 x bfloat> %vec0, <vscale x 8 x bfloat> %vec1, <vscale x 8 x bfloat> %vec2) {
+; CHECK-LABEL: interleave3_nxv24bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3h { z0.h - z2.h }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 24 x bfloat> @llvm.vector.interleave3.nxv24bf16(<vscale x 8 x bfloat> %vec0, <vscale x 8 x bfloat> %vec1, <vscale x 8 x bfloat> %vec2)
+ ret <vscale x 24 x bfloat> %retval
+}
+
+; Integers
+
+; Three-way interleave of <vscale x 16 x i8> operands into a
+; <vscale x 48 x i8> result. The expected code performs the interleave through
+; memory: one st3b into a three-vector stack area (addvl #-3), then three ldr
+; instructions reload z0-z2 as the result.
+define <vscale x 48 x i8> @interleave3_nxv48i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2) {
+; CHECK-LABEL: interleave3_nxv48i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.b
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3b { z0.b - z2.b }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 48 x i8> @llvm.vector.interleave3.nxv48i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2)
+ ret <vscale x 48 x i8> %retval
+}
+
+; Three-way interleave of <vscale x 8 x i16> operands into a
+; <vscale x 24 x i16> result. The expected code performs the interleave through
+; memory: one st3h into a three-vector stack area (addvl #-3), then three ldr
+; instructions reload z0-z2 as the result.
+define <vscale x 24 x i16> @interleave3_nxv24i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2) {
+; CHECK-LABEL: interleave3_nxv24i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3h { z0.h - z2.h }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 24 x i16> @llvm.vector.interleave3.nxv24i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2)
+ ret <vscale x 24 x i16> %retval
+}
+
+; Three-way interleave of <vscale x 4 x i32> operands into a
+; <vscale x 12 x i32> result. The expected code performs the interleave through
+; memory: one st3w into a three-vector stack area (addvl #-3), then three ldr
+; instructions reload z0-z2 as the result.
+define <vscale x 12 x i32> @interleave3_nxv12i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2) {
+; CHECK-LABEL: interleave3_nxv12i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3w { z0.s - z2.s }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 12 x i32> @llvm.vector.interleave3.nxv12i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2)
+ ret <vscale x 12 x i32> %retval
+}
+
+; Three-way interleave of <vscale x 2 x i64> operands into a
+; <vscale x 6 x i64> result. The expected code performs the interleave through
+; memory: one st3d into a three-vector stack area (addvl #-3), then three ldr
+; instructions reload z0-z2 as the result.
+define <vscale x 6 x i64> @interleave3_nxv6i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2) {
+; CHECK-LABEL: interleave3_nxv6i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2 def $z0_z1_z2
+; CHECK-NEXT: st3d { z0.d - z2.d }, p0, [sp]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z2, [sp, #2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %retval = call <vscale x 6 x i64> @llvm.vector.interleave3.nxv6i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2)
+ ret <vscale x 6 x i64> %retval
+}
+
define <vscale x 64 x i8> @interleave4_nxv16i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2, <vscale x 16 x i8> %vec3) {
; SVE-LABEL: interleave4_nxv16i8:
; SVE: // %bb.0:
More information about the llvm-commits
mailing list