[llvm] [AArch64] Lower vector.[de]interleave4 (PR #141513)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Mon May 26 11:59:20 PDT 2025
https://github.com/lukel97 created https://github.com/llvm/llvm-project/pull/141513
After https://github.com/llvm/llvm-project/pull/139893, we now have [de]interleave intrinsics for factors 2-8 inclusive, with the plan to eventually get the loop vectorizer to emit a single intrinsic for these factors instead of recursively deinterleaving (to support scalable non-power-of-2 factors and to remove the complexity in the interleaved access pass).
AArch64 currently supports scalable interleaved groups of factors 2 and 4 from the loop vectorizer. For factor 4 this is currently emitted as a series of recursive [de]interleaves, and normally converted to a target intrinsic in the interleaved access pass.
However if for some reason the interleaved access pass doesn't catch it, the [de]interleave4 intrinsic will need to be lowered by the backend, which this patch adds support for.
Factor 3 will probably be more complicated to lower, so I've left it out for now. We can disable it in the cost model when implementing the loop vectorizer changes.
From 248bff6ea7238e6f657680e671a6e7546a24dc0d Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 14 May 2025 13:15:19 +0100
Subject: [PATCH] [AArch64] Lower vector.[de]interleave4
---
.../Target/AArch64/AArch64ISelLowering.cpp | 57 ++++++++++++++++
.../AArch64/sve-vector-deinterleave.ll | 65 ++++++++++++++++++-
.../CodeGen/AArch64/sve-vector-interleave.ll | 64 ++++++++++++++++++
3 files changed, 185 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4dacd2273306e..08b9f098efb1e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29441,6 +29441,35 @@ AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
EVT OpVT = Op.getValueType();
assert(OpVT.isScalableVector() &&
"Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
+  assert((Op->getNumOperands() == 2 || Op->getNumOperands() == 4) &&
+         "Expected factor to be 2 or 4.");
+
+ // Deinterleave 'ab cd ac bd' as a series of factor 2 deinterleaves.
+ if (Op.getNumOperands() == 4) {
+ SDVTList VTList = DAG.getVTList({OpVT, OpVT});
+ // ac ac
+ SDNode *LHS0 = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTList,
+ Op.getOperand(0), Op.getOperand(1))
+ .getNode();
+ // bd bd
+ SDNode *RHS0 = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTList,
+ Op.getOperand(2), Op.getOperand(3))
+ .getNode();
+ // aa cc
+ SDNode *LHS1 = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTList,
+ SDValue(LHS0, 0), SDValue(RHS0, 0))
+ .getNode();
+ // bb dd
+ SDNode *RHS1 = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTList,
+ SDValue(LHS0, 1), SDValue(RHS0, 1))
+ .getNode();
+
+ // aa bb cc dd
+ return DAG.getMergeValues({SDValue(LHS1, 0), SDValue(RHS1, 0),
+ SDValue(LHS1, 1), SDValue(RHS1, 1)},
+ DL);
+ }
+
SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
Op.getOperand(1));
SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
@@ -29454,6 +29483,34 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
EVT OpVT = Op.getValueType();
assert(OpVT.isScalableVector() &&
"Expected scalable vector in LowerVECTOR_INTERLEAVE.");
+  assert((Op->getNumOperands() == 2 || Op->getNumOperands() == 4) &&
+         "Expected factor to be 2 or 4.");
+
+ // Interleave 'aa bb cc dd' as a series of factor 2 interleaves.
+ if (Op.getNumOperands() == 4) {
+ SDVTList VTList = DAG.getVTList({OpVT, OpVT});
+ // ac ac
+ SDNode *LHS0 = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTList,
+ Op.getOperand(0), Op.getOperand(2))
+ .getNode();
+ // bd bd
+ SDNode *RHS0 = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTList,
+ Op.getOperand(1), Op.getOperand(3))
+ .getNode();
+ // ab cd
+ SDNode *LHS1 = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTList,
+ SDValue(LHS0, 0), SDValue(RHS0, 0))
+ .getNode();
+ // ab cd
+ SDNode *RHS1 = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTList,
+ SDValue(LHS0, 1), SDValue(RHS0, 1))
+ .getNode();
+
+ // ab cd ab cd
+ return DAG.getMergeValues({SDValue(LHS1, 0), SDValue(LHS1, 1),
+ SDValue(RHS1, 0), SDValue(RHS1, 1)},
+ DL);
+ }
SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
Op.getOperand(1));
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index adf1b48b6998a..9a871e20b4b09 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -151,6 +151,70 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv
ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
}
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv64i8(<vscale x 64 x i8> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv64i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uzp1 z4.b, z2.b, z3.b
+; CHECK-NEXT: uzp1 z5.b, z0.b, z1.b
+; CHECK-NEXT: uzp2 z3.b, z2.b, z3.b
+; CHECK-NEXT: uzp2 z6.b, z0.b, z1.b
+; CHECK-NEXT: uzp1 z0.b, z5.b, z4.b
+; CHECK-NEXT: uzp2 z2.b, z5.b, z4.b
+; CHECK-NEXT: uzp1 z1.b, z6.b, z3.b
+; CHECK-NEXT: uzp2 z3.b, z6.b, z3.b
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %vec)
+ ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv32i16(<vscale x 32 x i16> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv32i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uzp1 z4.h, z2.h, z3.h
+; CHECK-NEXT: uzp1 z5.h, z0.h, z1.h
+; CHECK-NEXT: uzp2 z3.h, z2.h, z3.h
+; CHECK-NEXT: uzp2 z6.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.h, z5.h, z4.h
+; CHECK-NEXT: uzp2 z2.h, z5.h, z4.h
+; CHECK-NEXT: uzp1 z1.h, z6.h, z3.h
+; CHECK-NEXT: uzp2 z3.h, z6.h, z3.h
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave4.nxv32i16(<vscale x 32 x i16> %vec)
+ ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv16i32(<vscale x 16 x i32> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv16i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uzp1 z4.s, z2.s, z3.s
+; CHECK-NEXT: uzp1 z5.s, z0.s, z1.s
+; CHECK-NEXT: uzp2 z3.s, z2.s, z3.s
+; CHECK-NEXT: uzp2 z6.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.s, z5.s, z4.s
+; CHECK-NEXT: uzp2 z2.s, z5.s, z4.s
+; CHECK-NEXT: uzp1 z1.s, z6.s, z3.s
+; CHECK-NEXT: uzp2 z3.s, z6.s, z3.s
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> %vec)
+ ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv8i64(<vscale x 8 x i64> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv8i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d
+; CHECK-NEXT: uzp1 z5.d, z0.d, z1.d
+; CHECK-NEXT: uzp2 z3.d, z2.d, z3.d
+; CHECK-NEXT: uzp2 z6.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.d, z5.d, z4.d
+; CHECK-NEXT: uzp2 z2.d, z5.d, z4.d
+; CHECK-NEXT: uzp1 z1.d, z6.d, z3.d
+; CHECK-NEXT: uzp2 z3.d, z6.d, z3.d
+; CHECK-NEXT: ret
+ %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> %vec)
+ ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
; Predicated
define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv32i1(<vscale x 32 x i1> %vec) {
; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1:
@@ -279,7 +343,6 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @vector_deinterleave_nxv2i32_nxv
ret {<vscale x 2 x i32>, <vscale x 2 x i32>} %retval
}
-
; Floating declarations
declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
index 288034422d9c0..990faf0d320e3 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
@@ -146,6 +146,70 @@ define <vscale x 4 x i64> @interleave2_nxv4i64(<vscale x 2 x i64> %vec0, <vscale
ret <vscale x 4 x i64> %retval
}
+define <vscale x 64 x i8> @interleave4_nxv16i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2, <vscale x 16 x i8> %vec3) {
+; CHECK-LABEL: interleave4_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 z4.b, z1.b, z3.b
+; CHECK-NEXT: zip1 z5.b, z0.b, z2.b
+; CHECK-NEXT: zip2 z3.b, z1.b, z3.b
+; CHECK-NEXT: zip2 z6.b, z0.b, z2.b
+; CHECK-NEXT: zip1 z0.b, z5.b, z4.b
+; CHECK-NEXT: zip2 z1.b, z5.b, z4.b
+; CHECK-NEXT: zip1 z2.b, z6.b, z3.b
+; CHECK-NEXT: zip2 z3.b, z6.b, z3.b
+; CHECK-NEXT: ret
+ %retval = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv16i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2, <vscale x 16 x i8> %vec3)
+ ret <vscale x 64 x i8> %retval
+}
+
+define <vscale x 32 x i16> @interleave4_nxv8i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2, <vscale x 8 x i16> %vec3) {
+; CHECK-LABEL: interleave4_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 z4.h, z1.h, z3.h
+; CHECK-NEXT: zip1 z5.h, z0.h, z2.h
+; CHECK-NEXT: zip2 z3.h, z1.h, z3.h
+; CHECK-NEXT: zip2 z6.h, z0.h, z2.h
+; CHECK-NEXT: zip1 z0.h, z5.h, z4.h
+; CHECK-NEXT: zip2 z1.h, z5.h, z4.h
+; CHECK-NEXT: zip1 z2.h, z6.h, z3.h
+; CHECK-NEXT: zip2 z3.h, z6.h, z3.h
+; CHECK-NEXT: ret
+ %retval = call <vscale x 32 x i16> @llvm.vector.interleave4.nxv8i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2, <vscale x 8 x i16> %vec3)
+ ret <vscale x 32 x i16> %retval
+}
+
+define <vscale x 16 x i32> @interleave4_nxv4i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, <vscale x 4 x i32> %vec3) {
+; CHECK-LABEL: interleave4_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 z4.s, z1.s, z3.s
+; CHECK-NEXT: zip1 z5.s, z0.s, z2.s
+; CHECK-NEXT: zip2 z3.s, z1.s, z3.s
+; CHECK-NEXT: zip2 z6.s, z0.s, z2.s
+; CHECK-NEXT: zip1 z0.s, z5.s, z4.s
+; CHECK-NEXT: zip2 z1.s, z5.s, z4.s
+; CHECK-NEXT: zip1 z2.s, z6.s, z3.s
+; CHECK-NEXT: zip2 z3.s, z6.s, z3.s
+; CHECK-NEXT: ret
+ %retval = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv4i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, <vscale x 4 x i32> %vec3)
+ ret <vscale x 16 x i32> %retval
+}
+
+define <vscale x 8 x i64> @interleave4_nxv8i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2, <vscale x 2 x i64> %vec3) {
+; CHECK-LABEL: interleave4_nxv8i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: zip1 z4.d, z1.d, z3.d
+; CHECK-NEXT: zip1 z5.d, z0.d, z2.d
+; CHECK-NEXT: zip2 z3.d, z1.d, z3.d
+; CHECK-NEXT: zip2 z6.d, z0.d, z2.d
+; CHECK-NEXT: zip1 z0.d, z5.d, z4.d
+; CHECK-NEXT: zip2 z1.d, z5.d, z4.d
+; CHECK-NEXT: zip1 z2.d, z6.d, z3.d
+; CHECK-NEXT: zip2 z3.d, z6.d, z3.d
+; CHECK-NEXT: ret
+ %retval = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2, <vscale x 2 x i64> %vec3)
+ ret <vscale x 8 x i64> %retval
+}
+
; Predicated
define <vscale x 32 x i1> @interleave2_nxv32i1(<vscale x 16 x i1> %vec0, <vscale x 16 x i1> %vec1) {
More information about the llvm-commits
mailing list