[llvm] [SelectionDAG][AArch64] Legalize power of 2 vector.[de]interleaveN (PR #141513)

Tue May 27 09:42:16 PDT 2025

https://github.com/lukel97 updated https://github.com/llvm/llvm-project/pull/141513

>From 248bff6ea7238e6f657680e671a6e7546a24dc0d Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 14 May 2025 13:15:19 +0100
Subject: [PATCH 1/3] [AArch64] Lower vector.[de]interleave4

---
 .../Target/AArch64/AArch64ISelLowering.cpp    | 57 ++++++++++++++++
 .../AArch64/sve-vector-deinterleave.ll        | 65 ++++++++++++++++++-
 .../CodeGen/AArch64/sve-vector-interleave.ll  | 64 ++++++++++++++++++
 3 files changed, 185 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 4dacd2273306e..08b9f098efb1e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29441,6 +29441,35 @@ AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
   EVT OpVT = Op.getValueType();
   assert(OpVT.isScalableVector() &&
          "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
+  assert(Op->getNumOperands() == 2 ||
+         Op->getNumOperands() == 4 && "Expected factor to be 2 or 4.");
+
+  // Deinterleave 'ab cd ac bd' as a series of factor 2 deinterleaves.
+  if (Op.getNumOperands() == 4) {
+    SDVTList VTList = DAG.getVTList({OpVT, OpVT});
+    // ac ac
+    SDNode *LHS0 = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTList,
+                               Op.getOperand(0), Op.getOperand(1))
+                       .getNode();
+    // bd bd
+    SDNode *RHS0 = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTList,
+                               Op.getOperand(2), Op.getOperand(3))
+                       .getNode();
+    // aa cc
+    SDNode *LHS1 = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTList,
+                               SDValue(LHS0, 0), SDValue(RHS0, 0))
+                       .getNode();
+    // bb dd
+    SDNode *RHS1 = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTList,
+                               SDValue(LHS0, 1), SDValue(RHS0, 1))
+                       .getNode();
+
+    // aa bb cc dd
+    return DAG.getMergeValues({SDValue(LHS1, 0), SDValue(RHS1, 0),
+                               SDValue(LHS1, 1), SDValue(RHS1, 1)},
+                              DL);
+  }
+
   SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
                              Op.getOperand(1));
   SDValue Odd = DAG.getNode(AArch64ISD::UZP2, DL, OpVT, Op.getOperand(0),
@@ -29454,6 +29483,34 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
   EVT OpVT = Op.getValueType();
   assert(OpVT.isScalableVector() &&
          "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
+  assert(Op->getNumOperands() == 2 ||
+         Op->getNumOperands() == 4 && "Expected factor to be 2 or 4.");
+
+  // Interleave 'aa bb cc dd' as a series of factor 2 interleaves.
+  if (Op.getNumOperands() == 4) {
+    SDVTList VTList = DAG.getVTList({OpVT, OpVT});
+    // ac ac
+    SDNode *LHS0 = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTList,
+                               Op.getOperand(0), Op.getOperand(2))
+                       .getNode();
+    // bd bd
+    SDNode *RHS0 = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTList,
+                               Op.getOperand(1), Op.getOperand(3))
+                       .getNode();
+    // ab cd
+    SDNode *LHS1 = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTList,
+                               SDValue(LHS0, 0), SDValue(RHS0, 0))
+                       .getNode();
+    // ab cd
+    SDNode *RHS1 = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTList,
+                               SDValue(LHS0, 1), SDValue(RHS0, 1))
+                       .getNode();
+
+    // ab cd ab cd
+    return DAG.getMergeValues({SDValue(LHS1, 0), SDValue(LHS1, 1),
+                               SDValue(RHS1, 0), SDValue(RHS1, 1)},
+                              DL);
+  }
 
   SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
                            Op.getOperand(1));
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index adf1b48b6998a..9a871e20b4b09 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -151,6 +151,70 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv
   ret {<vscale x 2 x i64>, <vscale x 2 x i64>} %retval
 }
 
+define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @vector_deinterleave_nxv16i8_nxv64i8(<vscale x 64 x i8> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv16i8_nxv64i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z4.b, z2.b, z3.b
+; CHECK-NEXT:    uzp1 z5.b, z0.b, z1.b
+; CHECK-NEXT:    uzp2 z3.b, z2.b, z3.b
+; CHECK-NEXT:    uzp2 z6.b, z0.b, z1.b
+; CHECK-NEXT:    uzp1 z0.b, z5.b, z4.b
+; CHECK-NEXT:    uzp2 z2.b, z5.b, z4.b
+; CHECK-NEXT:    uzp1 z1.b, z6.b, z3.b
+; CHECK-NEXT:    uzp2 z3.b, z6.b, z3.b
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} @llvm.vector.deinterleave4.nxv64i8(<vscale x 64 x i8> %vec)
+  ret {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>} %retval
+}
+
+define {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @vector_deinterleave_nxv8i16_nxv32i16(<vscale x 32 x i16> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv8i16_nxv32i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z4.h, z2.h, z3.h
+; CHECK-NEXT:    uzp1 z5.h, z0.h, z1.h
+; CHECK-NEXT:    uzp2 z3.h, z2.h, z3.h
+; CHECK-NEXT:    uzp2 z6.h, z0.h, z1.h
+; CHECK-NEXT:    uzp1 z0.h, z5.h, z4.h
+; CHECK-NEXT:    uzp2 z2.h, z5.h, z4.h
+; CHECK-NEXT:    uzp1 z1.h, z6.h, z3.h
+; CHECK-NEXT:    uzp2 z3.h, z6.h, z3.h
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} @llvm.vector.deinterleave4.nxv32i16(<vscale x 32 x i16> %vec)
+  ret {<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>} %retval
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @vector_deinterleave_nxv4i32_nxv16i32(<vscale x 16 x i32> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv4i32_nxv16i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z4.s, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z5.s, z0.s, z1.s
+; CHECK-NEXT:    uzp2 z3.s, z2.s, z3.s
+; CHECK-NEXT:    uzp2 z6.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.s, z5.s, z4.s
+; CHECK-NEXT:    uzp2 z2.s, z5.s, z4.s
+; CHECK-NEXT:    uzp1 z1.s, z6.s, z3.s
+; CHECK-NEXT:    uzp2 z3.s, z6.s, z3.s
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.vector.deinterleave4.nxv16i32(<vscale x 16 x i32> %vec)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %retval
+}
+
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv8i64(<vscale x 8 x i64> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z4.d, z2.d, z3.d
+; CHECK-NEXT:    uzp1 z5.d, z0.d, z1.d
+; CHECK-NEXT:    uzp2 z3.d, z2.d, z3.d
+; CHECK-NEXT:    uzp2 z6.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z0.d, z5.d, z4.d
+; CHECK-NEXT:    uzp2 z2.d, z5.d, z4.d
+; CHECK-NEXT:    uzp1 z1.d, z6.d, z3.d
+; CHECK-NEXT:    uzp2 z3.d, z6.d, z3.d
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave4.nxv8i64(<vscale x 8 x i64> %vec)
+  ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
 ; Predicated
 define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv32i1(<vscale x 32 x i1> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1:
@@ -279,7 +343,6 @@ define {<vscale x 2 x i32>, <vscale x 2 x i32>} @vector_deinterleave_nxv2i32_nxv
   ret {<vscale x 2 x i32>, <vscale x 2 x i32>} %retval
 }
 
-
 ; Floating declarations
 declare {<vscale x 2 x half>,<vscale x 2 x half>} @llvm.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
 declare {<vscale x 4 x half>, <vscale x 4 x half>} @llvm.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
index 288034422d9c0..990faf0d320e3 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
@@ -146,6 +146,70 @@ define <vscale x 4 x i64> @interleave2_nxv4i64(<vscale x 2 x i64> %vec0, <vscale
   ret <vscale x 4 x i64> %retval
 }
 
+define <vscale x 64 x i8> @interleave4_nxv16i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2, <vscale x 16 x i8> %vec3) {
+; CHECK-LABEL: interleave4_nxv16i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 z4.b, z1.b, z3.b
+; CHECK-NEXT:    zip1 z5.b, z0.b, z2.b
+; CHECK-NEXT:    zip2 z3.b, z1.b, z3.b
+; CHECK-NEXT:    zip2 z6.b, z0.b, z2.b
+; CHECK-NEXT:    zip1 z0.b, z5.b, z4.b
+; CHECK-NEXT:    zip2 z1.b, z5.b, z4.b
+; CHECK-NEXT:    zip1 z2.b, z6.b, z3.b
+; CHECK-NEXT:    zip2 z3.b, z6.b, z3.b
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 64 x i8> @llvm.vector.interleave4.nxv16i8(<vscale x 16 x i8> %vec0, <vscale x 16 x i8> %vec1, <vscale x 16 x i8> %vec2, <vscale x 16 x i8> %vec3)
+  ret <vscale x 64 x i8> %retval
+}
+
+define <vscale x 32 x i16> @interleave4_nxv8i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2, <vscale x 8 x i16> %vec3) {
+; CHECK-LABEL: interleave4_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 z4.h, z1.h, z3.h
+; CHECK-NEXT:    zip1 z5.h, z0.h, z2.h
+; CHECK-NEXT:    zip2 z3.h, z1.h, z3.h
+; CHECK-NEXT:    zip2 z6.h, z0.h, z2.h
+; CHECK-NEXT:    zip1 z0.h, z5.h, z4.h
+; CHECK-NEXT:    zip2 z1.h, z5.h, z4.h
+; CHECK-NEXT:    zip1 z2.h, z6.h, z3.h
+; CHECK-NEXT:    zip2 z3.h, z6.h, z3.h
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 32 x i16> @llvm.vector.interleave4.nxv8i16(<vscale x 8 x i16> %vec0, <vscale x 8 x i16> %vec1, <vscale x 8 x i16> %vec2, <vscale x 8 x i16> %vec3)
+  ret <vscale x 32 x i16> %retval
+}
+
+define <vscale x 16 x i32> @interleave4_nxv4i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, <vscale x 4 x i32> %vec3) {
+; CHECK-LABEL: interleave4_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 z4.s, z1.s, z3.s
+; CHECK-NEXT:    zip1 z5.s, z0.s, z2.s
+; CHECK-NEXT:    zip2 z3.s, z1.s, z3.s
+; CHECK-NEXT:    zip2 z6.s, z0.s, z2.s
+; CHECK-NEXT:    zip1 z0.s, z5.s, z4.s
+; CHECK-NEXT:    zip2 z1.s, z5.s, z4.s
+; CHECK-NEXT:    zip1 z2.s, z6.s, z3.s
+; CHECK-NEXT:    zip2 z3.s, z6.s, z3.s
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 16 x i32> @llvm.vector.interleave4.nxv4i32(<vscale x 4 x i32> %vec0, <vscale x 4 x i32> %vec1, <vscale x 4 x i32> %vec2, <vscale x 4 x i32> %vec3)
+  ret <vscale x 16 x i32> %retval
+}
+
+define <vscale x 8 x i64> @interleave4_nxv8i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2, <vscale x 2 x i64> %vec3) {
+; CHECK-LABEL: interleave4_nxv8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 z4.d, z1.d, z3.d
+; CHECK-NEXT:    zip1 z5.d, z0.d, z2.d
+; CHECK-NEXT:    zip2 z3.d, z1.d, z3.d
+; CHECK-NEXT:    zip2 z6.d, z0.d, z2.d
+; CHECK-NEXT:    zip1 z0.d, z5.d, z4.d
+; CHECK-NEXT:    zip2 z1.d, z5.d, z4.d
+; CHECK-NEXT:    zip1 z2.d, z6.d, z3.d
+; CHECK-NEXT:    zip2 z3.d, z6.d, z3.d
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 8 x i64> @llvm.vector.interleave4.nxv8i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2, <vscale x 2 x i64> %vec3)
+  ret <vscale x 8 x i64> %retval
+}
+
 ; Predicated
 
 define <vscale x 32 x i1> @interleave2_nxv32i1(<vscale x 16 x i1> %vec0, <vscale x 16 x i1> %vec1) {

>From c2e329d4aee3766e988f39d6374d129ca37ab79e Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 27 May 2025 17:36:11 +0100
Subject: [PATCH 2/3] Move to LegalizeDAG and handle arbitrary pow-of-2 factors

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 56 +++++++++++++++++
 .../Target/AArch64/AArch64ISelLowering.cpp    | 62 ++-----------------
 .../AArch64/sve-vector-deinterleave.ll        | 32 ++++++++++
 .../CodeGen/AArch64/sve-vector-interleave.ll  | 32 ++++++++++
 4 files changed, 126 insertions(+), 56 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 528c07cc5549d..412bf087251e8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3558,6 +3558,62 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     Results.push_back(TLI.expandVectorSplice(Node, DAG));
     break;
   }
+  case ISD::VECTOR_DEINTERLEAVE: {
+    unsigned Factor = Node->getNumOperands();
+    if (Factor <= 2 || !isPowerOf2_32(Factor))
+      break;
+    SmallVector<SDValue, 8> Ops;
+    for (SDValue Op : Node->ops())
+      Ops.push_back(Op);
+    EVT VecVT = Node->getValueType(0);
+    SmallVector<EVT> HalfVTs(Factor / 2, VecVT);
+    // Deinterleave at Factor/2 so each result contains two factors interleaved:
+    // ab cd ab cd -> [ac bd] [ac bd]
+    SDValue L = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, dl, HalfVTs,
+                            ArrayRef(Ops).take_front(Factor / 2));
+    SDValue R = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, dl, HalfVTs,
+                            ArrayRef(Ops).take_back(Factor / 2));
+    Results.resize(Factor);
+    // Deinterleave the 2 factors out:
+    // [ac ac] [bd bd] -> aa bb cc dd
+    for (unsigned I = 0; I < Factor / 2; I++) {
+      SDValue Deinterleave =
+          DAG.getNode(ISD::VECTOR_DEINTERLEAVE, dl, {VecVT, VecVT},
+                      {L.getValue(I), R.getValue(I)});
+      Results[I] = Deinterleave.getValue(0);
+      Results[I + Factor / 2] = Deinterleave.getValue(1);
+    }
+    break;
+  }
+  case ISD::VECTOR_INTERLEAVE: {
+    unsigned Factor = Node->getNumOperands();
+    if (Factor <= 2 || !isPowerOf2_32(Factor))
+      break;
+    SmallVector<SDValue, 8> Ops;
+    for (SDValue Op : Node->ops())
+      Ops.push_back(Op);
+    EVT VecVT = Node->getValueType(0);
+    SmallVector<EVT> HalfVTs(Factor / 2, VecVT);
+    SmallVector<SDValue, 8> LOps, ROps;
+    // Interleave so we have 2 factors per result:
+    // aa bb cc dd -> [ac bd] [ac bd]
+    for (unsigned I = 0; I < Factor / 2; I++) {
+      SDValue Interleave =
+          DAG.getNode(ISD::VECTOR_INTERLEAVE, dl, {VecVT, VecVT},
+                      {Ops[I], Ops[I + Factor / 2]});
+      LOps.push_back(Interleave.getValue(0));
+      ROps.push_back(Interleave.getValue(1));
+    }
+    // Interleave at Factor/2:
+    // [ac bd] [ac bd] -> ab cd ab cd
+    SDValue L = DAG.getNode(ISD::VECTOR_INTERLEAVE, dl, HalfVTs, LOps);
+    SDValue R = DAG.getNode(ISD::VECTOR_INTERLEAVE, dl, HalfVTs, ROps);
+    for (unsigned I = 0; I < Factor / 2; I++)
+      Results.push_back(L.getValue(I));
+    for (unsigned I = 0; I < Factor / 2; I++)
+      Results.push_back(R.getValue(I));
+    break;
+  }
   case ISD::EXTRACT_ELEMENT: {
     EVT OpTy = Node->getOperand(0).getValueType();
     if (Node->getConstantOperandVal(1)) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 08b9f098efb1e..269716e44dbc8 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29441,34 +29441,9 @@ AArch64TargetLowering::LowerVECTOR_DEINTERLEAVE(SDValue Op,
   EVT OpVT = Op.getValueType();
   assert(OpVT.isScalableVector() &&
          "Expected scalable vector in LowerVECTOR_DEINTERLEAVE.");
-  assert(Op->getNumOperands() == 2 ||
-         Op->getNumOperands() == 4 && "Expected factor to be 2 or 4.");
-
-  // Deinterleave 'ab cd ac bd' as a series of factor 2 deinterleaves.
-  if (Op.getNumOperands() == 4) {
-    SDVTList VTList = DAG.getVTList({OpVT, OpVT});
-    // ac ac
-    SDNode *LHS0 = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTList,
-                               Op.getOperand(0), Op.getOperand(1))
-                       .getNode();
-    // bd bd
-    SDNode *RHS0 = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTList,
-                               Op.getOperand(2), Op.getOperand(3))
-                       .getNode();
-    // aa cc
-    SDNode *LHS1 = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTList,
-                               SDValue(LHS0, 0), SDValue(RHS0, 0))
-                       .getNode();
-    // bb dd
-    SDNode *RHS1 = DAG.getNode(ISD::VECTOR_DEINTERLEAVE, DL, VTList,
-                               SDValue(LHS0, 1), SDValue(RHS0, 1))
-                       .getNode();
-
-    // aa bb cc dd
-    return DAG.getMergeValues({SDValue(LHS1, 0), SDValue(RHS1, 0),
-                               SDValue(LHS1, 1), SDValue(RHS1, 1)},
-                              DL);
-  }
+
+  if (Op->getNumOperands() != 2)
+    return SDValue();
 
   SDValue Even = DAG.getNode(AArch64ISD::UZP1, DL, OpVT, Op.getOperand(0),
                              Op.getOperand(1));
@@ -29483,34 +29458,9 @@ SDValue AArch64TargetLowering::LowerVECTOR_INTERLEAVE(SDValue Op,
   EVT OpVT = Op.getValueType();
   assert(OpVT.isScalableVector() &&
          "Expected scalable vector in LowerVECTOR_INTERLEAVE.");
-  assert(Op->getNumOperands() == 2 ||
-         Op->getNumOperands() == 4 && "Expected factor to be 2 or 4.");
-
-  // Interleave 'aa bb cc dd' as a series of factor 2 interleaves.
-  if (Op.getNumOperands() == 4) {
-    SDVTList VTList = DAG.getVTList({OpVT, OpVT});
-    // ac ac
-    SDNode *LHS0 = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTList,
-                               Op.getOperand(0), Op.getOperand(2))
-                       .getNode();
-    // bd bd
-    SDNode *RHS0 = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTList,
-                               Op.getOperand(1), Op.getOperand(3))
-                       .getNode();
-    // ab cd
-    SDNode *LHS1 = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTList,
-                               SDValue(LHS0, 0), SDValue(RHS0, 0))
-                       .getNode();
-    // ab cd
-    SDNode *RHS1 = DAG.getNode(ISD::VECTOR_INTERLEAVE, DL, VTList,
-                               SDValue(LHS0, 1), SDValue(RHS0, 1))
-                       .getNode();
-
-    // ab cd ab cd
-    return DAG.getMergeValues({SDValue(LHS1, 0), SDValue(LHS1, 1),
-                               SDValue(RHS1, 0), SDValue(RHS1, 1)},
-                              DL);
-  }
+
+  if (Op->getNumOperands() != 2)
+    return SDValue();
 
   SDValue Lo = DAG.getNode(AArch64ISD::ZIP1, DL, OpVT, Op.getOperand(0),
                            Op.getOperand(1));
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
index 9a871e20b4b09..89fc10b47bb35 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-deinterleave.ll
@@ -215,6 +215,38 @@ define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2
   ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
 }
 
+define {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @vector_deinterleave_nxv2i64_nxv16i64(<vscale x 16 x i64> %vec) {
+; CHECK-LABEL: vector_deinterleave_nxv2i64_nxv16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp1 z24.d, z6.d, z7.d
+; CHECK-NEXT:    uzp1 z25.d, z4.d, z5.d
+; CHECK-NEXT:    uzp1 z26.d, z2.d, z3.d
+; CHECK-NEXT:    uzp1 z27.d, z0.d, z1.d
+; CHECK-NEXT:    uzp2 z6.d, z6.d, z7.d
+; CHECK-NEXT:    uzp2 z4.d, z4.d, z5.d
+; CHECK-NEXT:    uzp2 z2.d, z2.d, z3.d
+; CHECK-NEXT:    uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT:    uzp1 z5.d, z25.d, z24.d
+; CHECK-NEXT:    uzp2 z24.d, z25.d, z24.d
+; CHECK-NEXT:    uzp1 z7.d, z27.d, z26.d
+; CHECK-NEXT:    uzp1 z28.d, z4.d, z6.d
+; CHECK-NEXT:    uzp2 z25.d, z27.d, z26.d
+; CHECK-NEXT:    uzp1 z29.d, z0.d, z2.d
+; CHECK-NEXT:    uzp2 z26.d, z4.d, z6.d
+; CHECK-NEXT:    uzp2 z27.d, z0.d, z2.d
+; CHECK-NEXT:    uzp1 z0.d, z7.d, z5.d
+; CHECK-NEXT:    uzp1 z2.d, z25.d, z24.d
+; CHECK-NEXT:    uzp2 z4.d, z7.d, z5.d
+; CHECK-NEXT:    uzp1 z1.d, z29.d, z28.d
+; CHECK-NEXT:    uzp1 z3.d, z27.d, z26.d
+; CHECK-NEXT:    uzp2 z5.d, z29.d, z28.d
+; CHECK-NEXT:    uzp2 z6.d, z25.d, z24.d
+; CHECK-NEXT:    uzp2 z7.d, z27.d, z26.d
+; CHECK-NEXT:    ret
+  %retval = call {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} @llvm.vector.deinterleave8.nxv16i64(<vscale x 16 x i64> %vec)
+  ret {<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>} %retval
+}
+
 ; Predicated
 define {<vscale x 16 x i1>, <vscale x 16 x i1>} @vector_deinterleave_nxv16i1_nxv32i1(<vscale x 32 x i1> %vec) {
 ; CHECK-LABEL: vector_deinterleave_nxv16i1_nxv32i1:
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
index 990faf0d320e3..34d026f43708c 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-interleave.ll
@@ -210,6 +210,38 @@ define <vscale x 8 x i64> @interleave4_nxv8i64(<vscale x 2 x i64> %vec0, <vscale
   ret <vscale x 8 x i64> %retval
 }
 
+define <vscale x 16 x i64> @interleave8_nxv16i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2, <vscale x 2 x i64> %vec3, <vscale x 2 x i64> %vec4, <vscale x 2 x i64> %vec5, <vscale x 2 x i64> %vec6, <vscale x 2 x i64> %vec7) {
+; CHECK-LABEL: interleave8_nxv16i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    zip1 z24.d, z3.d, z7.d
+; CHECK-NEXT:    zip1 z25.d, z1.d, z5.d
+; CHECK-NEXT:    zip1 z26.d, z2.d, z6.d
+; CHECK-NEXT:    zip1 z27.d, z0.d, z4.d
+; CHECK-NEXT:    zip2 z3.d, z3.d, z7.d
+; CHECK-NEXT:    zip2 z1.d, z1.d, z5.d
+; CHECK-NEXT:    zip2 z2.d, z2.d, z6.d
+; CHECK-NEXT:    zip2 z0.d, z0.d, z4.d
+; CHECK-NEXT:    zip1 z4.d, z25.d, z24.d
+; CHECK-NEXT:    zip2 z6.d, z25.d, z24.d
+; CHECK-NEXT:    zip1 z5.d, z27.d, z26.d
+; CHECK-NEXT:    zip2 z7.d, z27.d, z26.d
+; CHECK-NEXT:    zip1 z24.d, z1.d, z3.d
+; CHECK-NEXT:    zip1 z25.d, z0.d, z2.d
+; CHECK-NEXT:    zip2 z26.d, z1.d, z3.d
+; CHECK-NEXT:    zip2 z27.d, z0.d, z2.d
+; CHECK-NEXT:    zip1 z0.d, z5.d, z4.d
+; CHECK-NEXT:    zip2 z1.d, z5.d, z4.d
+; CHECK-NEXT:    zip1 z2.d, z7.d, z6.d
+; CHECK-NEXT:    zip2 z3.d, z7.d, z6.d
+; CHECK-NEXT:    zip1 z4.d, z25.d, z24.d
+; CHECK-NEXT:    zip2 z5.d, z25.d, z24.d
+; CHECK-NEXT:    zip1 z6.d, z27.d, z26.d
+; CHECK-NEXT:    zip2 z7.d, z27.d, z26.d
+; CHECK-NEXT:    ret
+  %retval = call <vscale x 16 x i64> @llvm.vector.interleave8.nxv16i64(<vscale x 2 x i64> %vec0, <vscale x 2 x i64> %vec1, <vscale x 2 x i64> %vec2, <vscale x 2 x i64> %vec3, <vscale x 2 x i64> %vec4, <vscale x 2 x i64> %vec5, <vscale x 2 x i64> %vec6, <vscale x 2 x i64> %vec7)
+  ret <vscale x 16 x i64> %retval
+}
+
 ; Predicated
 
 define <vscale x 32 x i1> @interleave2_nxv32i1(<vscale x 16 x i1> %vec0, <vscale x 16 x i1> %vec1) {

>From 02bba74e00aa56b8603bb043ca5994a91584b833 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Tue, 27 May 2025 17:41:56 +0100
Subject: [PATCH 3/3] Remove unnecessary Ops vector

---
 llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 412bf087251e8..dfe7b5cecbb19 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -3589,9 +3589,6 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     unsigned Factor = Node->getNumOperands();
     if (Factor <= 2 || !isPowerOf2_32(Factor))
       break;
-    SmallVector<SDValue, 8> Ops;
-    for (SDValue Op : Node->ops())
-      Ops.push_back(Op);
     EVT VecVT = Node->getValueType(0);
     SmallVector<EVT> HalfVTs(Factor / 2, VecVT);
     SmallVector<SDValue, 8> LOps, ROps;
@@ -3600,7 +3597,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
     for (unsigned I = 0; I < Factor / 2; I++) {
       SDValue Interleave =
           DAG.getNode(ISD::VECTOR_INTERLEAVE, dl, {VecVT, VecVT},
-                      {Ops[I], Ops[I + Factor / 2]});
+                      {Node->getOperand(I), Node->getOperand(I + Factor / 2)});
       LOps.push_back(Interleave.getValue(0));
       ROps.push_back(Interleave.getValue(1));
     }