[llvm] [AArch64] Support symmetric complex deinterleaving with higher factors (PR #151295)
David Sherwood via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 30 01:53:50 PDT 2025
https://github.com/david-arm created https://github.com/llvm/llvm-project/pull/151295
For loops such as this:

  struct foo {
    double a, b;
  };

  void foo(struct foo *dst, struct foo *src, int n) {
    for (int i = 0; i < n; i++) {
      dst[i].a += src[i].a * 3.2;
      dst[i].b += src[i].b * 3.2;
    }
  }
the complex deinterleaving pass will spot that the deinterleaving
associated with the structured loads cancels out the interleaving
associated with the structured stores. This happens even though the
values are not truly "complex" numbers, because the pass can handle
symmetric operations too. This is great because it means we can then
use normal loads and stores instead. However, we can also do the
same for higher interleave factors, e.g. 4:
  struct foo {
    double a, b, c, d;
  };

  void foo(struct foo *dst, struct foo *src, int n) {
    for (int i = 0; i < n; i++) {
      dst[i].a += src[i].a * 3.2;
      dst[i].b += src[i].b * 3.2;
      dst[i].c += src[i].c * 3.2;
      dst[i].d += src[i].d * 3.2;
    }
  }
This PR extends the pass to effectively treat such structures as
a set of complex numbers, i.e.

  struct foo_alt {
    std::complex<double> x, y;
  };

with equivalence between members:

  foo_alt.x.real == foo.a
  foo_alt.x.imag == foo.b
  foo_alt.y.real == foo.c
  foo_alt.y.imag == foo.d
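
At the IR level this pairing corresponds to matching a factor-4
deinterleave whose results are consumed by symmetric operations and
then re-interleaved. A minimal sketch (essentially the
simple_symmetric_unary4 test added below; function and value names
are illustrative):

  define <vscale x 8 x double> @negate_struct_foo(<vscale x 8 x double> %wide) {
    ; Split the interleaved {a,b,c,d} lanes apart (factor 4).
    %dv = call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave4.nxv8f64(<vscale x 8 x double> %wide)
    %a = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %dv, 0 ; foo_alt.x.real
    %b = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %dv, 1 ; foo_alt.x.imag
    %c = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %dv, 2 ; foo_alt.y.real
    %d = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %dv, 3 ; foo_alt.y.imag
    ; Any symmetric operation applied uniformly to every lane...
    %na = fneg fast <vscale x 2 x double> %a
    %nb = fneg fast <vscale x 2 x double> %b
    %nc = fneg fast <vscale x 2 x double> %c
    %nd = fneg fast <vscale x 2 x double> %d
    ; ...lets the pass cancel this interleave against the deinterleave above.
    %iv = call <vscale x 8 x double> @llvm.vector.interleave4.nxv8f64(<vscale x 2 x double> %na, <vscale x 2 x double> %nb, <vscale x 2 x double> %nc, <vscale x 2 x double> %nd)
    ret <vscale x 8 x double> %iv
  }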
I've written the code to handle sets with arbitrary numbers of
complex values, but since we only support interleave factors
between 2 and 4 I've restricted the sets to 1 or 2 complex
numbers. Also, for now I've restricted support for an interleave
factor of 4 to purely symmetric operations; it could later be
extended to handle complex multiplications, reductions, etc.
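
The effect is easiest to see through the new tests, which run llc
directly, e.g. from llvm/test/CodeGen/AArch64/:

  llc < complex-deinterleaving-symmetric-scalable.ll --mattr=+complxnum,+sve,+fullfp16 -o -

With patch 2 applied, simple_symmetric_unary4 compiles down to four
plain fnegs, with all of the uzp/zip shuffles gone (see the updated
CHECK lines at the end of the diff).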
Fixes: https://github.com/llvm/llvm-project/issues/144795
From 38de6dc8609ca5c204c0f27602a380bdaacb5755 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 30 Jul 2025 08:45:36 +0000
Subject: [PATCH 1/2] Add tests
---
...plex-deinterleaving-reductions-scalable.ll | 110 +++++++++++++
.../complex-deinterleaving-symmetric-fixed.ll | 76 +++++++++
...mplex-deinterleaving-symmetric-scalable.ll | 150 ++++++++++++++++++
3 files changed, 336 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-symmetric-fixed.ll
create mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-symmetric-scalable.ll
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
index 29be231920305..1b4b8ba1a9ae8 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll
@@ -380,6 +380,116 @@ middle.block: ; preds = %vector.body
}
+; Zero initialized double reduction
+; struct foo {
+;   complex<double> v1, v2;
+; };
+;
+; complex<double> foo(struct foo *a, struct foo *b, int n) {
+;   complex<double> x = 0.0 + 0.0i;
+;   complex<double> y = 0.0 + 0.0i;
+;   for (int i = 0; i < n; i++) {
+;     struct foo t1 = a[i];
+;     struct foo t2 = b[i];
+;     x += t1.v1 * t2.v1;
+;     y += t1.v2 * t2.v2;
+;   }
+;   return x + y;
+; }
+%struct.foo2 = type { %"class.std::complex", %"class.std::complex" }
+
+define %"class.std::complex" @double_complex_mul_v2f64(ptr noundef readonly captures(none) %src1, ptr noundef readonly captures(none) %src2, i64 noundef %nvec) {
+; CHECK-LABEL: double_complex_mul_v2f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: movi v2.2d, #0000000000000000
+; CHECK-NEXT: cntd x8
+; CHECK-NEXT: movi v1.2d, #0000000000000000
+; CHECK-NEXT: movi v3.2d, #0000000000000000
+; CHECK-NEXT: rdvl x9, #4
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: .LBB4_1: // %vector.body
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: ld4d { z4.d - z7.d }, p0/z, [x0]
+; CHECK-NEXT: subs x2, x2, x8
+; CHECK-NEXT: add x0, x0, x9
+; CHECK-NEXT: ld4d { z16.d - z19.d }, p0/z, [x1]
+; CHECK-NEXT: add x1, x1, x9
+; CHECK-NEXT: fmla z2.d, p0/m, z16.d, z4.d
+; CHECK-NEXT: fmla z0.d, p0/m, z16.d, z5.d
+; CHECK-NEXT: fmla z3.d, p0/m, z18.d, z6.d
+; CHECK-NEXT: fmla z1.d, p0/m, z18.d, z7.d
+; CHECK-NEXT: fmls z2.d, p0/m, z5.d, z17.d
+; CHECK-NEXT: fmla z0.d, p0/m, z17.d, z4.d
+; CHECK-NEXT: fmls z3.d, p0/m, z7.d, z19.d
+; CHECK-NEXT: fmla z1.d, p0/m, z19.d, z6.d
+; CHECK-NEXT: b.ne .LBB4_1
+; CHECK-NEXT: // %bb.2: // %middle.block
+; CHECK-NEXT: fadd z2.d, z2.d, z3.d
+; CHECK-NEXT: fadd z1.d, z0.d, z1.d
+; CHECK-NEXT: faddv d0, p0, z2.d
+; CHECK-NEXT: faddv d1, p0, z1.d
+; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
+; CHECK-NEXT: // kill: def $d1 killed $d1 killed $z1
+; CHECK-NEXT: ret
+entry:
+ %vscale = tail call i64 @llvm.vscale.i64()
+ %inc = shl nuw nsw i64 %vscale, 1
+ br label %vector.body
+
+vector.body:
+ %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+ %vec.phi1 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %vec.phi1.next, %vector.body ]
+ %vec.phi2 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %vec.phi2.next, %vector.body ]
+ %vec.phi3 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %vec.phi3.next, %vector.body ]
+ %vec.phi4 = phi <vscale x 2 x double> [ zeroinitializer, %entry ], [ %vec.phi4.next, %vector.body ]
+ %gep1 = getelementptr inbounds nuw %struct.foo2, ptr %src1, i64 %index
+ %wide.vec = load <vscale x 8 x double>, ptr %gep1, align 8
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave4.nxv8f64(<vscale x 8 x double> %wide.vec)
+ %ext00 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %ext01 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %ext02 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 2
+ %ext03 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 3
+ %gep2 = getelementptr inbounds nuw %struct.foo2, ptr %src2, i64 %index
+ %wide.vec73 = load <vscale x 8 x double>, ptr %gep2, align 8
+ %strided.vec74 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave4.nxv8f64(<vscale x 8 x double> %wide.vec73)
+ %ext10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec74, 0
+ %ext11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec74, 1
+ %ext12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec74, 2
+ %ext13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec74, 3
+ %fmul1 = fmul fast <vscale x 2 x double> %ext10, %ext00
+ %fmul2 = fmul fast <vscale x 2 x double> %ext11, %ext00
+ %fmul3 = fmul fast <vscale x 2 x double> %ext10, %ext01
+ %fadd1 = fadd fast <vscale x 2 x double> %fmul1, %vec.phi2
+ %fmul4 = fmul fast <vscale x 2 x double> %ext01, %ext11
+ %vec.phi2.next = fsub fast <vscale x 2 x double> %fadd1, %fmul4
+ %fadd2 = fadd fast <vscale x 2 x double> %fmul3, %vec.phi1
+ %vec.phi1.next = fadd fast <vscale x 2 x double> %fadd2, %fmul2
+ %fmul5 = fmul fast <vscale x 2 x double> %ext12, %ext02
+ %fmul6 = fmul fast <vscale x 2 x double> %ext13, %ext02
+ %fmul7 = fmul fast <vscale x 2 x double> %ext12, %ext03
+ %fadd3 = fadd fast <vscale x 2 x double> %fmul5, %vec.phi4
+ %fmul8 = fmul fast <vscale x 2 x double> %ext03, %ext13
+ %vec.phi4.next = fsub fast <vscale x 2 x double> %fadd3, %fmul8
+ %fadd4 = fadd fast <vscale x 2 x double> %fmul7, %vec.phi3
+ %vec.phi3.next = fadd fast <vscale x 2 x double> %fadd4, %fmul6
+ %index.next = add nuw i64 %index, %inc
+ %cmp = icmp eq i64 %index.next, %nvec
+ br i1 %cmp, label %middle.block, label %vector.body
+
+middle.block:
+ %final1 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double 0.000000e+00, <vscale x 2 x double> %vec.phi1.next)
+ %final2 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double 0.000000e+00, <vscale x 2 x double> %vec.phi2.next)
+ %final3 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double 0.000000e+00, <vscale x 2 x double> %vec.phi3.next)
+ %final4 = tail call fast double @llvm.vector.reduce.fadd.nxv2f64(double 0.000000e+00, <vscale x 2 x double> %vec.phi4.next)
+ %last_fadd1 = fadd fast double %final2, %final4
+ %last_fadd2 = fadd fast double %final1, %final3
+ %.fca.0.0.insert = insertvalue %"class.std::complex" poison, double %last_fadd1, 0, 0
+ %.fca.0.1.insert = insertvalue %"class.std::complex" %.fca.0.0.insert, double %last_fadd2, 0, 1
+ ret %"class.std::complex" %.fca.0.1.insert
+}
+
+
declare i64 @llvm.vscale.i64()
declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
declare double @llvm.vector.reduce.fadd.nxv2f64(double, <vscale x 2 x double>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-symmetric-fixed.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-symmetric-fixed.ll
new file mode 100644
index 0000000000000..d05b9c6d7662a
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-symmetric-fixed.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+complxnum,+sve,+fullfp16 -o - | FileCheck %s
+
+target triple = "aarch64"
+
+define <4 x double> @simple_symmetric_muladd2(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: simple_symmetric_muladd2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, #-7378697629483820647 // =0x9999999999999999
+; CHECK-NEXT: movk x8, #39322
+; CHECK-NEXT: movk x8, #16393, lsl #48
+; CHECK-NEXT: dup v4.2d, x8
+; CHECK-NEXT: fmla v2.2d, v4.2d, v0.2d
+; CHECK-NEXT: fmla v3.2d, v4.2d, v1.2d
+; CHECK-NEXT: mov v0.16b, v2.16b
+; CHECK-NEXT: mov v1.16b, v3.16b
+; CHECK-NEXT: ret
+entry:
+ %ext00 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %ext01 = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %fmul0 = fmul fast <2 x double> %ext00, splat (double 3.200000e+00)
+ %ext10 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+ %ext11 = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+ %fadd0 = fadd fast <2 x double> %ext10, %fmul0
+ %fmul1 = fmul fast <2 x double> %ext01, splat (double 3.200000e+00)
+ %fadd1 = fadd fast <2 x double> %ext11, %fmul1
+ %interleaved.vec = shufflevector <2 x double> %fadd0, <2 x double> %fadd1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+ ret <4 x double> %interleaved.vec
+}
+
+define <8 x double> @simple_symmetric_muladd4(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: simple_symmetric_muladd4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov x8, #-7378697629483820647 // =0x9999999999999999
+; CHECK-NEXT: zip1 v16.2d, v0.2d, v2.2d
+; CHECK-NEXT: zip2 v0.2d, v0.2d, v2.2d
+; CHECK-NEXT: movk x8, #39322
+; CHECK-NEXT: zip1 v2.2d, v1.2d, v3.2d
+; CHECK-NEXT: zip2 v1.2d, v1.2d, v3.2d
+; CHECK-NEXT: movk x8, #16393, lsl #48
+; CHECK-NEXT: zip1 v3.2d, v4.2d, v6.2d
+; CHECK-NEXT: zip2 v4.2d, v4.2d, v6.2d
+; CHECK-NEXT: zip1 v17.2d, v5.2d, v7.2d
+; CHECK-NEXT: zip2 v5.2d, v5.2d, v7.2d
+; CHECK-NEXT: dup v6.2d, x8
+; CHECK-NEXT: fmla v3.2d, v6.2d, v16.2d
+; CHECK-NEXT: fmla v4.2d, v6.2d, v0.2d
+; CHECK-NEXT: fmla v17.2d, v6.2d, v2.2d
+; CHECK-NEXT: fmla v5.2d, v6.2d, v1.2d
+; CHECK-NEXT: zip1 v0.2d, v3.2d, v4.2d
+; CHECK-NEXT: zip2 v2.2d, v3.2d, v4.2d
+; CHECK-NEXT: zip1 v1.2d, v17.2d, v5.2d
+; CHECK-NEXT: zip2 v3.2d, v17.2d, v5.2d
+; CHECK-NEXT: ret
+entry:
+ %ext00 = shufflevector <8 x double> %a, <8 x double> poison, <2 x i32> <i32 0, i32 4>
+ %ext01 = shufflevector <8 x double> %a, <8 x double> poison, <2 x i32> <i32 1, i32 5>
+ %ext02 = shufflevector <8 x double> %a, <8 x double> poison, <2 x i32> <i32 2, i32 6>
+ %ext03 = shufflevector <8 x double> %a, <8 x double> poison, <2 x i32> <i32 3, i32 7>
+ %fmul0 = fmul fast <2 x double> %ext00, splat (double 3.200000e+00)
+ %ext10 = shufflevector <8 x double> %b, <8 x double> poison, <2 x i32> <i32 0, i32 4>
+ %ext11 = shufflevector <8 x double> %b, <8 x double> poison, <2 x i32> <i32 1, i32 5>
+ %ext12 = shufflevector <8 x double> %b, <8 x double> poison, <2 x i32> <i32 2, i32 6>
+ %ext13 = shufflevector <8 x double> %b, <8 x double> poison, <2 x i32> <i32 3, i32 7>
+ %fadd0 = fadd fast <2 x double> %ext10, %fmul0
+ %fmul1 = fmul fast <2 x double> %ext01, splat (double 3.200000e+00)
+ %fadd1 = fadd fast <2 x double> %ext11, %fmul1
+ %fmul2 = fmul fast <2 x double> %ext02, splat (double 3.200000e+00)
+ %fadd2 = fadd fast <2 x double> %ext12, %fmul2
+ %fmul3 = fmul fast <2 x double> %ext03, splat (double 3.200000e+00)
+ %fadd3 = fadd fast <2 x double> %ext13, %fmul3
+ %interleave.pt1 = shufflevector <2 x double> %fadd0, <2 x double> %fadd1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %interleave.pt2 = shufflevector <2 x double> %fadd2, <2 x double> %fadd3, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %interleaved.vec = shufflevector <4 x double> %interleave.pt1, <4 x double> %interleave.pt2, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+ ret <8 x double> %interleaved.vec
+}
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-symmetric-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-symmetric-scalable.ll
new file mode 100644
index 0000000000000..5d22120d7513e
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-symmetric-scalable.ll
@@ -0,0 +1,150 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+complxnum,+sve,+fullfp16 -o - | FileCheck %s
+
+target triple = "aarch64"
+
+define <vscale x 4 x double> @simple_symmetric_muladd2(<vscale x 4 x double> %a, <vscale x 4 x double> %b) {
+; CHECK-LABEL: simple_symmetric_muladd2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: adrp x8, .LCPI0_0
+; CHECK-NEXT: add x8, x8, :lo12:.LCPI0_0
+; CHECK-NEXT: ld1rd { z4.d }, p0/z, [x8]
+; CHECK-NEXT: zip2 z5.d, z4.d, z4.d
+; CHECK-NEXT: zip1 z4.d, z4.d, z4.d
+; CHECK-NEXT: fmad z0.d, p0/m, z4.d, z2.d
+; CHECK-NEXT: fmad z1.d, p0/m, z5.d, z3.d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %ext00 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %ext01 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %fmul0 = fmul fast <vscale x 2 x double> %ext00, splat (double 3.200000e+00)
+ %strided.vec44 = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %ext10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec44, 0
+ %ext11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec44, 1
+ %fadd0 = fadd fast <vscale x 2 x double> %ext10, %fmul0
+ %fmul1 = fmul fast <vscale x 2 x double> %ext01, splat (double 3.200000e+00)
+ %fadd1 = fadd fast <vscale x 2 x double> %ext11, %fmul1
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %fadd0, <vscale x 2 x double> %fadd1)
+ ret <vscale x 4 x double> %interleaved.vec
+}
+
+define <vscale x 4 x double> @simple_symmetric_unary2(<vscale x 4 x double> %a) {
+; CHECK-LABEL: simple_symmetric_unary2:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fneg z0.d, p0/m, z0.d
+; CHECK-NEXT: fneg z1.d, p0/m, z1.d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %ext00 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %ext01 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %fneg0 = fneg fast <vscale x 2 x double> %ext00
+ %fneg1 = fneg fast <vscale x 2 x double> %ext01
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.vector.interleave2.nxv4f64(<vscale x 2 x double> %fneg0, <vscale x 2 x double> %fneg1)
+ ret <vscale x 4 x double> %interleaved.vec
+}
+
+define <vscale x 8 x double> @simple_symmetric_muladd4(<vscale x 8 x double> %a, <vscale x 8 x double> %b) {
+; CHECK-LABEL: simple_symmetric_muladd4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d
+; CHECK-NEXT: uzp2 z25.d, z0.d, z1.d
+; CHECK-NEXT: adrp x8, .LCPI2_0
+; CHECK-NEXT: add x8, x8, :lo12:.LCPI2_0
+; CHECK-NEXT: uzp1 z2.d, z2.d, z3.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
+; CHECK-NEXT: uzp2 z1.d, z6.d, z7.d
+; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d
+; CHECK-NEXT: uzp1 z6.d, z6.d, z7.d
+; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp2 z5.d, z25.d, z24.d
+; CHECK-NEXT: uzp1 z7.d, z25.d, z24.d
+; CHECK-NEXT: uzp1 z24.d, z0.d, z2.d
+; CHECK-NEXT: uzp2 z0.d, z0.d, z2.d
+; CHECK-NEXT: ld1rd { z25.d }, p0/z, [x8]
+; CHECK-NEXT: uzp1 z26.d, z3.d, z1.d
+; CHECK-NEXT: uzp2 z1.d, z3.d, z1.d
+; CHECK-NEXT: uzp1 z27.d, z4.d, z6.d
+; CHECK-NEXT: uzp2 z2.d, z4.d, z6.d
+; CHECK-NEXT: movprfx z4, z26
+; CHECK-NEXT: fmla z4.d, p0/m, z7.d, z25.d
+; CHECK-NEXT: fmla z1.d, p0/m, z5.d, z25.d
+; CHECK-NEXT: movprfx z3, z27
+; CHECK-NEXT: fmla z3.d, p0/m, z24.d, z25.d
+; CHECK-NEXT: fmad z0.d, p0/m, z25.d, z2.d
+; CHECK-NEXT: zip1 z2.d, z4.d, z1.d
+; CHECK-NEXT: zip2 z4.d, z4.d, z1.d
+; CHECK-NEXT: zip1 z5.d, z3.d, z0.d
+; CHECK-NEXT: zip2 z3.d, z3.d, z0.d
+; CHECK-NEXT: zip1 z0.d, z5.d, z2.d
+; CHECK-NEXT: zip2 z1.d, z5.d, z2.d
+; CHECK-NEXT: zip1 z2.d, z3.d, z4.d
+; CHECK-NEXT: zip2 z3.d, z3.d, z4.d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave4.nxv8f64(<vscale x 8 x double> %a)
+ %ext00 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %ext01 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %ext02 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 2
+ %ext03 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 3
+ %fmul0 = fmul fast <vscale x 2 x double> %ext00, splat (double 3.200000e+00)
+ %strided.vec44 = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave4.nxv8f64(<vscale x 8 x double> %b)
+ %ext10 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec44, 0
+ %ext11 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec44, 1
+ %ext12 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec44, 2
+ %ext13 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec44, 3
+ %fadd0 = fadd fast <vscale x 2 x double> %ext10, %fmul0
+ %fmul1 = fmul fast <vscale x 2 x double> %ext01, splat (double 3.200000e+00)
+ %fadd1 = fadd fast <vscale x 2 x double> %ext11, %fmul1
+ %fmul2 = fmul fast <vscale x 2 x double> %ext02, splat (double 3.200000e+00)
+ %fadd2 = fadd fast <vscale x 2 x double> %ext12, %fmul2
+ %fmul3 = fmul fast <vscale x 2 x double> %ext03, splat (double 3.200000e+00)
+ %fadd3 = fadd fast <vscale x 2 x double> %ext13, %fmul3
+ %interleaved.vec = tail call <vscale x 8 x double> @llvm.vector.interleave4.nxv8f64(<vscale x 2 x double> %fadd0, <vscale x 2 x double> %fadd1, <vscale x 2 x double> %fadd2, <vscale x 2 x double> %fadd3)
+ ret <vscale x 8 x double> %interleaved.vec
+}
+
+
+define <vscale x 8 x double> @simple_symmetric_unary4(<vscale x 8 x double> %a) {
+; CHECK-LABEL: simple_symmetric_unary4:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d
+; CHECK-NEXT: uzp1 z5.d, z0.d, z1.d
+; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d
+; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp1 z1.d, z5.d, z4.d
+; CHECK-NEXT: uzp2 z4.d, z5.d, z4.d
+; CHECK-NEXT: uzp2 z3.d, z0.d, z2.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z2.d
+; CHECK-NEXT: fneg z1.d, p0/m, z1.d
+; CHECK-NEXT: movprfx z2, z4
+; CHECK-NEXT: fneg z2.d, p0/m, z4.d
+; CHECK-NEXT: fneg z0.d, p0/m, z0.d
+; CHECK-NEXT: fneg z3.d, p0/m, z3.d
+; CHECK-NEXT: zip1 z5.d, z1.d, z2.d
+; CHECK-NEXT: zip2 z6.d, z1.d, z2.d
+; CHECK-NEXT: zip1 z4.d, z0.d, z3.d
+; CHECK-NEXT: zip2 z3.d, z0.d, z3.d
+; CHECK-NEXT: zip1 z0.d, z5.d, z4.d
+; CHECK-NEXT: zip2 z1.d, z5.d, z4.d
+; CHECK-NEXT: zip1 z2.d, z6.d, z3.d
+; CHECK-NEXT: zip2 z3.d, z6.d, z3.d
+; CHECK-NEXT: ret
+entry:
+ %strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave4.nxv8f64(<vscale x 8 x double> %a)
+ %ext00 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 0
+ %ext01 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 1
+ %ext02 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 2
+ %ext03 = extractvalue { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } %strided.vec, 3
+ %fneg0 = fneg fast <vscale x 2 x double> %ext00
+ %fneg1 = fneg fast <vscale x 2 x double> %ext01
+ %fneg2 = fneg fast <vscale x 2 x double> %ext02
+ %fneg3 = fneg fast <vscale x 2 x double> %ext03
+ %interleaved.vec = tail call <vscale x 8 x double> @llvm.vector.interleave4.nxv8f64(<vscale x 2 x double> %fneg0, <vscale x 2 x double> %fneg1, <vscale x 2 x double> %fneg2, <vscale x 2 x double> %fneg3)
+ ret <vscale x 8 x double> %interleaved.vec
+}
From 53a94274c069a4a40c32b6a84d75bfe89ffe01f1 Mon Sep 17 00:00:00 2001
From: David Sherwood <david.sherwood at arm.com>
Date: Wed, 30 Jul 2025 08:47:59 +0000
Subject: [PATCH 2/2] [AArch64] Support symmetric complex deinterleaving with
higher factors
For loops such as this:

  struct foo {
    double a, b;
  };

  void foo(struct foo *dst, struct foo *src, int n) {
    for (int i = 0; i < n; i++) {
      dst[i].a += src[i].a * 3.2;
      dst[i].b += src[i].b * 3.2;
    }
  }
the complex deinterleaving pass will spot that the deinterleaving
associated with the structured loads cancels out the interleaving
associated with the structured stores. This happens even though the
values are not truly "complex" numbers, because the pass can handle
symmetric operations too. This is great because it means we can then
use normal loads and stores instead. However, we can also do the
same for higher interleave factors, e.g. 4:
  struct foo {
    double a, b, c, d;
  };

  void foo(struct foo *dst, struct foo *src, int n) {
    for (int i = 0; i < n; i++) {
      dst[i].a += src[i].a * 3.2;
      dst[i].b += src[i].b * 3.2;
      dst[i].c += src[i].c * 3.2;
      dst[i].d += src[i].d * 3.2;
    }
  }
This PR extends the pass to effectively treat such structures as
a set of complex numbers, i.e.

  struct foo_alt {
    std::complex<double> x, y;
  };

with equivalence between members:

  foo_alt.x.real == foo.a
  foo_alt.x.imag == foo.b
  foo_alt.y.real == foo.c
  foo_alt.y.imag == foo.d
I've written the code to handle sets with arbitrary numbers of
complex values, but since we only support interleave factors
between 2 and 4 I've restricted the sets to 1 or 2 complex
numbers. Also, for now I've restricted support for an interleave
factor of 4 to purely symmetric operations; it could later be
extended to handle complex multiplications, reductions, etc.
Fixes: https://github.com/llvm/llvm-project/issues/144795
---
.../lib/CodeGen/ComplexDeinterleavingPass.cpp | 460 ++++++++++++------
...mplex-deinterleaving-symmetric-scalable.ll | 65 +--
2 files changed, 335 insertions(+), 190 deletions(-)
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 9b2851eb42b40..2787227e0a255 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -67,6 +67,7 @@
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
@@ -107,6 +108,42 @@ static bool isNeg(Value *V);
/// Returns the operand for negation operation.
static Value *getNegOperand(Value *V);
+namespace {
+struct ComplexValue {
+ Value *Real = nullptr;
+ Value *Imag = nullptr;
+
+ bool operator==(const ComplexValue &Other) const {
+ return Real == Other.Real && Imag == Other.Imag;
+ }
+};
+hash_code hash_value(const ComplexValue &Arg) {
+ return hash_combine(DenseMapInfo<Value *>::getHashValue(Arg.Real),
+ DenseMapInfo<Value *>::getHashValue(Arg.Imag));
+}
+} // end namespace
+typedef SmallVector<struct ComplexValue, 2> ComplexValues;
+
+namespace llvm {
+template <> struct DenseMapInfo<ComplexValue> {
+ static inline ComplexValue getEmptyKey() {
+ return {DenseMapInfo<Value *>::getEmptyKey(),
+ DenseMapInfo<Value *>::getEmptyKey()};
+ }
+ static inline ComplexValue getTombstoneKey() {
+ return {DenseMapInfo<Value *>::getTombstoneKey(),
+ DenseMapInfo<Value *>::getTombstoneKey()};
+ }
+ static unsigned getHashValue(const ComplexValue &Val) {
+ return hash_combine(DenseMapInfo<Value *>::getHashValue(Val.Real),
+ DenseMapInfo<Value *>::getHashValue(Val.Imag));
+ }
+ static bool isEqual(const ComplexValue &LHS, const ComplexValue &RHS) {
+ return LHS.Real == RHS.Real && LHS.Imag == RHS.Imag;
+ }
+};
+} // end namespace llvm
+
namespace {
template <typename T, typename IterT>
std::optional<T> findCommonBetweenCollections(IterT A, IterT B) {
@@ -145,7 +182,13 @@ struct ComplexDeinterleavingCompositeNode {
ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op,
Value *R, Value *I)
- : Operation(Op), Real(R), Imag(I) {}
+ : Operation(Op) {
+ Vals.push_back({R, I});
+ }
+
+ ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op,
+ ComplexValues &Other)
+ : Operation(Op), Vals(Other) {}
private:
friend class ComplexDeinterleavingGraph;
@@ -155,8 +198,7 @@ struct ComplexDeinterleavingCompositeNode {
public:
ComplexDeinterleavingOperation Operation;
- Value *Real;
- Value *Imag;
+ ComplexValues Vals;
// This two members are required exclusively for generating
// ComplexDeinterleavingOperation::Symmetric operations.
@@ -192,10 +234,12 @@ struct ComplexDeinterleavingCompositeNode {
};
OS << "- CompositeNode: " << this << "\n";
- OS << " Real: ";
- PrintValue(Real);
- OS << " Imag: ";
- PrintValue(Imag);
+ for (unsigned I = 0; I < Vals.size(); I++) {
+ OS << " Real(" << I << ") : ";
+ PrintValue(Vals[I].Real);
+ OS << " Imag(" << I << ") : ";
+ PrintValue(Vals[I].Imag);
+ }
OS << " ReplacementNode: ";
PrintValue(ReplacementNode);
OS << " Operation: " << (int)Operation << "\n";
@@ -233,14 +277,16 @@ class ComplexDeinterleavingGraph {
};
explicit ComplexDeinterleavingGraph(const TargetLowering *TL,
- const TargetLibraryInfo *TLI)
- : TL(TL), TLI(TLI) {}
+ const TargetLibraryInfo *TLI,
+ unsigned Factor)
+ : TL(TL), TLI(TLI), Factor(Factor) {}
private:
const TargetLowering *TL = nullptr;
const TargetLibraryInfo *TLI = nullptr;
+ unsigned Factor;
SmallVector<NodePtr> CompositeNodes;
- DenseMap<std::pair<Value *, Value *>, NodePtr> CachedResult;
+ DenseMap<ComplexValues, NodePtr> CachedResult;
SmallPtrSet<Instruction *, 16> FinalInstructions;
@@ -305,10 +351,26 @@ class ComplexDeinterleavingGraph {
I);
}
+ NodePtr prepareCompositeNode(ComplexDeinterleavingOperation Operation,
+ ComplexValues &Vals) {
+#ifndef NDEBUG
+ for (auto &V : Vals) {
+ assert(
+ ((Operation != ComplexDeinterleavingOperation::ReductionPHI &&
+ Operation != ComplexDeinterleavingOperation::ReductionOperation) ||
+ (V.Real && V.Imag)) &&
+ "Reduction related nodes must have Real and Imaginary parts");
+ }
+#endif
+ return std::make_shared<ComplexDeinterleavingCompositeNode>(Operation,
+ Vals);
+ }
+
NodePtr submitCompositeNode(NodePtr Node) {
CompositeNodes.push_back(Node);
- if (Node->Real)
- CachedResult[{Node->Real, Node->Imag}] = Node;
+ if (Node->Vals[0].Real) {
+ CachedResult[Node->Vals] = Node;
+ }
return Node;
}
@@ -340,11 +402,17 @@ class ComplexDeinterleavingGraph {
/// 270: r: ar + bi
/// i: ai - br
NodePtr identifyAdd(Instruction *Real, Instruction *Imag);
- NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag);
+ NodePtr identifySymmetricOperation(ComplexValues &Vals);
NodePtr identifyPartialReduction(Value *R, Value *I);
NodePtr identifyDotProduct(Value *Inst);
- NodePtr identifyNode(Value *R, Value *I);
+ NodePtr identifyNode(ComplexValues &Vals);
+
+ NodePtr identifyNode(Value *R, Value *I) {
+ ComplexValues Vals;
+ Vals.push_back({R, I});
+ return identifyNode(Vals);
+ }
/// Determine if a sum of complex numbers can be formed from \p RealAddends
/// and \p ImagAddens. If \p Accumulator is not null, add the result to it.
@@ -390,13 +458,13 @@ class ComplexDeinterleavingGraph {
/// odd indices for /pImag instructions (only for fixed-width vectors)
/// * Using two extractvalue instructions applied to `vector.deinterleave2`
/// intrinsic (for both fixed and scalable vectors)
- NodePtr identifyDeinterleave(Instruction *Real, Instruction *Imag);
+ NodePtr identifyDeinterleave(ComplexValues &Vals);
/// identifying the operation that represents a complex number repeated in a
/// Splat vector. There are two possible types of splats: ConstantExpr with
/// the opcode ShuffleVector and ShuffleVectorInstr. Both should have an
/// initialization mask with all values set to zero.
- NodePtr identifySplat(Value *Real, Value *Imag);
+ NodePtr identifySplat(ComplexValues &Vals);
NodePtr identifyPHINode(Instruction *Real, Instruction *Imag);
@@ -447,7 +515,7 @@ class ComplexDeinterleaving {
bool runOnFunction(Function &F);
private:
- bool evaluateBasicBlock(BasicBlock *B);
+ bool evaluateBasicBlock(BasicBlock *B, unsigned Factor);
const TargetLowering *TL = nullptr;
const TargetLibraryInfo *TLI = nullptr;
@@ -500,7 +568,15 @@ bool ComplexDeinterleaving::runOnFunction(Function &F) {
bool Changed = false;
for (auto &B : F)
- Changed |= evaluateBasicBlock(&B);
+ Changed |= evaluateBasicBlock(&B, 2);
+
+ // TODO: Permit changes for both interleave factors in the same function.
+ if (!Changed) {
+ for (auto &B : F)
+ Changed |= evaluateBasicBlock(&B, 4);
+ }
+
+ // TODO: We can also support interleave factors of 6 and 8 if needed.
return Changed;
}
@@ -545,8 +621,8 @@ Value *getNegOperand(Value *V) {
return I->getOperand(1);
}
-bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) {
- ComplexDeinterleavingGraph Graph(TL, TLI);
+bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B, unsigned Factor) {
+ ComplexDeinterleavingGraph Graph(TL, TLI, Factor);
if (Graph.collectPotentialReductions(B))
Graph.identifyReductionNodes();
@@ -669,6 +745,7 @@ ComplexDeinterleavingGraph::identifyPartialMul(Instruction *Real,
Instruction *Imag) {
LLVM_DEBUG(dbgs() << "identifyPartialMul " << *Real << " / " << *Imag
<< "\n");
+
// Determine rotation
auto IsAdd = [](unsigned Op) {
return Op == Instruction::FAdd || Op == Instruction::Add;
@@ -865,43 +942,57 @@ static bool isInstructionPotentiallySymmetric(Instruction *I) {
}
ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
- Instruction *Imag) {
- if (Real->getOpcode() != Imag->getOpcode())
- return nullptr;
+ComplexDeinterleavingGraph::identifySymmetricOperation(ComplexValues &Vals) {
+ auto *FirstReal = cast<Instruction>(Vals[0].Real);
+ unsigned FirstOpc = FirstReal->getOpcode();
+ for (auto &V : Vals) {
+ auto *Real = cast<Instruction>(V.Real);
+ auto *Imag = cast<Instruction>(V.Imag);
+ if (Real->getOpcode() != FirstOpc || Imag->getOpcode() != FirstOpc)
+ return nullptr;
- if (!isInstructionPotentiallySymmetric(Real) ||
- !isInstructionPotentiallySymmetric(Imag))
- return nullptr;
+ if (!isInstructionPotentiallySymmetric(Real) ||
+ !isInstructionPotentiallySymmetric(Imag))
+ return nullptr;
- auto *R0 = Real->getOperand(0);
- auto *I0 = Imag->getOperand(0);
+ if (isa<FPMathOperator>(FirstReal))
+ if (Real->getFastMathFlags() != FirstReal->getFastMathFlags() ||
+ Imag->getFastMathFlags() != FirstReal->getFastMathFlags())
+ return nullptr;
+ }
+
+ ComplexValues OpVals;
+ for (auto &V : Vals) {
+ auto *R0 = cast<Instruction>(V.Real)->getOperand(0);
+ auto *I0 = cast<Instruction>(V.Imag)->getOperand(0);
+ OpVals.push_back({R0, I0});
+ }
- NodePtr Op0 = identifyNode(R0, I0);
+ NodePtr Op0 = identifyNode(OpVals);
NodePtr Op1 = nullptr;
if (Op0 == nullptr)
return nullptr;
- if (Real->isBinaryOp()) {
- auto *R1 = Real->getOperand(1);
- auto *I1 = Imag->getOperand(1);
- Op1 = identifyNode(R1, I1);
+ if (FirstReal->isBinaryOp()) {
+ OpVals.clear();
+ for (auto &V : Vals) {
+ auto *R1 = cast<Instruction>(V.Real)->getOperand(1);
+ auto *I1 = cast<Instruction>(V.Imag)->getOperand(1);
+ OpVals.push_back({R1, I1});
+ }
+ Op1 = identifyNode(OpVals);
if (Op1 == nullptr)
return nullptr;
}
- if (isa<FPMathOperator>(Real) &&
- Real->getFastMathFlags() != Imag->getFastMathFlags())
- return nullptr;
-
- auto Node = prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric,
- Real, Imag);
- Node->Opcode = Real->getOpcode();
- if (isa<FPMathOperator>(Real))
- Node->Flags = Real->getFastMathFlags();
+ auto Node =
+ prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric, Vals);
+ Node->Opcode = FirstReal->getOpcode();
+ if (isa<FPMathOperator>(FirstReal))
+ Node->Flags = FirstReal->getFastMathFlags();
Node->addOperand(Op0);
- if (Real->isBinaryOp())
+ if (FirstReal->isBinaryOp())
Node->addOperand(Op1);
return submitCompositeNode(Node);
@@ -909,7 +1000,6 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
ComplexDeinterleavingGraph::NodePtr
ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
-
if (!TL->isComplexDeinterleavingOperationSupported(
ComplexDeinterleavingOperation::CDot, V->getType())) {
LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving "
@@ -1054,65 +1144,77 @@ ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) {
}
ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) {
- auto It = CachedResult.find({R, I});
+ComplexDeinterleavingGraph::identifyNode(ComplexValues &Vals) {
+ auto It = CachedResult.find(Vals);
if (It != CachedResult.end()) {
LLVM_DEBUG(dbgs() << " - Folding to existing node\n");
return It->second;
}
- if (NodePtr CN = identifyPartialReduction(R, I))
- return CN;
-
- bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I);
- if (!IsReduction && R->getType() != I->getType())
- return nullptr;
+ if (Vals.size() == 1) {
+ assert(Factor == 2 && "Can only handle interleave factors of 2");
+ Value *R = Vals[0].Real;
+ Value *I = Vals[0].Imag;
+ if (NodePtr CN = identifyPartialReduction(R, I))
+ return CN;
+ bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I);
+ if (!IsReduction && R->getType() != I->getType())
+ return nullptr;
+ }
- if (NodePtr CN = identifySplat(R, I))
+ if (NodePtr CN = identifySplat(Vals))
return CN;
- auto *Real = dyn_cast<Instruction>(R);
- auto *Imag = dyn_cast<Instruction>(I);
- if (!Real || !Imag)
- return nullptr;
+ for (auto &V : Vals) {
+ auto *Real = dyn_cast<Instruction>(V.Real);
+ auto *Imag = dyn_cast<Instruction>(V.Imag);
+ if (!Real || !Imag)
+ return nullptr;
+ }
- if (NodePtr CN = identifyDeinterleave(Real, Imag))
+ if (NodePtr CN = identifyDeinterleave(Vals))
return CN;
- if (NodePtr CN = identifyPHINode(Real, Imag))
- return CN;
+ if (Vals.size() == 1) {
+ assert(Factor == 2 && "Can only handle interleave factors of 2");
+ auto *Real = dyn_cast<Instruction>(Vals[0].Real);
+ auto *Imag = dyn_cast<Instruction>(Vals[0].Imag);
+ if (NodePtr CN = identifyPHINode(Real, Imag))
+ return CN;
- if (NodePtr CN = identifySelectNode(Real, Imag))
- return CN;
+ if (NodePtr CN = identifySelectNode(Real, Imag))
+ return CN;
- auto *VTy = cast<VectorType>(Real->getType());
- auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
+ auto *VTy = cast<VectorType>(Real->getType());
+ auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
- bool HasCMulSupport = TL->isComplexDeinterleavingOperationSupported(
- ComplexDeinterleavingOperation::CMulPartial, NewVTy);
- bool HasCAddSupport = TL->isComplexDeinterleavingOperationSupported(
- ComplexDeinterleavingOperation::CAdd, NewVTy);
+ bool HasCMulSupport = TL->isComplexDeinterleavingOperationSupported(
+ ComplexDeinterleavingOperation::CMulPartial, NewVTy);
+ bool HasCAddSupport = TL->isComplexDeinterleavingOperationSupported(
+ ComplexDeinterleavingOperation::CAdd, NewVTy);
- if (HasCMulSupport && isInstructionPairMul(Real, Imag)) {
- if (NodePtr CN = identifyPartialMul(Real, Imag))
- return CN;
- }
+ if (HasCMulSupport && isInstructionPairMul(Real, Imag)) {
+ if (NodePtr CN = identifyPartialMul(Real, Imag))
+ return CN;
+ }
- if (HasCAddSupport && isInstructionPairAdd(Real, Imag)) {
- if (NodePtr CN = identifyAdd(Real, Imag))
- return CN;
- }
+ if (HasCAddSupport && isInstructionPairAdd(Real, Imag)) {
+ if (NodePtr CN = identifyAdd(Real, Imag))
+ return CN;
+ }
- if (HasCMulSupport && HasCAddSupport) {
- if (NodePtr CN = identifyReassocNodes(Real, Imag))
- return CN;
+ if (HasCMulSupport && HasCAddSupport) {
+ if (NodePtr CN = identifyReassocNodes(Real, Imag)) {
+ return CN;
+ }
+ }
}
- if (NodePtr CN = identifySymmetricOperation(Real, Imag))
+ if (NodePtr CN = identifySymmetricOperation(Vals))
return CN;
LLVM_DEBUG(dbgs() << " - Not recognised as a valid pattern.\n");
- CachedResult[{R, I}] = nullptr;
+ CachedResult[Vals] = nullptr;
return nullptr;
}
@@ -1256,9 +1358,10 @@ ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real,
return nullptr;
}
assert(FinalNode && "FinalNode can not be nullptr here");
+ assert(FinalNode->Vals.size() == 1);
// Set the Real and Imag fields of the final node and submit it
- FinalNode->Real = Real;
- FinalNode->Imag = Imag;
+ FinalNode->Vals[0].Real = Real;
+ FinalNode->Vals[0].Imag = Imag;
submitCompositeNode(FinalNode);
return FinalNode;
}
@@ -1381,7 +1484,7 @@ ComplexDeinterleavingGraph::identifyMultiplications(
auto NodeA = It->second;
auto NodeB = PMI.Node;
- auto IsMultiplicandReal = PMI.Common == NodeA->Real;
+ auto IsMultiplicandReal = PMI.Common == NodeA->Vals[0].Real;
// The following table illustrates the relationship between multiplications
// and rotations. If we consider the multiplication (X + iY) * (U + iV), we
// can see:
@@ -1423,10 +1526,10 @@ ComplexDeinterleavingGraph::identifyMultiplications(
LLVM_DEBUG({
dbgs() << "Identified partial multiplication (X, Y) * (U, V):\n";
- dbgs().indent(4) << "X: " << *NodeA->Real << "\n";
- dbgs().indent(4) << "Y: " << *NodeA->Imag << "\n";
- dbgs().indent(4) << "U: " << *NodeB->Real << "\n";
- dbgs().indent(4) << "V: " << *NodeB->Imag << "\n";
+ dbgs().indent(4) << "X: " << *NodeA->Vals[0].Real << "\n";
+ dbgs().indent(4) << "Y: " << *NodeA->Vals[0].Imag << "\n";
+ dbgs().indent(4) << "U: " << *NodeB->Vals[0].Real << "\n";
+ dbgs().indent(4) << "V: " << *NodeB->Vals[0].Imag << "\n";
dbgs().indent(4) << "Rotation - " << (int)Rotation * 90 << "\n";
});
@@ -1595,10 +1698,13 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
ComplexDeinterleavingOperation::ReductionOperation ||
RootNode->Operation ==
ComplexDeinterleavingOperation::ReductionSingle);
+ assert(RootNode->Vals.size() == 1 &&
+ "Cannot handle reductions involving multiple complex values");
// Find out which part, Real or Imag, comes later, and only if we come to
// the latest part, add it to OrderedRoots.
- auto *R = cast<Instruction>(RootNode->Real);
- auto *I = RootNode->Imag ? cast<Instruction>(RootNode->Imag) : nullptr;
+ auto *R = cast<Instruction>(RootNode->Vals[0].Real);
+ auto *I = RootNode->Vals[0].Imag ? cast<Instruction>(RootNode->Vals[0].Imag)
+ : nullptr;
Instruction *ReplacementAnchor;
if (I)
@@ -1631,6 +1737,8 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) {
bool FoundPotentialReduction = false;
+ if (Factor != 2)
+ return false;
auto *Br = dyn_cast<BranchInst>(B->getTerminator());
if (!Br || Br->getNumSuccessors() != 2)
@@ -1682,6 +1790,8 @@ bool ComplexDeinterleavingGraph::collectPotentialReductions(BasicBlock *B) {
}
void ComplexDeinterleavingGraph::identifyReductionNodes() {
+ assert(Factor == 2 && "Cannot handle multiple complex values");
+
SmallVector<bool> Processed(ReductionInfo.size(), false);
SmallVector<Instruction *> OperationInstruction;
for (auto &P : ReductionInfo)
@@ -1771,11 +1881,11 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
}
bool ComplexDeinterleavingGraph::checkNodes() {
-
bool FoundDeinterleaveNode = false;
for (NodePtr N : CompositeNodes) {
if (!N->areOperandsValid())
return false;
+
if (N->Operation == ComplexDeinterleavingOperation::Deinterleave)
FoundDeinterleaveNode = true;
}
@@ -1861,17 +1971,33 @@ bool ComplexDeinterleavingGraph::checkNodes() {
ComplexDeinterleavingGraph::NodePtr
ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) {
if (auto *Intrinsic = dyn_cast<IntrinsicInst>(RootI)) {
- if (Intrinsic->getIntrinsicID() != Intrinsic::vector_interleave2)
+ if (Intrinsic::getInterleaveIntrinsicID(Factor) !=
+ Intrinsic->getIntrinsicID())
return nullptr;
- auto *Real = dyn_cast<Instruction>(Intrinsic->getOperand(0));
- auto *Imag = dyn_cast<Instruction>(Intrinsic->getOperand(1));
- if (!Real || !Imag)
- return nullptr;
+ ComplexValues Vals;
+ for (unsigned I = 0; I < Factor; I += 2) {
+ auto *Real = dyn_cast<Instruction>(Intrinsic->getOperand(I));
+ auto *Imag = dyn_cast<Instruction>(Intrinsic->getOperand(I + 1));
+ if (!Real || !Imag)
+ return nullptr;
+ Vals.push_back({Real, Imag});
+ }
- return identifyNode(Real, Imag);
+ ComplexDeinterleavingGraph::NodePtr Node1 = identifyNode(Vals);
+ if (!Node1)
+ return nullptr;
+ return Node1;
}
+ // TODO: We could also add support for fixed-width interleave factors of 4
+ // and above, but currently for symmetric operations the interleaves and
+ // deinterleaves are already removed by VectorCombine. If we extend this to
+ // permit complex multiplications, reductions, etc. then we should also add
+ // support for fixed-width here.
+ if (Factor != 2)
+ return nullptr;
+
auto *SVI = dyn_cast<ShuffleVectorInst>(RootI);
if (!SVI)
return nullptr;
@@ -1890,22 +2016,53 @@ ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) {
}
ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real,
- Instruction *Imag) {
- Instruction *I = nullptr;
- Value *FinalValue = nullptr;
- if (match(Real, m_ExtractValue<0>(m_Instruction(I))) &&
- match(Imag, m_ExtractValue<1>(m_Specific(I))) &&
- match(I, m_Intrinsic<Intrinsic::vector_deinterleave2>(
- m_Value(FinalValue)))) {
+ComplexDeinterleavingGraph::identifyDeinterleave(ComplexValues &Vals) {
+ Instruction *II = nullptr;
+
+ // Must be at least one complex value.
+ for (unsigned Idx = 0; Idx < Vals.size(); Idx++) {
+ auto *EVI = dyn_cast<ExtractValueInst>(Vals[Idx].Real);
+ if (!EVI || EVI->getNumIndices() != 1 ||
+ EVI->getIndices()[0] != (Idx * 2) ||
+ (Idx != 0 && II != EVI->getAggregateOperand())) {
+ II = nullptr;
+ break;
+ }
+ if (Idx == 0) {
+ II = dyn_cast<Instruction>(EVI->getAggregateOperand());
+ if (!II)
+ break;
+ }
+ EVI = dyn_cast<ExtractValueInst>(Vals[Idx].Imag);
+ if (!EVI || EVI->getNumIndices() != 1 ||
+ EVI->getIndices()[0] != ((Idx * 2) + 1) ||
+ II != EVI->getAggregateOperand()) {
+ II = nullptr;
+ break;
+ }
+ }
+
+ if (II && isa<IntrinsicInst>(II)) {
+ if (cast<IntrinsicInst>(II)->getIntrinsicID() !=
+ Intrinsic::getDeinterleaveIntrinsicID(2 * Vals.size()))
+ return nullptr;
+
+ // The remaining should match too.
NodePtr PlaceholderNode = prepareCompositeNode(
- llvm::ComplexDeinterleavingOperation::Deinterleave, Real, Imag);
- PlaceholderNode->ReplacementNode = FinalValue;
- FinalInstructions.insert(Real);
- FinalInstructions.insert(Imag);
+ llvm::ComplexDeinterleavingOperation::Deinterleave, Vals);
+ PlaceholderNode->ReplacementNode = II->getOperand(0);
+ for (auto &V : Vals) {
+ FinalInstructions.insert(cast<Instruction>(V.Real));
+ FinalInstructions.insert(cast<Instruction>(V.Imag));
+ }
return submitCompositeNode(PlaceholderNode);
}
+ if (Vals.size() != 1)
+ return nullptr;
+
+ Value *Real = Vals[0].Real;
+ Value *Imag = Vals[0].Imag;
auto *RealShuffle = dyn_cast<ShuffleVectorInst>(Real);
auto *ImagShuffle = dyn_cast<ShuffleVectorInst>(Imag);
if (!RealShuffle || !ImagShuffle) {
@@ -1999,7 +2156,7 @@ ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real,
}
ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) {
+ComplexDeinterleavingGraph::identifySplat(ComplexValues &Vals) {
auto IsSplat = [](Value *V) -> bool {
// Fixed-width vector with constants
if (isa<ConstantDataVector>(V))
@@ -2033,24 +2190,38 @@ ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) {
return all_equal(Mask) && Mask[0] == 0;
};
- if (!IsSplat(R) || !IsSplat(I))
- return nullptr;
+ // The splats must meet the following requirements:
+ // 1. Must either be all instructions or all values.
+ // 2. Non-constant splats must live in the same block.
+ auto FirstValAsInstruction = dyn_cast<Instruction>(Vals[0].Real);
+ BasicBlock *FirstBB =
+ FirstValAsInstruction ? FirstValAsInstruction->getParent() : nullptr;
+ for (auto &V : Vals) {
+ if (!IsSplat(V.Real) || !IsSplat(V.Imag))
+ return nullptr;
- auto *Real = dyn_cast<Instruction>(R);
- auto *Imag = dyn_cast<Instruction>(I);
- if ((!Real && Imag) || (Real && !Imag))
- return nullptr;
+ auto *Real = dyn_cast<Instruction>(V.Real);
+ auto *Imag = dyn_cast<Instruction>(V.Imag);
+ if (FirstValAsInstruction && (!Real || !Imag))
+ return nullptr;
+ else if (!FirstValAsInstruction && (Real || Imag))
+ return nullptr;
- if (Real && Imag) {
- // Non-constant splats should be in the same basic block
- if (Real->getParent() != Imag->getParent())
+ if (FirstValAsInstruction &&
+ (Real->getParent() != FirstBB || Imag->getParent() != FirstBB))
return nullptr;
+ }
- FinalInstructions.insert(Real);
- FinalInstructions.insert(Imag);
+ for (auto &V : Vals) {
+ auto *Real = dyn_cast<Instruction>(V.Real);
+ auto *Imag = dyn_cast<Instruction>(V.Imag);
+ if (Real && Imag) {
+ FinalInstructions.insert(Real);
+ FinalInstructions.insert(Imag);
+ }
}
NodePtr PlaceholderNode =
- prepareCompositeNode(ComplexDeinterleavingOperation::Splat, R, I);
+ prepareCompositeNode(ComplexDeinterleavingOperation::Splat, Vals);
return submitCompositeNode(PlaceholderNode);
}
@@ -2186,24 +2357,35 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
llvm_unreachable("Deinterleave node should already have ReplacementNode");
break;
case ComplexDeinterleavingOperation::Splat: {
- auto *R = dyn_cast<Instruction>(Node->Real);
- auto *I = dyn_cast<Instruction>(Node->Imag);
+ SmallVector<Value *> Ops;
+ for (auto &V : Node->Vals) {
+ Ops.push_back(V.Real);
+ Ops.push_back(V.Imag);
+ }
+ auto *R = dyn_cast<Instruction>(Node->Vals[0].Real);
+ auto *I = dyn_cast<Instruction>(Node->Vals[0].Imag);
if (R && I) {
// Splats that are not constant are interleaved where they are located
- Instruction *InsertPoint = (I->comesBefore(R) ? R : I)->getNextNode();
+ Instruction *InsertPoint = R;
+ for (auto V : Node->Vals) {
+ if (InsertPoint->comesBefore(cast<Instruction>(V.Real)))
+ InsertPoint = cast<Instruction>(V.Real);
+ if (InsertPoint->comesBefore(cast<Instruction>(V.Imag)))
+ InsertPoint = cast<Instruction>(V.Imag);
+ }
+ InsertPoint = InsertPoint->getNextNode();
IRBuilder<> IRB(InsertPoint);
- ReplacementNode = IRB.CreateVectorInterleave({Node->Real, Node->Imag});
+ ReplacementNode = IRB.CreateVectorInterleave(Ops);
} else {
- ReplacementNode =
- Builder.CreateVectorInterleave({Node->Real, Node->Imag});
+ ReplacementNode = Builder.CreateVectorInterleave(Ops);
}
break;
}
case ComplexDeinterleavingOperation::ReductionPHI: {
// If Operation is ReductionPHI, a new empty PHINode is created.
// It is filled later when the ReductionOperation is processed.
- auto *OldPHI = cast<PHINode>(Node->Real);
- auto *VTy = cast<VectorType>(Node->Real->getType());
+ auto *OldPHI = cast<PHINode>(Node->Vals[0].Real);
+ auto *VTy = cast<VectorType>(Node->Vals[0].Real->getType());
auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHIIt());
OldToNewPHI[OldPHI] = NewPHI;
@@ -2219,8 +2401,8 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
processReductionOperation(ReplacementNode, Node);
break;
case ComplexDeinterleavingOperation::ReductionSelect: {
- auto *MaskReal = cast<Instruction>(Node->Real)->getOperand(0);
- auto *MaskImag = cast<Instruction>(Node->Imag)->getOperand(0);
+ auto *MaskReal = cast<Instruction>(Node->Vals[0].Real)->getOperand(0);
+ auto *MaskImag = cast<Instruction>(Node->Vals[0].Imag)->getOperand(0);
auto *A = replaceNode(Builder, Node->Operands[0]);
auto *B = replaceNode(Builder, Node->Operands[1]);
auto *NewMask = Builder.CreateVectorInterleave({MaskReal, MaskImag});
@@ -2237,7 +2419,7 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
void ComplexDeinterleavingGraph::processReductionSingle(
Value *OperationReplacement, RawNodePtr Node) {
- auto *Real = cast<Instruction>(Node->Real);
+ auto *Real = cast<Instruction>(Node->Vals[0].Real);
auto *OldPHI = ReductionInfo[Real].first;
auto *NewPHI = OldToNewPHI[OldPHI];
auto *VTy = cast<VectorType>(Real->getType());
@@ -2269,8 +2451,8 @@ void ComplexDeinterleavingGraph::processReductionSingle(
void ComplexDeinterleavingGraph::processReductionOperation(
Value *OperationReplacement, RawNodePtr Node) {
- auto *Real = cast<Instruction>(Node->Real);
- auto *Imag = cast<Instruction>(Node->Imag);
+ auto *Real = cast<Instruction>(Node->Vals[0].Real);
+ auto *Imag = cast<Instruction>(Node->Vals[0].Imag);
auto *OldPHIReal = ReductionInfo[Real].first;
auto *OldPHIImag = ReductionInfo[Imag].first;
auto *NewPHI = OldToNewPHI[OldPHIReal];
@@ -2318,15 +2500,15 @@ void ComplexDeinterleavingGraph::replaceNodes() {
if (RootNode->Operation ==
ComplexDeinterleavingOperation::ReductionOperation) {
- auto *RootReal = cast<Instruction>(RootNode->Real);
- auto *RootImag = cast<Instruction>(RootNode->Imag);
+ auto *RootReal = cast<Instruction>(RootNode->Vals[0].Real);
+ auto *RootImag = cast<Instruction>(RootNode->Vals[0].Imag);
ReductionInfo[RootReal].first->removeIncomingValue(BackEdge);
ReductionInfo[RootImag].first->removeIncomingValue(BackEdge);
DeadInstrRoots.push_back(RootReal);
DeadInstrRoots.push_back(RootImag);
} else if (RootNode->Operation ==
ComplexDeinterleavingOperation::ReductionSingle) {
- auto *RootInst = cast<Instruction>(RootNode->Real);
+ auto *RootInst = cast<Instruction>(RootNode->Vals[0].Real);
auto &Info = ReductionInfo[RootInst];
Info.first->removeIncomingValue(BackEdge);
DeadInstrRoots.push_back(Info.second);
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-symmetric-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-symmetric-scalable.ll
index 5d22120d7513e..0915015b8dd07 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-symmetric-scalable.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-symmetric-scalable.ll
@@ -50,40 +50,20 @@ entry:
define <vscale x 8 x double> @simple_symmetric_muladd4(<vscale x 8 x double> %a, <vscale x 8 x double> %b) {
; CHECK-LABEL: simple_symmetric_muladd4:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d
-; CHECK-NEXT: uzp2 z25.d, z0.d, z1.d
+; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: adrp x8, .LCPI2_0
; CHECK-NEXT: add x8, x8, :lo12:.LCPI2_0
-; CHECK-NEXT: uzp1 z2.d, z2.d, z3.d
-; CHECK-NEXT: uzp1 z0.d, z0.d, z1.d
-; CHECK-NEXT: uzp2 z1.d, z6.d, z7.d
-; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d
-; CHECK-NEXT: uzp1 z6.d, z6.d, z7.d
-; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d
-; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: uzp2 z5.d, z25.d, z24.d
-; CHECK-NEXT: uzp1 z7.d, z25.d, z24.d
-; CHECK-NEXT: uzp1 z24.d, z0.d, z2.d
-; CHECK-NEXT: uzp2 z0.d, z0.d, z2.d
-; CHECK-NEXT: ld1rd { z25.d }, p0/z, [x8]
-; CHECK-NEXT: uzp1 z26.d, z3.d, z1.d
-; CHECK-NEXT: uzp2 z1.d, z3.d, z1.d
-; CHECK-NEXT: uzp1 z27.d, z4.d, z6.d
-; CHECK-NEXT: uzp2 z2.d, z4.d, z6.d
-; CHECK-NEXT: movprfx z4, z26
-; CHECK-NEXT: fmla z4.d, p0/m, z7.d, z25.d
-; CHECK-NEXT: fmla z1.d, p0/m, z5.d, z25.d
-; CHECK-NEXT: movprfx z3, z27
-; CHECK-NEXT: fmla z3.d, p0/m, z24.d, z25.d
-; CHECK-NEXT: fmad z0.d, p0/m, z25.d, z2.d
-; CHECK-NEXT: zip1 z2.d, z4.d, z1.d
-; CHECK-NEXT: zip2 z4.d, z4.d, z1.d
-; CHECK-NEXT: zip1 z5.d, z3.d, z0.d
-; CHECK-NEXT: zip2 z3.d, z3.d, z0.d
-; CHECK-NEXT: zip1 z0.d, z5.d, z2.d
-; CHECK-NEXT: zip2 z1.d, z5.d, z2.d
-; CHECK-NEXT: zip1 z2.d, z3.d, z4.d
-; CHECK-NEXT: zip2 z3.d, z3.d, z4.d
+; CHECK-NEXT: ld1rd { z24.d }, p0/z, [x8]
+; CHECK-NEXT: zip1 z25.d, z24.d, z24.d
+; CHECK-NEXT: zip2 z24.d, z24.d, z24.d
+; CHECK-NEXT: zip2 z26.d, z25.d, z25.d
+; CHECK-NEXT: zip1 z25.d, z25.d, z25.d
+; CHECK-NEXT: zip2 z27.d, z24.d, z24.d
+; CHECK-NEXT: zip1 z24.d, z24.d, z24.d
+; CHECK-NEXT: fmad z0.d, p0/m, z25.d, z4.d
+; CHECK-NEXT: fmad z1.d, p0/m, z26.d, z5.d
+; CHECK-NEXT: fmad z2.d, p0/m, z24.d, z6.d
+; CHECK-NEXT: fmad z3.d, p0/m, z27.d, z7.d
; CHECK-NEXT: ret
entry:
%strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave4.nxv8f64(<vscale x 8 x double> %a)
@@ -112,28 +92,11 @@ entry:
define <vscale x 8 x double> @simple_symmetric_unary4(<vscale x 8 x double> %a) {
; CHECK-LABEL: simple_symmetric_unary4:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d
-; CHECK-NEXT: uzp1 z5.d, z0.d, z1.d
-; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d
-; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: uzp1 z1.d, z5.d, z4.d
-; CHECK-NEXT: uzp2 z4.d, z5.d, z4.d
-; CHECK-NEXT: uzp2 z3.d, z0.d, z2.d
-; CHECK-NEXT: uzp1 z0.d, z0.d, z2.d
-; CHECK-NEXT: fneg z1.d, p0/m, z1.d
-; CHECK-NEXT: movprfx z2, z4
-; CHECK-NEXT: fneg z2.d, p0/m, z4.d
; CHECK-NEXT: fneg z0.d, p0/m, z0.d
+; CHECK-NEXT: fneg z1.d, p0/m, z1.d
+; CHECK-NEXT: fneg z2.d, p0/m, z2.d
; CHECK-NEXT: fneg z3.d, p0/m, z3.d
-; CHECK-NEXT: zip1 z5.d, z1.d, z2.d
-; CHECK-NEXT: zip2 z6.d, z1.d, z2.d
-; CHECK-NEXT: zip1 z4.d, z0.d, z3.d
-; CHECK-NEXT: zip2 z3.d, z0.d, z3.d
-; CHECK-NEXT: zip1 z0.d, z5.d, z4.d
-; CHECK-NEXT: zip2 z1.d, z5.d, z4.d
-; CHECK-NEXT: zip1 z2.d, z6.d, z3.d
-; CHECK-NEXT: zip2 z3.d, z6.d, z3.d
; CHECK-NEXT: ret
entry:
%strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double> } @llvm.vector.deinterleave4.nxv8f64(<vscale x 8 x double> %a)