[llvm] 41a3f92 - [AArch64][CodeGen] Add AArch64 support for complex deinterleaving

Nicholas Guy via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 16 06:01:10 PST 2022


Author: Nicholas Guy
Date: 2022-11-16T14:00:54Z
New Revision: 41a3f92596a7a6821fa7d09e0f7bdceb90c7df2e

URL: https://github.com/llvm/llvm-project/commit/41a3f92596a7a6821fa7d09e0f7bdceb90c7df2e
DIFF: https://github.com/llvm/llvm-project/commit/41a3f92596a7a6821fa7d09e0f7bdceb90c7df2e.diff

LOG: [AArch64][CodeGen] Add AArch64 support for complex deinterleaving

Differential Revision: https://reviews.llvm.org/D129066

Added: 
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
    llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll

Modified: 
    llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
    llvm/lib/Target/AArch64/AArch64ISelLowering.h
    llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
    llvm/test/CodeGen/AArch64/O3-pipeline.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 41b30320a90c..b08eef8992e9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -23289,3 +23289,94 @@ bool AArch64TargetLowering::isConstantUnsignedBitfieldExtractLegal(
     unsigned Opc, LLT Ty1, LLT Ty2) const {
   return Ty1 == Ty2 && (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64));
 }
+
+bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
+  return Subtarget->hasComplxNum();
+}
+
+bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
+    ComplexDeinterleavingOperation Operation, Type *Ty) const {
+  auto *VTy = dyn_cast<FixedVectorType>(Ty);
+  if (!VTy)
+    return false;
+
+  auto *ScalarTy = VTy->getScalarType();
+  unsigned NumElements = VTy->getNumElements();
+
+  unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
+  if ((VTyWidth < 128 && VTyWidth != 64) || !llvm::isPowerOf2_32(VTyWidth))
+    return false;
+
+  return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
+         ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
+}
+
+Value *AArch64TargetLowering::createComplexDeinterleavingIR(
+    Instruction *I, ComplexDeinterleavingOperation OperationType,
+    ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
+    Value *Accumulator) const {
+  FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());
+
+  IRBuilder<> B(I);
+
+  unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
+
+  assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
+         "Vector type must be either 64 or a power of 2 that is at least 128");
+
+  if (TyWidth > 128) {
+    int Stride = Ty->getNumElements() / 2;
+    auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
+    auto SplitSeqVec = llvm::to_vector(SplitSeq);
+    ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
+    ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
+
+    auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
+    auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
+    auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
+    auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
+    Value *LowerSplitAcc = nullptr;
+    Value *UpperSplitAcc = nullptr;
+
+    if (Accumulator) {
+      LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
+      UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
+    }
+
+    auto *LowerSplitInt = createComplexDeinterleavingIR(
+        I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
+    auto *UpperSplitInt = createComplexDeinterleavingIR(
+        I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
+
+    ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
+    return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
+  }
+
+  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
+    Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
+                              Intrinsic::aarch64_neon_vcmla_rot90,
+                              Intrinsic::aarch64_neon_vcmla_rot180,
+                              Intrinsic::aarch64_neon_vcmla_rot270};
+
+    if (Accumulator == nullptr)
+      Accumulator = ConstantFP::get(Ty, 0);
+
+    return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
+                             {Accumulator, InputB, InputA});
+  }
+
+  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
+    Intrinsic::ID IntId = Intrinsic::not_intrinsic;
+    if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
+      IntId = Intrinsic::aarch64_neon_vcadd_rot90;
+    else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
+      IntId = Intrinsic::aarch64_neon_vcadd_rot270;
+
+    if (IntId == Intrinsic::not_intrinsic)
+      return nullptr;
+
+    return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
+  }
+
+  return nullptr;
+}

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 88bd154829e2..b0c697f069d1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -812,6 +812,15 @@ class AArch64TargetLowering : public TargetLowering {
 
   bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
 
+  bool isComplexDeinterleavingSupported() const override;
+  bool isComplexDeinterleavingOperationSupported(
+      ComplexDeinterleavingOperation Operation, Type *Ty) const override;
+
+  Value *createComplexDeinterleavingIR(
+      Instruction *I, ComplexDeinterleavingOperation OperationType,
+      ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
+      Value *Accumulator = nullptr) const override;
+
   bool hasBitPreservingFPLogic(EVT VT) const override {
     // FIXME: Is this always true? It should be true for vectors at least.
     return VT == MVT::f32 || VT == MVT::f64;

diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 329f5b433f23..d7df95e21c6c 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -589,6 +589,10 @@ void AArch64PassConfig::addIRPasses() {
   addPass(createAArch64StackTaggingPass(
       /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None));
 
+  // Match complex arithmetic patterns
+  if (TM->getOptLevel() >= CodeGenOpt::Default)
+    addPass(createComplexDeinterleavingPass(TM));
+
   // Match interleaved memory accesses to ldN/stN intrinsics.
   if (TM->getOptLevel() != CodeGenOpt::None) {
     addPass(createInterleavedLoadCombinePass());

diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index a605b2ecda7e..81e1c6df19e0 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -87,6 +87,7 @@
 ; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       AArch64 Stack Tagging
+; CHECK-NEXT:       Complex Deinterleaving Pass
 ; CHECK-NEXT:       Function Alias Analysis Results
 ; CHECK-NEXT:       Memory SSA
 ; CHECK-NEXT:       Interleaved Load Combine Pass

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
new file mode 100644
index 000000000000..64a436b4031d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to not transform
+define <2 x half> @complex_add_v2f16(<2 x half> %a, <2 x half> %b) {
+; CHECK-LABEL: complex_add_v2f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov h2, v0.h[1]
+; CHECK-NEXT:    mov h3, v1.h[1]
+; CHECK-NEXT:    fsub h1, h1, h2
+; CHECK-NEXT:    fadd h0, h3, h0
+; CHECK-NEXT:    mov v1.h[1], v0.h[0]
+; CHECK-NEXT:    fmov d0, d1
+; CHECK-NEXT:    ret
+entry:
+  %a.real = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> <i32 0>
+  %a.imag = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> <i32 1>
+  %b.real = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> <i32 0>
+  %b.imag = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> <i32 1>
+  %0 = fsub fast <1 x half> %b.real, %a.imag
+  %1 = fadd fast <1 x half> %b.imag, %a.real
+  %interleaved.vec = shufflevector <1 x half> %0, <1 x half> %1, <2 x i32> <i32 0, i32 1>
+  ret <2 x half> %interleaved.vec
+}
+
+; Expected to transform
+define <4 x half> @complex_add_v4f16(<4 x half> %a, <4 x half> %b) {
+; CHECK-LABEL: complex_add_v4f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcadd v0.4h, v1.4h, v0.4h, #90
+; CHECK-NEXT:    ret
+entry:
+  %a.real = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %0 = fsub fast <2 x half> %b.real, %a.imag
+  %1 = fadd fast <2 x half> %b.imag, %a.real
+  %interleaved.vec = shufflevector <2 x half> %0, <2 x half> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x half> %interleaved.vec
+}
+
+; Expected to transform
+define <8 x half> @complex_add_v8f16(<8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: complex_add_v8f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcadd v0.8h, v1.8h, v0.8h, #90
+; CHECK-NEXT:    ret
+entry:
+  %a.real = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %a.imag = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %b.real = shufflevector <8 x half> %b, <8 x half> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b.imag = shufflevector <8 x half> %b, <8 x half> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fsub fast <4 x half> %b.real, %a.imag
+  %1 = fadd fast <4 x half> %b.imag, %a.real
+  %interleaved.vec = shufflevector <4 x half> %0, <4 x half> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x half> %interleaved.vec
+}
+
+; Expected to transform
+define <16 x half> @complex_add_v16f16(<16 x half> %a, <16 x half> %b) {
+; CHECK-LABEL: complex_add_v16f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcadd v0.8h, v2.8h, v0.8h, #90
+; CHECK-NEXT:    fcadd v1.8h, v3.8h, v1.8h, #90
+; CHECK-NEXT:    ret
+entry:
+  %a.real = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %a.imag = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %b.real = shufflevector <16 x half> %b, <16 x half> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %b.imag = shufflevector <16 x half> %b, <16 x half> zeroinitializer, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %0 = fsub fast <8 x half> %b.real, %a.imag
+  %1 = fadd fast <8 x half> %b.imag, %a.real
+  %interleaved.vec = shufflevector <8 x half> %0, <8 x half> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <16 x half> %interleaved.vec
+}
+
+; Expected to transform
+define <32 x half> @complex_add_v32f16(<32 x half> %a, <32 x half> %b) {
+; CHECK-LABEL: complex_add_v32f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcadd v2.8h, v6.8h, v2.8h, #90
+; CHECK-NEXT:    fcadd v0.8h, v4.8h, v0.8h, #90
+; CHECK-NEXT:    fcadd v1.8h, v5.8h, v1.8h, #90
+; CHECK-NEXT:    fcadd v3.8h, v7.8h, v3.8h, #90
+; CHECK-NEXT:    ret
+entry:
+  %a.real = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %a.imag = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %b.real = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %b.imag = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %0 = fsub fast <16 x half> %b.real, %a.imag
+  %1 = fadd fast <16 x half> %b.imag, %a.real
+  %interleaved.vec = shufflevector <16 x half> %0, <16 x half> %1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <32 x half> %interleaved.vec
+}

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
new file mode 100644
index 000000000000..069817508a5b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to not transform
+define <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) {
+; CHECK-LABEL: complex_mul_v2f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    mov h3, v0.h[1]
+; CHECK-NEXT:    mov h2, v1.h[1]
+; CHECK-NEXT:    fmul h4, h2, v0.h[0]
+; CHECK-NEXT:    fnmul h2, h3, h2
+; CHECK-NEXT:    fmla h4, h3, v1.h[0]
+; CHECK-NEXT:    fmla h2, h0, v1.h[0]
+; CHECK-NEXT:    mov v2.h[1], v4.h[0]
+; CHECK-NEXT:    fmov d0, d2
+; CHECK-NEXT:    ret
+entry:
+  %a.real   = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> <i32 0>
+  %a.imag = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> <i32 1>
+  %b.real = shufflevector <2 x half> %b, <2 x half> poison, <1 x i32> <i32 0>
+  %b.imag = shufflevector <2 x half> %b, <2 x half> poison, <1 x i32> <i32 1>
+  %0 = fmul fast <1 x half> %b.imag, %a.real
+  %1 = fmul fast <1 x half> %b.real, %a.imag
+  %2 = fadd fast <1 x half> %1, %0
+  %3 = fmul fast <1 x half> %b.real, %a.real
+  %4 = fmul fast <1 x half> %a.imag, %b.imag
+  %5 = fsub fast <1 x half> %3, %4
+  %interleaved.vec = shufflevector <1 x half> %5, <1 x half> %2, <2 x i32> <i32 0, i32 1>
+  ret <2 x half> %interleaved.vec
+}
+
+; Expected to transform
+define <4 x half> @complex_mul_v4f16(<4 x half> %a, <4 x half> %b) {
+; CHECK-LABEL: complex_mul_v4f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d2, #0000000000000000
+; CHECK-NEXT:    fcmla v2.4h, v0.4h, v1.4h, #0
+; CHECK-NEXT:    fcmla v2.4h, v0.4h, v1.4h, #90
+; CHECK-NEXT:    fmov d0, d2
+; CHECK-NEXT:    ret
+entry:
+  %a.real   = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x half> %b.imag, %a.real
+  %1 = fmul fast <2 x half> %b.real, %a.imag
+  %2 = fadd fast <2 x half> %1, %0
+  %3 = fmul fast <2 x half> %b.real, %a.real
+  %4 = fmul fast <2 x half> %a.imag, %b.imag
+  %5 = fsub fast <2 x half> %3, %4
+  %interleaved.vec = shufflevector <2 x half> %5, <2 x half> %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x half> %interleaved.vec
+}
+
+; Expected to transform
+define <8 x half> @complex_mul_v8f16(<8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: complex_mul_v8f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v2.8h, v0.8h, v1.8h, #0
+; CHECK-NEXT:    fcmla v2.8h, v0.8h, v1.8h, #90
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %a.real   = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %a.imag = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %b.real = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b.imag = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x half> %b.imag, %a.real
+  %1 = fmul fast <4 x half> %b.real, %a.imag
+  %2 = fadd fast <4 x half> %1, %0
+  %3 = fmul fast <4 x half> %b.real, %a.real
+  %4 = fmul fast <4 x half> %a.imag, %b.imag
+  %5 = fsub fast <4 x half> %3, %4
+  %interleaved.vec = shufflevector <4 x half> %5, <4 x half> %2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x half> %interleaved.vec
+}
+
+; Expected to transform
+define <16 x half> @complex_mul_v16f16(<16 x half> %a, <16 x half> %b) {
+; CHECK-LABEL: complex_mul_v16f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v4.8h, v0.8h, v2.8h, #0
+; CHECK-NEXT:    fcmla v5.8h, v1.8h, v3.8h, #0
+; CHECK-NEXT:    fcmla v4.8h, v0.8h, v2.8h, #90
+; CHECK-NEXT:    fcmla v5.8h, v1.8h, v3.8h, #90
+; CHECK-NEXT:    mov v0.16b, v4.16b
+; CHECK-NEXT:    mov v1.16b, v5.16b
+; CHECK-NEXT:    ret
+entry:
+  %a.real   = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %a.imag = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %b.real = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %b.imag = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %0 = fmul fast <8 x half> %b.imag, %a.real
+  %1 = fmul fast <8 x half> %b.real, %a.imag
+  %2 = fadd fast <8 x half> %1, %0
+  %3 = fmul fast <8 x half> %b.real, %a.real
+  %4 = fmul fast <8 x half> %a.imag, %b.imag
+  %5 = fsub fast <8 x half> %3, %4
+  %interleaved.vec = shufflevector <8 x half> %5, <8 x half> %2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <16 x half> %interleaved.vec
+}
+
+; Expected to transform
+define <32 x half> @complex_mul_v32f16(<32 x half> %a, <32 x half> %b) {
+; CHECK-LABEL: complex_mul_v32f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-NEXT:    movi v19.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v16.8h, v0.8h, v4.8h, #0
+; CHECK-NEXT:    fcmla v17.8h, v1.8h, v5.8h, #0
+; CHECK-NEXT:    fcmla v18.8h, v2.8h, v6.8h, #0
+; CHECK-NEXT:    fcmla v19.8h, v3.8h, v7.8h, #0
+; CHECK-NEXT:    fcmla v16.8h, v0.8h, v4.8h, #90
+; CHECK-NEXT:    fcmla v17.8h, v1.8h, v5.8h, #90
+; CHECK-NEXT:    fcmla v18.8h, v2.8h, v6.8h, #90
+; CHECK-NEXT:    fcmla v19.8h, v3.8h, v7.8h, #90
+; CHECK-NEXT:    mov v0.16b, v16.16b
+; CHECK-NEXT:    mov v1.16b, v17.16b
+; CHECK-NEXT:    mov v2.16b, v18.16b
+; CHECK-NEXT:    mov v3.16b, v19.16b
+; CHECK-NEXT:    ret
+entry:
+  %a.real   = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %a.imag = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %b.real = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %b.imag = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %0 = fmul fast <16 x half> %b.imag, %a.real
+  %1 = fmul fast <16 x half> %b.real, %a.imag
+  %2 = fadd fast <16 x half> %1, %0
+  %3 = fmul fast <16 x half> %b.real, %a.real
+  %4 = fmul fast <16 x half> %a.imag, %b.imag
+  %5 = fsub fast <16 x half> %3, %4
+  %interleaved.vec = shufflevector <16 x half> %5, <16 x half> %2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <32 x half> %interleaved.vec
+}

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll
new file mode 100644
index 000000000000..fce50b6c8ce4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+
+; Expected to transform
+define <2 x float> @complex_add_v2f32(<2 x float> %a, <2 x float> %b) {
+; CHECK-LABEL: complex_add_v2f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcadd v0.2s, v1.2s, v0.2s, #90
+; CHECK-NEXT:    ret
+entry:
+  %a.real = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> <i32 0>
+  %a.imag = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> <i32 1>
+  %b.real = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> <i32 0>
+  %b.imag = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> <i32 1>
+  %0 = fsub fast <1 x float> %b.real, %a.imag
+  %1 = fadd fast <1 x float> %b.imag, %a.real
+  %interleaved.vec = shufflevector <1 x float> %0, <1 x float> %1, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <4 x float> @complex_add_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: complex_add_v4f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcadd v0.4s, v1.4s, v0.4s, #90
+; CHECK-NEXT:    ret
+entry:
+  %a.real = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %0 = fsub fast <2 x float> %b.real, %a.imag
+  %1 = fadd fast <2 x float> %b.imag, %a.real
+  %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <8 x float> @complex_add_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: complex_add_v8f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcadd v0.4s, v2.4s, v0.4s, #90
+; CHECK-NEXT:    fcadd v1.4s, v3.4s, v1.4s, #90
+; CHECK-NEXT:    ret
+entry:
+  %a.real = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %a.imag = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %b.real = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b.imag = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fsub fast <4 x float> %b.real, %a.imag
+  %1 = fadd fast <4 x float> %b.imag, %a.real
+  %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <16 x float> @complex_add_v16f32(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: complex_add_v16f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcadd v2.4s, v6.4s, v2.4s, #90
+; CHECK-NEXT:    fcadd v0.4s, v4.4s, v0.4s, #90
+; CHECK-NEXT:    fcadd v1.4s, v5.4s, v1.4s, #90
+; CHECK-NEXT:    fcadd v3.4s, v7.4s, v3.4s, #90
+; CHECK-NEXT:    ret
+entry:
+  %a.real = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %a.imag = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %b.real = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %b.imag = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %0 = fsub fast <8 x float> %b.real, %a.imag
+  %1 = fadd fast <8 x float> %b.imag, %a.real
+  %interleaved.vec = shufflevector <8 x float> %0, <8 x float> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <16 x float> %interleaved.vec
+}

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll
new file mode 100644
index 000000000000..5cda0e3b39d9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul.ll
@@ -0,0 +1,116 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to transform
+define <2 x float> @complex_mul_v2f32(<2 x float> %a, <2 x float> %b) {
+; CHECK-LABEL: complex_mul_v2f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi d2, #0000000000000000
+; CHECK-NEXT:    fcmla v2.2s, v0.2s, v1.2s, #0
+; CHECK-NEXT:    fcmla v2.2s, v0.2s, v1.2s, #90
+; CHECK-NEXT:    fmov d0, d2
+; CHECK-NEXT:    ret
+entry:
+  %a.real   = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> <i32 0>
+  %a.imag = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> <i32 1>
+  %b.real = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> <i32 0>
+  %b.imag = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> <i32 1>
+  %0 = fmul fast <1 x float> %b.imag, %a.real
+  %1 = fmul fast <1 x float> %b.real, %a.imag
+  %2 = fadd fast <1 x float> %1, %0
+  %3 = fmul fast <1 x float> %b.real, %a.real
+  %4 = fmul fast <1 x float> %a.imag, %b.imag
+  %5 = fsub fast <1 x float> %3, %4
+  %interleaved.vec = shufflevector <1 x float> %5, <1 x float> %2, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <4 x float> @complex_mul_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: complex_mul_v4f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v2.4s, v0.4s, v1.4s, #0
+; CHECK-NEXT:    fcmla v2.4s, v0.4s, v1.4s, #90
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %a.real   = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %b.imag, %a.real
+  %1 = fmul fast <2 x float> %b.real, %a.imag
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %b.real, %a.real
+  %4 = fmul fast <2 x float> %a.imag, %b.imag
+  %5 = fsub fast <2 x float> %3, %4
+  %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <8 x float> @complex_mul_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: complex_mul_v8f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v4.4s, v0.4s, v2.4s, #0
+; CHECK-NEXT:    fcmla v5.4s, v1.4s, v3.4s, #0
+; CHECK-NEXT:    fcmla v4.4s, v0.4s, v2.4s, #90
+; CHECK-NEXT:    fcmla v5.4s, v1.4s, v3.4s, #90
+; CHECK-NEXT:    mov v0.16b, v4.16b
+; CHECK-NEXT:    mov v1.16b, v5.16b
+; CHECK-NEXT:    ret
+entry:
+  %a.real   = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %a.imag = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %b.real = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b.imag = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x float> %b.imag, %a.real
+  %1 = fmul fast <4 x float> %b.real, %a.imag
+  %2 = fadd fast <4 x float> %1, %0
+  %3 = fmul fast <4 x float> %b.real, %a.real
+  %4 = fmul fast <4 x float> %a.imag, %b.imag
+  %5 = fsub fast <4 x float> %3, %4
+  %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <16 x float> @complex_mul_v16f32(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: complex_mul_v16f32:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-NEXT:    movi v19.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v16.4s, v0.4s, v4.4s, #0
+; CHECK-NEXT:    fcmla v17.4s, v1.4s, v5.4s, #0
+; CHECK-NEXT:    fcmla v18.4s, v2.4s, v6.4s, #0
+; CHECK-NEXT:    fcmla v19.4s, v3.4s, v7.4s, #0
+; CHECK-NEXT:    fcmla v16.4s, v0.4s, v4.4s, #90
+; CHECK-NEXT:    fcmla v17.4s, v1.4s, v5.4s, #90
+; CHECK-NEXT:    fcmla v18.4s, v2.4s, v6.4s, #90
+; CHECK-NEXT:    fcmla v19.4s, v3.4s, v7.4s, #90
+; CHECK-NEXT:    mov v0.16b, v16.16b
+; CHECK-NEXT:    mov v1.16b, v17.16b
+; CHECK-NEXT:    mov v2.16b, v18.16b
+; CHECK-NEXT:    mov v3.16b, v19.16b
+; CHECK-NEXT:    ret
+entry:
+  %a.real   = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %a.imag = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %b.real = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %b.imag = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %0 = fmul fast <8 x float> %b.imag, %a.real
+  %1 = fmul fast <8 x float> %b.real, %a.imag
+  %2 = fadd fast <8 x float> %1, %0
+  %3 = fmul fast <8 x float> %b.real, %a.real
+  %4 = fmul fast <8 x float> %a.imag, %b.imag
+  %5 = fsub fast <8 x float> %3, %4
+  %interleaved.vec = shufflevector <8 x float> %5, <8 x float> %2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <16 x float> %interleaved.vec
+}

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll
new file mode 100644
index 000000000000..54c0fd98f4f4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+
+; Expected to transform: the deinterleaved fsub/fadd pair lowers to a single fcadd #90.
+define <2 x double> @complex_add_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: complex_add_v2f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcadd v0.2d, v1.2d, v0.2d, #90
+; CHECK-NEXT:    ret
+entry:
+  %a.real = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> <i32 0>
+  %a.imag = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> <i32 1>
+  %b.real = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> <i32 0>
+  %b.imag = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> <i32 1>
+  %0 = fsub fast <1 x double> %b.real, %a.imag
+  %1 = fadd fast <1 x double> %b.imag, %a.real
+  %interleaved.vec = shufflevector <1 x double> %0, <1 x double> %1, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %interleaved.vec
+}
+
+; Expected to transform: lowers to one fcadd #90 per 128-bit half.
+define <4 x double> @complex_add_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: complex_add_v4f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcadd v0.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT:    fcadd v1.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT:    ret
+entry:
+  %a.real = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %0 = fsub fast <2 x double> %b.real, %a.imag
+  %1 = fadd fast <2 x double> %b.imag, %a.real
+  %interleaved.vec = shufflevector <2 x double> %0, <2 x double> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+
+; Expected to transform: lowers to four fcadd #90 instructions, one per 128-bit quarter.
+define <8 x double> @complex_add_v8f64(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: complex_add_v8f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcadd v2.2d, v6.2d, v2.2d, #90
+; CHECK-NEXT:    fcadd v0.2d, v4.2d, v0.2d, #90
+; CHECK-NEXT:    fcadd v1.2d, v5.2d, v1.2d, #90
+; CHECK-NEXT:    fcadd v3.2d, v7.2d, v3.2d, #90
+; CHECK-NEXT:    ret
+entry:
+  %a.real = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %a.imag = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %b.real = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b.imag = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fsub fast <4 x double> %b.real, %a.imag
+  %1 = fadd fast <4 x double> %b.imag, %a.real
+  %interleaved.vec = shufflevector <4 x double> %0, <4 x double> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x double> %interleaved.vec
+}

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll
new file mode 100644
index 000000000000..b72d386be7d8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to transform: the complex multiply lowers to an fcmla #0 / fcmla #90 pair.
+define <2 x double> @complex_mul_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: complex_mul_v2f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v2.2d, v0.2d, v1.2d, #0
+; CHECK-NEXT:    fcmla v2.2d, v0.2d, v1.2d, #90
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %a.real   = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> <i32 0>
+  %a.imag = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> <i32 1>
+  %b.real = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> <i32 0>
+  %b.imag = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> <i32 1>
+  %0 = fmul fast <1 x double> %b.imag, %a.real
+  %1 = fmul fast <1 x double> %b.real, %a.imag
+  %2 = fadd fast <1 x double> %1, %0
+  %3 = fmul fast <1 x double> %b.real, %a.real
+  %4 = fmul fast <1 x double> %a.imag, %b.imag
+  %5 = fsub fast <1 x double> %3, %4
+  %interleaved.vec = shufflevector <1 x double> %5, <1 x double> %2, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %interleaved.vec
+}
+
+; Expected to transform: one fcmla #0 / fcmla #90 pair per 128-bit half.
+define <4 x double> @complex_mul_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: complex_mul_v4f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v4.2d, v0.2d, v2.2d, #0
+; CHECK-NEXT:    fcmla v5.2d, v1.2d, v3.2d, #0
+; CHECK-NEXT:    fcmla v4.2d, v0.2d, v2.2d, #90
+; CHECK-NEXT:    fcmla v5.2d, v1.2d, v3.2d, #90
+; CHECK-NEXT:    mov v0.16b, v4.16b
+; CHECK-NEXT:    mov v1.16b, v5.16b
+; CHECK-NEXT:    ret
+entry:
+  %a.real   = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x double> %b.imag, %a.real
+  %1 = fmul fast <2 x double> %b.real, %a.imag
+  %2 = fadd fast <2 x double> %1, %0
+  %3 = fmul fast <2 x double> %b.real, %a.real
+  %4 = fmul fast <2 x double> %a.imag, %b.imag
+  %5 = fsub fast <2 x double> %3, %4
+  %interleaved.vec = shufflevector <2 x double> %5, <2 x double> %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+
+; Expected to transform: one fcmla #0 / fcmla #90 pair per 128-bit quarter.
+define <8 x double> @complex_mul_v8f64(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: complex_mul_v8f64:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v16.2d, #0000000000000000
+; CHECK-NEXT:    movi v17.2d, #0000000000000000
+; CHECK-NEXT:    movi v18.2d, #0000000000000000
+; CHECK-NEXT:    movi v19.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v16.2d, v0.2d, v4.2d, #0
+; CHECK-NEXT:    fcmla v17.2d, v1.2d, v5.2d, #0
+; CHECK-NEXT:    fcmla v18.2d, v2.2d, v6.2d, #0
+; CHECK-NEXT:    fcmla v19.2d, v3.2d, v7.2d, #0
+; CHECK-NEXT:    fcmla v16.2d, v0.2d, v4.2d, #90
+; CHECK-NEXT:    fcmla v17.2d, v1.2d, v5.2d, #90
+; CHECK-NEXT:    fcmla v18.2d, v2.2d, v6.2d, #90
+; CHECK-NEXT:    fcmla v19.2d, v3.2d, v7.2d, #90
+; CHECK-NEXT:    mov v0.16b, v16.16b
+; CHECK-NEXT:    mov v1.16b, v17.16b
+; CHECK-NEXT:    mov v2.16b, v18.16b
+; CHECK-NEXT:    mov v3.16b, v19.16b
+; CHECK-NEXT:    ret
+entry:
+  %a.real   = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %a.imag = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %b.real = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b.imag = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x double> %b.imag, %a.real
+  %1 = fmul fast <4 x double> %b.real, %a.imag
+  %2 = fadd fast <4 x double> %1, %0
+  %3 = fmul fast <4 x double> %b.real, %a.real
+  %4 = fmul fast <4 x double> %a.imag, %b.imag
+  %5 = fsub fast <4 x double> %3, %4
+  %interleaved.vec = shufflevector <4 x double> %5, <4 x double> %2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x double> %interleaved.vec
+}

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
new file mode 100644
index 000000000000..4050172efd90
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
@@ -0,0 +1,363 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to transform: (a * b) * c - both chained complex multiplies lower to fcmla #0 / #90 pairs.
+define <4 x float> @mul_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_mul:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v4.4s, v0.4s, v1.4s, #0
+; CHECK-NEXT:    fcmla v4.4s, v0.4s, v1.4s, #90
+; CHECK-NEXT:    fcmla v3.4s, v4.4s, v2.4s, #0
+; CHECK-NEXT:    fcmla v3.4s, v4.4s, v2.4s, #90
+; CHECK-NEXT:    mov v0.16b, v3.16b
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec151 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec153 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec154 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec154, %strided.vec151
+  %1 = fmul fast <2 x float> %strided.vec153, %strided.vec
+  %2 = fmul fast <2 x float> %strided.vec154, %strided.vec
+  %3 = fmul fast <2 x float> %strided.vec153, %strided.vec151
+  %4 = fadd fast <2 x float> %3, %2
+  %5 = fsub fast <2 x float> %1, %0
+  %strided.vec156 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec157 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %6 = fmul fast <2 x float> %4, %strided.vec156
+  %7 = fmul fast <2 x float> %5, %strided.vec157
+  %8 = fadd fast <2 x float> %6, %7
+  %9 = fmul fast <2 x float> %strided.vec156, %5
+  %10 = fmul fast <2 x float> %4, %strided.vec157
+  %11 = fsub fast <2 x float> %9, %10
+  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform - no fcmla/fcadd is formed; the deinterleaving shuffles and scalar fmul/fmla sequence remain (presumably because the halves come from two different full-width fsub results - verify against the pass).
+define <4 x float> @add_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: add_mul:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fsub v0.4s, v1.4s, v0.4s
+; CHECK-NEXT:    fsub v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    zip2 v0.2s, v0.2s, v4.2s
+; CHECK-NEXT:    zip2 v4.2s, v2.2s, v3.2s
+; CHECK-NEXT:    zip1 v1.2s, v1.2s, v5.2s
+; CHECK-NEXT:    zip1 v2.2s, v2.2s, v3.2s
+; CHECK-NEXT:    fmul v5.2s, v4.2s, v0.2s
+; CHECK-NEXT:    fmul v3.2s, v1.2s, v4.2s
+; CHECK-NEXT:    fneg v4.2s, v5.2s
+; CHECK-NEXT:    fmla v3.2s, v0.2s, v2.2s
+; CHECK-NEXT:    fmla v4.2s, v1.2s, v2.2s
+; CHECK-NEXT:    zip2 v1.2s, v4.2s, v3.2s
+; CHECK-NEXT:    zip1 v0.2s, v4.2s, v3.2s
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
+entry:
+  %0 = fsub fast <4 x float> %b, %c
+  %1 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec58 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec59 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %2 = fmul fast <2 x float> %1, %strided.vec59
+  %3 = fsub fast <4 x float> %b, %a
+  %4 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %5 = fmul fast <2 x float> %strided.vec58, %4
+  %6 = fadd fast <2 x float> %5, %2
+  %7 = fmul fast <2 x float> %strided.vec58, %1
+  %8 = fmul fast <2 x float> %strided.vec59, %4
+  %9 = fsub fast <2 x float> %7, %8
+  %interleaved.vec = shufflevector <2 x float> %9, <2 x float> %6, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform - no fcmla/fcadd is formed; the deinterleave/fmul/fmla sequence is kept as-is.
+define <4 x float> @mul_mul270_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_mul270_mul:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    zip1 v5.2s, v2.2s, v3.2s
+; CHECK-NEXT:    zip2 v2.2s, v2.2s, v3.2s
+; CHECK-NEXT:    zip1 v6.2s, v1.2s, v4.2s
+; CHECK-NEXT:    zip2 v1.2s, v1.2s, v4.2s
+; CHECK-NEXT:    ext v3.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    fmul v7.2s, v6.2s, v5.2s
+; CHECK-NEXT:    fneg v4.2s, v7.2s
+; CHECK-NEXT:    zip2 v7.2s, v0.2s, v3.2s
+; CHECK-NEXT:    zip1 v0.2s, v0.2s, v3.2s
+; CHECK-NEXT:    fmla v4.2s, v2.2s, v1.2s
+; CHECK-NEXT:    fmul v1.2s, v1.2s, v5.2s
+; CHECK-NEXT:    fmul v3.2s, v4.2s, v7.2s
+; CHECK-NEXT:    fmla v1.2s, v2.2s, v6.2s
+; CHECK-NEXT:    fmul v2.2s, v4.2s, v0.2s
+; CHECK-NEXT:    fneg v3.2s, v3.2s
+; CHECK-NEXT:    fmla v2.2s, v7.2s, v1.2s
+; CHECK-NEXT:    fmla v3.2s, v0.2s, v1.2s
+; CHECK-NEXT:    zip2 v1.2s, v3.2s, v2.2s
+; CHECK-NEXT:    zip1 v0.2s, v3.2s, v2.2s
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec81 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec83 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec84 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec84, %strided.vec
+  %1 = fmul fast <2 x float> %strided.vec83, %strided.vec81
+  %2 = fadd fast <2 x float> %1, %0
+  %strided.vec86 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec87 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %3 = fmul fast <2 x float> %2, %strided.vec87
+  %4 = fmul fast <2 x float> %strided.vec84, %strided.vec81
+  %5 = fmul fast <2 x float> %strided.vec83, %strided.vec
+  %6 = fsub fast <2 x float> %4, %5
+  %7 = fmul fast <2 x float> %6, %strided.vec86
+  %8 = fadd fast <2 x float> %3, %7
+  %9 = fmul fast <2 x float> %2, %strided.vec86
+  %10 = fmul fast <2 x float> %6, %strided.vec87
+  %11 = fsub fast <2 x float> %9, %10
+  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; (a * b) * a
+; Expected to transform: both multiplies become fcmla #0 / #90 pairs, with the first result feeding the second.
+define <4 x float> @mul_triangle(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: mul_triangle:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v3.4s, v1.4s, v0.4s, #0
+; CHECK-NEXT:    fcmla v3.4s, v1.4s, v0.4s, #90
+; CHECK-NEXT:    fcmla v2.4s, v0.4s, v3.4s, #0
+; CHECK-NEXT:    fcmla v2.4s, v0.4s, v3.4s, #90
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec37, %strided.vec
+  %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35
+  %2 = fsub fast <2 x float> %0, %1
+  %3 = fmul fast <2 x float> %2, %strided.vec35
+  %4 = fmul fast <2 x float> %strided.vec38, %strided.vec
+  %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37
+  %6 = fadd fast <2 x float> %4, %5
+  %7 = fmul fast <2 x float> %6, %strided.vec
+  %8 = fadd fast <2 x float> %3, %7
+  %9 = fmul fast <2 x float> %2, %strided.vec
+  %10 = fmul fast <2 x float> %6, %strided.vec35
+  %11 = fsub fast <2 x float> %9, %10
+  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+
+; d * (b * a) * (c * a)
+; Expected to transform: all four complex multiplies in the diamond lower to fcmla #0 / #90 pairs.
+define <4 x float> @mul_diamond(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) {
+; CHECK-LABEL: mul_diamond:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    movi v5.2d, #0000000000000000
+; CHECK-NEXT:    movi v6.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v4.4s, v1.4s, v0.4s, #0
+; CHECK-NEXT:    fcmla v6.4s, v2.4s, v0.4s, #0
+; CHECK-NEXT:    fcmla v4.4s, v1.4s, v0.4s, #90
+; CHECK-NEXT:    movi v1.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v6.4s, v2.4s, v0.4s, #90
+; CHECK-NEXT:    fcmla v5.4s, v4.4s, v3.4s, #0
+; CHECK-NEXT:    fcmla v5.4s, v4.4s, v3.4s, #90
+; CHECK-NEXT:    fcmla v1.4s, v6.4s, v5.4s, #0
+; CHECK-NEXT:    fcmla v1.4s, v6.4s, v5.4s, #90
+; CHECK-NEXT:    mov v0.16b, v1.16b
+; CHECK-NEXT:    ret
+entry:
+  %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %d.real = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %d.imag = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %a.imag, %b.real
+  %1 = fmul fast <2 x float> %a.real, %b.imag
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %a.real, %b.real
+  %4 = fmul fast <2 x float> %b.imag, %a.imag
+  %5 = fsub fast <2 x float> %3, %4
+  %6 = fmul fast <2 x float> %d.real, %5
+  %7 = fmul fast <2 x float> %2, %d.imag
+  %8 = fmul fast <2 x float> %d.real, %2
+  %9 = fmul fast <2 x float> %5, %d.imag
+  %10 = fsub fast <2 x float> %6, %7
+  %11 = fadd fast <2 x float> %8, %9
+  %12 = fmul fast <2 x float> %c.real, %a.imag
+  %13 = fmul fast <2 x float> %c.imag, %a.real
+  %14 = fadd fast <2 x float> %13, %12
+  %15 = fmul fast <2 x float> %14, %10
+  %16 = fmul fast <2 x float> %c.real, %a.real
+  %17 = fmul fast <2 x float> %c.imag, %a.imag
+  %18 = fsub fast <2 x float> %16, %17
+  %19 = fmul fast <2 x float> %18, %11
+  %20 = fadd fast <2 x float> %15, %19
+  %21 = fmul fast <2 x float> %18, %10
+  %22 = fmul fast <2 x float> %14, %11
+  %23 = fsub fast <2 x float> %21, %22
+  %interleaved.vec = shufflevector <2 x float> %23, <2 x float> %20, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform: two fcmla-pair multiplies (a*b and a*c) combined by a single fcadd #90.
+define <4 x float> @mul_add90_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_add90_mul:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v3.4s, v1.4s, v0.4s, #0
+; CHECK-NEXT:    fcmla v4.4s, v2.4s, v0.4s, #0
+; CHECK-NEXT:    fcmla v3.4s, v1.4s, v0.4s, #90
+; CHECK-NEXT:    fcmla v4.4s, v2.4s, v0.4s, #90
+; CHECK-NEXT:    fcadd v0.4s, v4.4s, v3.4s, #90
+; CHECK-NEXT:    ret
+entry:
+  %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %bi = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+
+  %i6 = fmul fast <2 x float> %br, %ar
+  %i7 = fmul fast <2 x float> %bi, %ai
+  %xr = fsub fast <2 x float> %i6, %i7
+  %i9 = fmul fast <2 x float> %bi, %ar
+  %i10 = fmul fast <2 x float> %br, %ai
+  %xi = fadd fast <2 x float> %i9, %i10
+
+  %j6 = fmul fast <2 x float> %cr, %ar
+  %j7 = fmul fast <2 x float> %ci, %ai
+  %yr = fsub fast <2 x float> %j6, %j7
+  %j9 = fmul fast <2 x float> %ci, %ar
+  %j10 = fmul fast <2 x float> %cr, %ai
+  %yi = fadd fast <2 x float> %j9, %j10
+
+  %zr = fsub fast <2 x float> %yr, %xi
+  %zi = fadd fast <2 x float> %yi, %xr
+  %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform: the second multiply reuses partial products (%i6, %i9) of the first instead of computing its own, so no fcmla is formed.
+define <4 x float> @mul_triangle_addmul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_triangle_addmul:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    zip1 v5.2s, v1.2s, v3.2s
+; CHECK-NEXT:    zip2 v1.2s, v1.2s, v3.2s
+; CHECK-NEXT:    zip1 v6.2s, v0.2s, v4.2s
+; CHECK-NEXT:    zip2 v0.2s, v0.2s, v4.2s
+; CHECK-NEXT:    ext v3.16b, v2.16b, v2.16b, #8
+; CHECK-NEXT:    fmul v7.2s, v5.2s, v6.2s
+; CHECK-NEXT:    fmul v6.2s, v1.2s, v6.2s
+; CHECK-NEXT:    zip1 v4.2s, v2.2s, v3.2s
+; CHECK-NEXT:    zip2 v2.2s, v2.2s, v3.2s
+; CHECK-NEXT:    fmov d3, d7
+; CHECK-NEXT:    fmov d16, d6
+; CHECK-NEXT:    fmls v7.2s, v0.2s, v2.2s
+; CHECK-NEXT:    fmla v6.2s, v0.2s, v4.2s
+; CHECK-NEXT:    fmls v3.2s, v0.2s, v1.2s
+; CHECK-NEXT:    fmla v16.2s, v0.2s, v5.2s
+; CHECK-NEXT:    fsub v0.2s, v7.2s, v16.2s
+; CHECK-NEXT:    fadd v1.2s, v6.2s, v3.2s
+; CHECK-NEXT:    zip2 v2.2s, v0.2s, v1.2s
+; CHECK-NEXT:    zip1 v0.2s, v0.2s, v1.2s
+; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    ret
+entry:
+  %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %bi = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+
+  %i6 = fmul fast <2 x float> %br, %ar
+  %i7 = fmul fast <2 x float> %bi, %ai
+  %xr = fsub fast <2 x float> %i6, %i7
+  %i9 = fmul fast <2 x float> %bi, %ar
+  %i10 = fmul fast <2 x float> %br, %ai
+  %xi = fadd fast <2 x float> %i9, %i10
+
+  ;%j6 = fmul fast <2 x float> %cr, %ar
+  %j7 = fmul fast <2 x float> %ci, %ai
+  %yr = fsub fast <2 x float> %i6, %j7
+  ;%j9 = fmul fast <2 x float> %ci, %ar
+  %j10 = fmul fast <2 x float> %cr, %ai
+  %yi = fadd fast <2 x float> %i9, %j10
+
+  %zr = fsub fast <2 x float> %yr, %xi
+  %zi = fadd fast <2 x float> %yi, %xr
+  %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform: intermediate products %2 and %6 have an extra use (stored to %p via %otheruse), which blocks the rewrite.
+define <4 x float> @mul_triangle_multiuses(<4 x float> %a, <4 x float> %b, ptr %p) {
+; CHECK-LABEL: mul_triangle_multiuses:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    zip2 v4.2s, v0.2s, v2.2s
+; CHECK-NEXT:    zip1 v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    zip1 v5.2s, v1.2s, v3.2s
+; CHECK-NEXT:    zip2 v1.2s, v1.2s, v3.2s
+; CHECK-NEXT:    fmul v2.2s, v4.2s, v5.2s
+; CHECK-NEXT:    fmul v3.2s, v1.2s, v4.2s
+; CHECK-NEXT:    fmla v2.2s, v0.2s, v1.2s
+; CHECK-NEXT:    fneg v1.2s, v3.2s
+; CHECK-NEXT:    fmul v3.2s, v2.2s, v4.2s
+; CHECK-NEXT:    fmla v1.2s, v0.2s, v5.2s
+; CHECK-NEXT:    fmul v5.2s, v2.2s, v0.2s
+; CHECK-NEXT:    fneg v3.2s, v3.2s
+; CHECK-NEXT:    fmla v5.2s, v4.2s, v1.2s
+; CHECK-NEXT:    fmla v3.2s, v0.2s, v1.2s
+; CHECK-NEXT:    mov v1.d[1], v2.d[0]
+; CHECK-NEXT:    zip2 v4.2s, v3.2s, v5.2s
+; CHECK-NEXT:    zip1 v0.2s, v3.2s, v5.2s
+; CHECK-NEXT:    str q1, [x0]
+; CHECK-NEXT:    mov v0.d[1], v4.d[0]
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec37, %strided.vec
+  %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35
+  %2 = fsub fast <2 x float> %0, %1
+  %3 = fmul fast <2 x float> %2, %strided.vec35
+  %4 = fmul fast <2 x float> %strided.vec38, %strided.vec
+  %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37
+  %6 = fadd fast <2 x float> %4, %5
+  %otheruse = shufflevector <2 x float> %2, <2 x float> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x float> %otheruse, ptr %p
+  %7 = fmul fast <2 x float> %6, %strided.vec
+  %8 = fadd fast <2 x float> %3, %7
+  %9 = fmul fast <2 x float> %2, %strided.vec
+  %10 = fmul fast <2 x float> %6, %strided.vec35
+  %11 = fsub fast <2 x float> %9, %10
+  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}

diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
new file mode 100644
index 000000000000..d2ca1cb95ff7
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
@@ -0,0 +1,325 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to transform
+define <4 x float> @simple_mul(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: simple_mul:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v2.4s, v0.4s, v1.4s, #0
+; CHECK-NEXT:    fcmla v2.4s, v0.4s, v1.4s, #90
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec20, %strided.vec
+  %1 = fmul fast <2 x float> %strided.vec19, %strided.vec17
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %strided.vec19, %strided.vec
+  %4 = fmul fast <2 x float> %strided.vec17, %strided.vec20
+  %5 = fsub fast <2 x float> %3, %4
+  %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform
+define <4 x float> @simple_mul_no_contract(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: simple_mul_no_contract:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    zip1 v4.2s, v0.2s, v2.2s
+; CHECK-NEXT:    zip2 v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    zip2 v5.2s, v1.2s, v3.2s
+; CHECK-NEXT:    zip1 v1.2s, v1.2s, v3.2s
+; CHECK-NEXT:    fmul v2.2s, v5.2s, v4.2s
+; CHECK-NEXT:    fmul v3.2s, v1.2s, v4.2s
+; CHECK-NEXT:    fmul v4.2s, v0.2s, v5.2s
+; CHECK-NEXT:    fmla v2.2s, v0.2s, v1.2s
+; CHECK-NEXT:    fsub v0.2s, v3.2s, v4.2s
+; CHECK-NEXT:    zip2 v1.2s, v0.2s, v2.2s
+; CHECK-NEXT:    zip1 v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    mov v0.d[1], v1.d[0]
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec20, %strided.vec
+  %1 = fmul fast <2 x float> %strided.vec19, %strided.vec17
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %strided.vec19, %strided.vec
+  %4 = fmul fast <2 x float> %strided.vec17, %strided.vec20
+  %5 = fsub <2 x float> %3, %4
+  %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <4 x float> @three_way_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: three_way_mul:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v4.2d, #0000000000000000
+; CHECK-NEXT:    movi v3.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v4.4s, v1.4s, v0.4s, #0
+; CHECK-NEXT:    fcmla v4.4s, v1.4s, v0.4s, #90
+; CHECK-NEXT:    fcmla v3.4s, v2.4s, v4.4s, #0
+; CHECK-NEXT:    fcmla v3.4s, v2.4s, v4.4s, #90
+; CHECK-NEXT:    mov v0.16b, v3.16b
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec39 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec41 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec42 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec44 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec45 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec41, %strided.vec
+  %1 = fmul fast <2 x float> %strided.vec42, %strided.vec39
+  %2 = fsub fast <2 x float> %0, %1
+  %3 = fmul fast <2 x float> %2, %strided.vec45
+  %4 = fmul fast <2 x float> %strided.vec42, %strided.vec
+  %5 = fmul fast <2 x float> %strided.vec39, %strided.vec41
+  %6 = fadd fast <2 x float> %4, %5
+  %7 = fmul fast <2 x float> %6, %strided.vec44
+  %8 = fadd fast <2 x float> %3, %7
+  %9 = fmul fast <2 x float> %2, %strided.vec44
+  %10 = fmul fast <2 x float> %6, %strided.vec45
+  %11 = fsub fast <2 x float> %9, %10
+  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <4 x float> @simple_add_90(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: simple_add_90:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcadd v0.4s, v1.4s, v0.4s, #90
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fsub fast <2 x float> %strided.vec19, %strided.vec17
+  %1 = fadd fast <2 x float> %strided.vec20, %strided.vec
+  %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform, fadd commutativity is not yet implemented
+define <4 x float> @simple_add_270_false(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: simple_add_270_false:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    zip1 v4.2s, v0.2s, v2.2s
+; CHECK-NEXT:    zip2 v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    zip1 v2.2s, v1.2s, v3.2s
+; CHECK-NEXT:    zip2 v1.2s, v1.2s, v3.2s
+; CHECK-NEXT:    fadd v1.2s, v1.2s, v4.2s
+; CHECK-NEXT:    fsub v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    zip2 v2.2s, v1.2s, v0.2s
+; CHECK-NEXT:    zip1 v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fadd fast <2 x float> %strided.vec20, %strided.vec
+  %1 = fsub fast <2 x float> %strided.vec17, %strided.vec19
+  %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <4 x float> @simple_add_270_true(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: simple_add_270_true:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcadd v0.4s, v0.4s, v1.4s, #270
+; CHECK-NEXT:    ret
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fadd fast <2 x float> %strided.vec, %strided.vec20
+  %1 = fsub fast <2 x float> %strided.vec17, %strided.vec19
+  %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform
+define <4 x float> @add_external_use(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: add_external_use:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ext v2.16b, v0.16b, v0.16b, #8
+; CHECK-NEXT:    ext v3.16b, v1.16b, v1.16b, #8
+; CHECK-NEXT:    zip1 v4.2s, v0.2s, v2.2s
+; CHECK-NEXT:    zip2 v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    zip1 v2.2s, v1.2s, v3.2s
+; CHECK-NEXT:    zip2 v1.2s, v1.2s, v3.2s
+; CHECK-NEXT:    fsub v1.2s, v4.2s, v1.2s
+; CHECK-NEXT:    fadd v0.2s, v0.2s, v2.2s
+; CHECK-NEXT:    zip2 v2.2s, v1.2s, v0.2s
+; CHECK-NEXT:    zip1 v0.2s, v1.2s, v0.2s
+; CHECK-NEXT:    mov v0.d[1], v2.d[0]
+; CHECK-NEXT:    ret
+entry:
+  %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fsub fast <2 x float> %a.real, %b.imag
+  %1 = fadd fast <2 x float> %a.imag, %b.real
+  %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  %dup = shufflevector <2 x float> %0, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %interleaved.vec2 = shufflevector <4 x float> %interleaved.vec, <4 x float> %dup, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %interleaved.vec2
+}
+
+; Expected to transform
+define <4 x float> @mul_mul_with_fneg(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: mul_mul_with_fneg:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    movi v2.2d, #0000000000000000
+; CHECK-NEXT:    fcmla v2.4s, v1.4s, v0.4s, #270
+; CHECK-NEXT:    fcmla v2.4s, v1.4s, v0.4s, #180
+; CHECK-NEXT:    mov v0.16b, v2.16b
+; CHECK-NEXT:    ret
+entry:
+  %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fneg fast <2 x float> %a.imag
+  %1 = fmul fast <2 x float> %b.real, %0
+  %2 = fmul fast <2 x float> %a.real, %b.imag
+  %3 = fsub fast <2 x float> %1, %2
+  %4 = fmul fast <2 x float> %b.imag, %a.imag
+  %5 = fmul fast <2 x float> %a.real, %b.real
+  %6 = fsub fast <2 x float> %4, %5
+  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %3, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform
+define <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) {
+; CHECK-LABEL: abp90c12:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr s21, [sp, #32]
+; CHECK-NEXT:    add x9, sp, #48
+; CHECK-NEXT:    // kill: def $s2 killed $s2 def $q2
+; CHECK-NEXT:    // kill: def $s0 killed $s0 def $q0
+; CHECK-NEXT:    ldr s23, [sp, #40]
+; CHECK-NEXT:    add x11, sp, #56
+; CHECK-NEXT:    mov v0.s[1], v2.s[0]
+; CHECK-NEXT:    ldr s2, [sp]
+; CHECK-NEXT:    add x10, sp, #16
+; CHECK-NEXT:    ld1 { v21.s }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #64
+; CHECK-NEXT:    ld1 { v23.s }[1], [x11]
+; CHECK-NEXT:    // kill: def $s1 killed $s1 def $q1
+; CHECK-NEXT:    // kill: def $s3 killed $s3 def $q3
+; CHECK-NEXT:    ldr s22, [sp, #96]
+; CHECK-NEXT:    add x11, sp, #24
+; CHECK-NEXT:    ld1 { v2.s }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #72
+; CHECK-NEXT:    mov v1.s[1], v3.s[0]
+; CHECK-NEXT:    ld1 { v21.s }[2], [x9]
+; CHECK-NEXT:    ldr s24, [sp, #8]
+; CHECK-NEXT:    add x9, sp, #112
+; CHECK-NEXT:    ld1 { v23.s }[2], [x10]
+; CHECK-NEXT:    add x10, sp, #80
+; CHECK-NEXT:    // kill: def $s5 killed $s5 def $q5
+; CHECK-NEXT:    ldr s18, [sp, #128]
+; CHECK-NEXT:    // kill: def $s7 killed $s7 def $q7
+; CHECK-NEXT:    // kill: def $s4 killed $s4 def $q4
+; CHECK-NEXT:    // kill: def $s6 killed $s6 def $q6
+; CHECK-NEXT:    mov v1.s[2], v5.s[0]
+; CHECK-NEXT:    ldr s20, [sp, #104]
+; CHECK-NEXT:    ld1 { v24.s }[1], [x11]
+; CHECK-NEXT:    add x11, sp, #88
+; CHECK-NEXT:    ld1 { v22.s }[1], [x9]
+; CHECK-NEXT:    add x9, sp, #144
+; CHECK-NEXT:    ld1 { v21.s }[3], [x10]
+; CHECK-NEXT:    add x10, sp, #120
+; CHECK-NEXT:    mov v0.s[2], v4.s[0]
+; CHECK-NEXT:    ld1 { v23.s }[3], [x11]
+; CHECK-NEXT:    ld1 { v18.s }[1], [x9]
+; CHECK-NEXT:    add x11, sp, #152
+; CHECK-NEXT:    ld1 { v20.s }[1], [x10]
+; CHECK-NEXT:    add x10, sp, #160
+; CHECK-NEXT:    mov v1.s[3], v7.s[0]
+; CHECK-NEXT:    ldr s17, [sp, #136]
+; CHECK-NEXT:    ldr s19, [sp, #192]
+; CHECK-NEXT:    add x9, sp, #208
+; CHECK-NEXT:    mov v0.s[3], v6.s[0]
+; CHECK-NEXT:    ld1 { v18.s }[2], [x10]
+; CHECK-NEXT:    ld1 { v17.s }[1], [x11]
+; CHECK-NEXT:    add x10, sp, #176
+; CHECK-NEXT:    fmul v3.4s, v23.4s, v1.4s
+; CHECK-NEXT:    ld1 { v19.s }[1], [x9]
+; CHECK-NEXT:    fmul v4.4s, v20.4s, v24.4s
+; CHECK-NEXT:    add x9, sp, #168
+; CHECK-NEXT:    fmul v1.4s, v21.4s, v1.4s
+; CHECK-NEXT:    ld1 { v18.s }[3], [x10]
+; CHECK-NEXT:    fmul v5.4s, v22.4s, v24.4s
+; CHECK-NEXT:    ldr s16, [sp, #200]
+; CHECK-NEXT:    ld1 { v17.s }[2], [x9]
+; CHECK-NEXT:    add x11, sp, #216
+; CHECK-NEXT:    fneg v3.4s, v3.4s
+; CHECK-NEXT:    add x9, sp, #184
+; CHECK-NEXT:    fneg v4.4s, v4.4s
+; CHECK-NEXT:    fmla v1.4s, v0.4s, v23.4s
+; CHECK-NEXT:    fmla v5.4s, v2.4s, v20.4s
+; CHECK-NEXT:    ld1 { v16.s }[1], [x11]
+; CHECK-NEXT:    ld1 { v17.s }[3], [x9]
+; CHECK-NEXT:    fmla v3.4s, v0.4s, v21.4s
+; CHECK-NEXT:    fmla v4.4s, v2.4s, v22.4s
+; CHECK-NEXT:    fsub v0.4s, v18.4s, v1.4s
+; CHECK-NEXT:    fsub v1.4s, v19.4s, v5.4s
+; CHECK-NEXT:    fadd v2.4s, v17.4s, v3.4s
+; CHECK-NEXT:    fadd v3.4s, v16.4s, v4.4s
+; CHECK-NEXT:    ext v4.16b, v0.16b, v1.16b, #12
+; CHECK-NEXT:    ext v5.16b, v2.16b, v3.16b, #12
+; CHECK-NEXT:    trn2 v1.4s, v1.4s, v3.4s
+; CHECK-NEXT:    ext v4.16b, v0.16b, v4.16b, #12
+; CHECK-NEXT:    zip2 v3.4s, v0.4s, v2.4s
+; CHECK-NEXT:    ext v5.16b, v2.16b, v5.16b, #8
+; CHECK-NEXT:    zip1 v0.4s, v0.4s, v2.4s
+; CHECK-NEXT:    rev64 v4.4s, v4.4s
+; CHECK-NEXT:    str q0, [x8]
+; CHECK-NEXT:    trn2 v4.4s, v4.4s, v5.4s
+; CHECK-NEXT:    ext v1.16b, v4.16b, v1.16b, #8
+; CHECK-NEXT:    mov v3.d[1], v4.d[0]
+; CHECK-NEXT:    stp q3, q1, [x8, #16]
+; CHECK-NEXT:    ret
+entry:
+  %ar = shufflevector <12 x float> %a, <12 x float> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10>
+  %ai = shufflevector <12 x float> %a, <12 x float> poison, <6 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
+  %br = shufflevector <12 x float> %b, <12 x float> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10>
+  %bi = shufflevector <12 x float> %b, <12 x float> poison, <6 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
+  %cr = shufflevector <12 x float> %c, <12 x float> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10>
+  %ci = shufflevector <12 x float> %c, <12 x float> poison, <6 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
+
+  %i6 = fmul fast <6 x float> %br, %ar
+  %i7 = fmul fast <6 x float> %bi, %ai
+  %xr = fsub fast <6 x float> %i6, %i7
+  %i9 = fmul fast <6 x float> %bi, %ar
+  %i10 = fmul fast <6 x float> %br, %ai
+  %xi = fadd fast <6 x float> %i9, %i10
+
+  %zr = fsub fast <6 x float> %cr, %xi
+  %zi = fadd fast <6 x float> %ci, %xr
+  %interleaved.vec = shufflevector <6 x float> %zr, <6 x float> %zi, <12 x i32> <i32 0, i32 6, i32 1, i32 7, i32 2, i32 8, i32 3, i32 9, i32 4, i32 10, i32 5, i32 11>
+  ret <12 x float> %interleaved.vec
+}


        


More information about the llvm-commits mailing list