[llvm] d52e283 - [ARM][CodeGen] Add support for complex deinterleaving

Nicholas Guy via llvm-commits llvm-commits at lists.llvm.org
Mon Nov 14 06:04:49 PST 2022


Author: Nicholas Guy
Date: 2022-11-14T14:02:27Z
New Revision: d52e2839f3b1a21d4a6090ccff6f4b7f1f89a1b3

URL: https://github.com/llvm/llvm-project/commit/d52e2839f3b1a21d4a6090ccff6f4b7f1f89a1b3
DIFF: https://github.com/llvm/llvm-project/commit/d52e2839f3b1a21d4a6090ccff6f4b7f1f89a1b3.diff

LOG: [ARM][CodeGen] Add support for complex deinterleaving

Adds the Complex Deinterleaving Pass, which implements support for complex number arithmetic in a target-independent manner, deferring to the given target's TargetLowering to create a target-specific intrinsic.
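
For example, an interleaved complex addition that reaches the pass as
deinterleaving shuffles, component-wise arithmetic, and a reinterleaving
shuffle (a reduced sketch of the patterns in the added tests; value names are
illustrative):

  %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
  %0 = fsub fast <2 x float> %b.real, %a.imag
  %1 = fadd fast <2 x float> %b.imag, %a.real
  %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>

is recognised by the pass and, on an MVE target, lowered to a single
vcadd.f32 with a #90 rotation.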

Differential Revision: https://reviews.llvm.org/D114174

Added: 
    llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
    llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
    llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-add.ll
    llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll
    llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-add.ll
    llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-mul.ll
    llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-add.ll
    llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-mul.ll
    llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
    llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll

Modified: 
    llvm/include/llvm/CodeGen/Passes.h
    llvm/include/llvm/CodeGen/TargetLowering.h
    llvm/include/llvm/InitializePasses.h
    llvm/lib/CodeGen/CMakeLists.txt
    llvm/lib/Target/ARM/ARMISelLowering.cpp
    llvm/lib/Target/ARM/ARMISelLowering.h
    llvm/lib/Target/ARM/ARMTargetMachine.cpp
    llvm/test/CodeGen/ARM/O3-pipeline.ll

Removed: 
    


################################################################################
diff  --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
new file mode 100644
index 0000000000000..99df6e5ad1d71
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -0,0 +1,53 @@
+//===- ComplexDeinterleavingPass.h - Complex Deinterleaving Pass *- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements generation of target-specific intrinsics to support
+// handling of complex number arithmetic and deinterleaving.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H
+#define LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H
+
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+
+namespace llvm {
+
+class Function;
+class TargetMachine;
+
+struct ComplexDeinterleavingPass
+    : public PassInfoMixin<ComplexDeinterleavingPass> {
+private:
+  TargetMachine *TM;
+
+public:
+  ComplexDeinterleavingPass(TargetMachine *TM) : TM(TM) {}
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+enum class ComplexDeinterleavingOperation {
+  CAdd,
+  CMulPartial,
+  // The following 'operations' are used to represent internal states. Backends
+  // are not expected to try to support these in any capacity.
+  Shuffle
+};
+
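+// Rotations are expressed in multiples of 90 degrees, matching the rotation
+// immediates of target complex instructions such as Arm's VCADD and VCMLA.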
+enum class ComplexDeinterleavingRotation {
+  Rotation_0 = 0,
+  Rotation_90 = 1,
+  Rotation_180 = 2,
+  Rotation_270 = 3,
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_COMPLEXDEINTERLEAVING_H

diff  --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 2ec803aaa66c5..5fcbc8ed9abe1 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -79,6 +79,10 @@ namespace llvm {
   /// matching during instruction selection.
   FunctionPass *createCodeGenPreparePass();
 
+  /// This pass implements generation of target-specific intrinsics to support
+  /// handling of complex number arithmetic.
+  FunctionPass *createComplexDeinterleavingPass(const TargetMachine *TM);
+
   /// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg
   /// load-linked/store-conditional loops.
   extern char &AtomicExpandID;

diff  --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index d0a7375c6a3b8..d688c5e41b949 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -27,6 +27,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/LowLevelType.h"
@@ -3103,6 +3104,26 @@ class TargetLoweringBase {
     return isOperationLegalOrCustom(Op, VT);
   }
 
+  /// Does this target support complex deinterleaving?
+  virtual bool isComplexDeinterleavingSupported() const { return false; }
+
+  /// Does this target support complex deinterleaving with the given operation
+  /// and type?
+  virtual bool isComplexDeinterleavingOperationSupported(
+      ComplexDeinterleavingOperation Operation, Type *Ty) const {
+    return false;
+  }
+
+  /// Create the IR node for the given complex deinterleaving operation.
+  /// If one cannot be created using all the given inputs, nullptr should be
+  /// returned.
+  virtual Value *createComplexDeinterleavingIR(
+      Instruction *I, ComplexDeinterleavingOperation OperationType,
+      ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
+      Value *Accumulator = nullptr) const {
+    return nullptr;
+  }
+
   //===--------------------------------------------------------------------===//
   // Runtime Library hooks
   //

diff  --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 6d389d432998f..ff8aa807dfaad 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -102,6 +102,7 @@ void initializeCallSiteSplittingLegacyPassPass(PassRegistry&);
 void initializeCalledValuePropagationLegacyPassPass(PassRegistry &);
 void initializeCheckDebugMachineModulePass(PassRegistry &);
 void initializeCodeGenPreparePass(PassRegistry&);
+void initializeComplexDeinterleavingLegacyPassPass(PassRegistry&);
 void initializeConstantHoistingLegacyPassPass(PassRegistry&);
 void initializeConstantMergeLegacyPassPass(PassRegistry&);
 void initializeConstraintEliminationPass(PassRegistry &);

diff  --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt
index ed83333bade54..2289ea5de82e3 100644
--- a/llvm/lib/CodeGen/CMakeLists.txt
+++ b/llvm/lib/CodeGen/CMakeLists.txt
@@ -43,6 +43,7 @@ add_llvm_component_library(LLVMCodeGen
   CodeGenPassBuilder.cpp
   CodeGenPrepare.cpp
   CommandFlags.cpp
+  ComplexDeinterleavingPass.cpp
   CriticalAntiDepBreaker.cpp
   DeadMachineInstructionElim.cpp
   DetectDeadLanes.cpp

diff  --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
new file mode 100644
index 0000000000000..c2a05cb81f9aa
--- /dev/null
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -0,0 +1,877 @@
+//===- ComplexDeinterleavingPass.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Identification:
+// This step is responsible for finding the patterns that can be lowered to
+// complex instructions, and building a graph to represent the complex
+// structures. Starting from the "Converging Shuffle" (a shuffle that
+// reinterleaves the complex components, with a mask of <0, 2, 1, 3>), the
+// operands are evaluated and identified as "Composite Nodes" (collections of
+// instructions that can potentially be lowered to a single complex
+// instruction). This is performed by checking the real and imaginary components
+// and tracking the data flow for each component while following the operand
+// pairs. Each node is validated upon creation, and any validation error
+// halts traversal and prevents further graph construction.
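+//
+// For example, identification of a four-element case would begin at a
+// converging shuffle such as
+//   %vec = shufflevector <2 x float> %real, <2 x float> %imag,
+//                        <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+// and walk %real and %imag as the real/imaginary component pair of a
+// potential composite node.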
+//
+// Replacement:
+// This step traverses the graph built up by identification, delegating to the
+// target to validate and generate the correct intrinsics, and plumbs them
+// together connecting each end of the new intrinsics graph to the existing
+// use-def chain. This step is assumed to finish successfully, as all
+// information is expected to be correct by this point.
+//
+//
+// Internal data structure:
+// ComplexDeinterleavingGraph:
+// Keeps references to all the valid CompositeNodes formed as part of the
+// transformation, and every Instruction contained within said nodes. It also
+// holds onto a reference to the root Instruction, and the root node that should
+// replace it.
+//
+// ComplexDeinterleavingCompositeNode:
+// A CompositeNode represents a single transformation point; each node should
+// transform into a single complex instruction (ignoring vector splitting, which
+// would generate more instructions per node). They are identified in a
+// depth-first manner, traversing and identifying the operands of each
+// instruction in the order they appear in the IR.
+// Each node maintains a reference to its Real and Imaginary instructions,
+// as well as any additional instructions that make up the identified operation
+// (Internal instructions should only have uses within their containing node).
+// A Node also contains the rotation and operation type that it represents.
+// Operands contains pointers to other CompositeNodes, acting as the edges in
+// the graph. ReplacementNode is the transformed Value* that has been emitted
+// to the IR.
+//
+// Note: If the operation of a Node is Shuffle, only the Real, Imaginary, and
+// ReplacementNode fields of that Node are relevant, where the ReplacementNode
+// should be pre-populated.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ComplexDeinterleavingPass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "complex-deinterleaving"
+
+STATISTIC(NumComplexTransformations, "Number of complex patterns transformed");
+
+static cl::opt<bool> ComplexDeinterleavingEnabled(
+    "enable-complex-deinterleaving",
+    cl::desc("Enable generation of complex instructions"), cl::init(true),
+    cl::Hidden);
+
+/// Determines whether the given mask is an interleaving mask.
+///
+/// To be interleaving, a mask must alternate between `i` and `i + (Length /
+/// 2)`, and must contain all numbers within the range of `[0..Length)` (e.g. a
+/// 4x vector interleaving mask would be <0, 2, 1, 3>).
+static bool isInterleavingMask(ArrayRef<int> Mask);
+
+/// Determines whether the given mask is a deinterleaving mask.
+///
+/// To be deinterleaving, a mask must increment in steps of 2, and either start
+/// with 0 or 1.
+/// (e.g. an 8x vector deinterleaving mask would be either <0, 2, 4, 6> or
+/// <1, 3, 5, 7>).
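+/// A mask starting at 0 extracts the real components of an interleaved
+/// complex vector, while a mask starting at 1 extracts the imaginary ones.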
+static bool isDeinterleavingMask(ArrayRef<int> Mask);
+
+namespace {
+
+class ComplexDeinterleavingLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  ComplexDeinterleavingLegacyPass(const TargetMachine *TM = nullptr)
+      : FunctionPass(ID), TM(TM) {
+    initializeComplexDeinterleavingLegacyPassPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override {
+    return "Complex Deinterleaving Pass";
+  }
+
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+
+private:
+  const TargetMachine *TM;
+};
+
+class ComplexDeinterleavingGraph;
+struct ComplexDeinterleavingCompositeNode {
+
+  ComplexDeinterleavingCompositeNode(ComplexDeinterleavingOperation Op,
+                                     Instruction *R, Instruction *I)
+      : Operation(Op), Real(R), Imag(I) {}
+
+private:
+  friend class ComplexDeinterleavingGraph;
+  using NodePtr = std::shared_ptr<ComplexDeinterleavingCompositeNode>;
+  using RawNodePtr = ComplexDeinterleavingCompositeNode *;
+
+public:
+  ComplexDeinterleavingOperation Operation;
+  Instruction *Real;
+  Instruction *Imag;
+
+  // Instructions that should only exist within this node; there should be no
+  // users of these instructions outside the node. An example of these would be
+  // the multiply instructions of a partial multiply operation.
+  SmallVector<Instruction *> InternalInstructions;
+  ComplexDeinterleavingRotation Rotation;
+  SmallVector<RawNodePtr> Operands;
+  Value *ReplacementNode = nullptr;
+
+  void addInstruction(Instruction *I) { InternalInstructions.push_back(I); }
+  void addOperand(NodePtr Node) { Operands.push_back(Node.get()); }
+
+  bool hasAllInternalUses(SmallPtrSet<Instruction *, 16> &AllInstructions);
+
+  void dump() { dump(dbgs()); }
+  void dump(raw_ostream &OS) {
+    auto PrintValue = [&](Value *V) {
+      if (V) {
+        OS << "\"";
+        V->print(OS, true);
+        OS << "\"\n";
+      } else
+        OS << "nullptr\n";
+    };
+    auto PrintNodeRef = [&](RawNodePtr Ptr) {
+      if (Ptr)
+        OS << Ptr << "\n";
+      else
+        OS << "nullptr\n";
+    };
+
+    OS << "- CompositeNode: " << this << "\n";
+    OS << "  Real: ";
+    PrintValue(Real);
+    OS << "  Imag: ";
+    PrintValue(Imag);
+    OS << "  ReplacementNode: ";
+    PrintValue(ReplacementNode);
+    OS << "  Operation: " << (int)Operation << "\n";
+    OS << "  Rotation: " << ((int)Rotation * 90) << "\n";
+    OS << "  Operands: \n";
+    for (const auto &Op : Operands) {
+      OS << "    - ";
+      PrintNodeRef(Op);
+    }
+    OS << "  InternalInstructions:\n";
+    for (const auto &I : InternalInstructions) {
+      OS << "    - \"";
+      I->print(OS, true);
+      OS << "\"\n";
+    }
+  }
+};
+
+class ComplexDeinterleavingGraph {
+public:
+  using NodePtr = ComplexDeinterleavingCompositeNode::NodePtr;
+  using RawNodePtr = ComplexDeinterleavingCompositeNode::RawNodePtr;
+  explicit ComplexDeinterleavingGraph(const TargetLowering *tl) : TL(tl) {}
+
+private:
+  const TargetLowering *TL;
+  Instruction *RootValue;
+  NodePtr RootNode;
+  SmallVector<NodePtr> CompositeNodes;
+  SmallPtrSet<Instruction *, 16> AllInstructions;
+
+  NodePtr prepareCompositeNode(ComplexDeinterleavingOperation Operation,
+                               Instruction *R, Instruction *I) {
+    return std::make_shared<ComplexDeinterleavingCompositeNode>(Operation, R,
+                                                                I);
+  }
+
+  NodePtr submitCompositeNode(NodePtr Node) {
+    CompositeNodes.push_back(Node);
+    AllInstructions.insert(Node->Real);
+    AllInstructions.insert(Node->Imag);
+    for (auto *I : Node->InternalInstructions)
+      AllInstructions.insert(I);
+    return Node;
+  }
+
+  NodePtr getContainingComposite(Value *R, Value *I) {
+    for (const auto &CN : CompositeNodes) {
+      if (CN->Real == R && CN->Imag == I)
+        return CN;
+    }
+    return nullptr;
+  }
+
+  /// Identifies a complex partial multiply pattern and its rotation, based on
+  /// the following patterns
+  ///
+  ///  0:  r: cr + ar * br
+  ///      i: ci + ar * bi
+  /// 90:  r: cr - ai * bi
+  ///      i: ci + ai * br
+  /// 180: r: cr - ar * br
+  ///      i: ci - ar * bi
+  /// 270: r: cr + ai * bi
+  ///      i: ci - ai * br
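+  ///
+  /// A full complex multiply, (ar*br - ai*bi) + (ar*bi + ai*br)i, is matched
+  /// as two such partial multiplies chained through their accumulators; on
+  /// MVE this lowers to a rotation-0 VCMUL followed by a rotation-90 VCMLA,
+  /// as exercised by the added mve-complex-deinterleaving-*-mul.ll tests.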
+  NodePtr identifyPartialMul(Instruction *Real, Instruction *Imag);
+
+  /// Identify the other branch of a Partial Mul, taking the CommonOperandI that
+  /// is partially known from identifyPartialMul, filling in the other half of
+  /// the complex pair.
+  NodePtr identifyNodeWithImplicitAdd(
+      Instruction *I, Instruction *J,
+      std::pair<Instruction *, Instruction *> &CommonOperandI);
+
+  /// Identifies a complex add pattern and its rotation, based on the following
+  /// patterns.
+  ///
+  /// 90:  r: ar - bi
+  ///      i: ai + br
+  /// 270: r: ar + bi
+  ///      i: ai - br
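+  ///
+  /// These correspond to a + i*b (Rotation_90) and a - i*b (Rotation_270),
+  /// matching the semantics of instructions such as Arm's VCADD #90 and
+  /// VCADD #270.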
+  NodePtr identifyAdd(Instruction *Real, Instruction *Imag);
+
+  NodePtr identifyNode(Instruction *I, Instruction *J);
+
+  Value *replaceNode(RawNodePtr Node);
+
+public:
+  void dump() { dump(dbgs()); }
+  void dump(raw_ostream &OS) {
+    for (const auto &Node : CompositeNodes)
+      Node->dump(OS);
+  }
+
+  /// Returns false if the deinterleaving operation should be cancelled for the
+  /// current graph.
+  bool identifyNodes(Instruction *RootI);
+
+  /// Perform the actual replacement of the underlying instruction graph.
+  void replaceNodes();
+};
+
+class ComplexDeinterleaving {
+public:
+  ComplexDeinterleaving(const TargetLowering *tl, const TargetLibraryInfo *tli)
+      : TL(tl), TLI(tli) {}
+  bool runOnFunction(Function &F);
+
+private:
+  bool evaluateBasicBlock(BasicBlock *B);
+
+  const TargetLowering *TL = nullptr;
+  const TargetLibraryInfo *TLI = nullptr;
+};
+
+} // namespace
+
+char ComplexDeinterleavingLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ComplexDeinterleavingLegacyPass, DEBUG_TYPE,
+                      "Complex Deinterleaving", false, false)
+INITIALIZE_PASS_END(ComplexDeinterleavingLegacyPass, DEBUG_TYPE,
+                    "Complex Deinterleaving", false, false)
+
+PreservedAnalyses ComplexDeinterleavingPass::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+  const TargetLowering *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+  auto &TLI = AM.getResult<llvm::TargetLibraryAnalysis>(F);
+  if (!ComplexDeinterleaving(TL, &TLI).runOnFunction(F))
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserve<FunctionAnalysisManagerModuleProxy>();
+  return PA;
+}
+
+FunctionPass *llvm::createComplexDeinterleavingPass(const TargetMachine *TM) {
+  return new ComplexDeinterleavingLegacyPass(TM);
+}
+
+bool ComplexDeinterleavingLegacyPass::runOnFunction(Function &F) {
+  const auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+  auto TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+  return ComplexDeinterleaving(TL, &TLI).runOnFunction(F);
+}
+
+bool ComplexDeinterleaving::runOnFunction(Function &F) {
+  if (!ComplexDeinterleavingEnabled) {
+    LLVM_DEBUG(
+        dbgs() << "Complex deinterleaving has been explicitly disabled.\n");
+    return false;
+  }
+
+  if (!TL->isComplexDeinterleavingSupported()) {
+    LLVM_DEBUG(
+        dbgs() << "Complex deinterleaving has been disabled, target does "
+                  "not support lowering of complex number operations.\n");
+    return false;
+  }
+
+  bool Changed = false;
+  for (auto &B : F)
+    Changed |= evaluateBasicBlock(&B);
+
+  return Changed;
+}
+
+static bool isInterleavingMask(ArrayRef<int> Mask) {
+  // If the size is not even, it's not an interleaving mask
+  if ((Mask.size() & 1))
+    return false;
+
+  int HalfNumElements = Mask.size() / 2;
+  for (int Idx = 0; Idx < HalfNumElements; ++Idx) {
+    int MaskIdx = Idx * 2;
+    if (Mask[MaskIdx] != Idx || Mask[MaskIdx + 1] != (Idx + HalfNumElements))
+      return false;
+  }
+
+  return true;
+}
+
+static bool isDeinterleavingMask(ArrayRef<int> Mask) {
+  int Offset = Mask[0];
+  int HalfNumElements = Mask.size() / 2;
+
+  for (int Idx = 1; Idx < HalfNumElements; ++Idx) {
+    if (Mask[Idx] != (Idx * 2) + Offset)
+      return false;
+  }
+
+  return true;
+}
+
+bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) {
+  bool Changed = false;
+
+  SmallVector<Instruction *> DeadInstrRoots;
+
+  for (auto &I : *B) {
+    auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
+    if (!SVI)
+      continue;
+
+    // Look for a shufflevector that takes separate vectors of the real and
+    // imaginary components and recombines them into a single vector.
+    if (!isInterleavingMask(SVI->getShuffleMask()))
+      continue;
+
+    ComplexDeinterleavingGraph Graph(TL);
+    if (!Graph.identifyNodes(SVI))
+      continue;
+
+    Graph.replaceNodes();
+    DeadInstrRoots.push_back(SVI);
+    Changed = true;
+  }
+
+  for (const auto &I : DeadInstrRoots) {
+    if (!I || I->getParent() == nullptr)
+      continue;
+    llvm::RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+  }
+
+  return Changed;
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyNodeWithImplicitAdd(
+    Instruction *Real, Instruction *Imag,
+    std::pair<Instruction *, Instruction *> &PartialMatch) {
+  LLVM_DEBUG(dbgs() << "identifyNodeWithImplicitAdd " << *Real << " / " << *Imag
+                    << "\n");
+
+  if (!Real->hasOneUse() || !Imag->hasOneUse()) {
+    LLVM_DEBUG(dbgs() << "  - Mul operand has multiple uses.\n");
+    return nullptr;
+  }
+
+  if (Real->getOpcode() != Instruction::FMul ||
+      Imag->getOpcode() != Instruction::FMul) {
+    LLVM_DEBUG(dbgs() << "  - Real or imaginary instruction is not fmul\n");
+    return nullptr;
+  }
+
+  Instruction *R0 = dyn_cast<Instruction>(Real->getOperand(0));
+  Instruction *R1 = dyn_cast<Instruction>(Real->getOperand(1));
+  Instruction *I0 = dyn_cast<Instruction>(Imag->getOperand(0));
+  Instruction *I1 = dyn_cast<Instruction>(Imag->getOperand(1));
+  if (!R0 || !R1 || !I0 || !I1) {
+    LLVM_DEBUG(dbgs() << "  - Mul operand not Instruction\n");
+    return nullptr;
+  }
+
+  // A +/+ has a rotation of 0. If any of the operands are fneg, we flip the
+  // rotation and use the fneg's operand instead.
+  unsigned Negs = 0;
+  SmallVector<Instruction *> FNegs;
+  if (R0->getOpcode() == Instruction::FNeg ||
+      R1->getOpcode() == Instruction::FNeg) {
+    Negs |= 1;
+    if (R0->getOpcode() == Instruction::FNeg) {
+      FNegs.push_back(R0);
+      R0 = dyn_cast<Instruction>(R0->getOperand(0));
+    } else {
+      FNegs.push_back(R1);
+      R1 = dyn_cast<Instruction>(R1->getOperand(0));
+    }
+    if (!R0 || !R1)
+      return nullptr;
+  }
+  if (I0->getOpcode() == Instruction::FNeg ||
+      I1->getOpcode() == Instruction::FNeg) {
+    Negs |= 2;
+    Negs ^= 1;
+    if (I0->getOpcode() == Instruction::FNeg) {
+      FNegs.push_back(I0);
+      I0 = dyn_cast<Instruction>(I0->getOperand(0));
+    } else {
+      FNegs.push_back(I1);
+      I1 = dyn_cast<Instruction>(I1->getOperand(0));
+    }
+    if (!I0 || !I1)
+      return nullptr;
+  }
+
+  ComplexDeinterleavingRotation Rotation = (ComplexDeinterleavingRotation)Negs;
+
+  Instruction *CommonOperand;
+  Instruction *UncommonRealOp;
+  Instruction *UncommonImagOp;
+
+  if (R0 == I0 || R0 == I1) {
+    CommonOperand = R0;
+    UncommonRealOp = R1;
+  } else if (R1 == I0 || R1 == I1) {
+    CommonOperand = R1;
+    UncommonRealOp = R0;
+  } else {
+    LLVM_DEBUG(dbgs() << "  - No equal operand\n");
+    return nullptr;
+  }
+
+  UncommonImagOp = (CommonOperand == I0) ? I1 : I0;
+  if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
+      Rotation == ComplexDeinterleavingRotation::Rotation_270)
+    std::swap(UncommonRealOp, UncommonImagOp);
+
+  // Between identifyPartialMul and here we need to have found a complete valid
+  // pair from the CommonOperand of each part.
+  if (Rotation == ComplexDeinterleavingRotation::Rotation_0 ||
+      Rotation == ComplexDeinterleavingRotation::Rotation_180)
+    PartialMatch.first = CommonOperand;
+  else
+    PartialMatch.second = CommonOperand;
+
+  if (!PartialMatch.first || !PartialMatch.second) {
+    LLVM_DEBUG(dbgs() << "  - Incomplete partial match\n");
+    return nullptr;
+  }
+
+  NodePtr CommonNode = identifyNode(PartialMatch.first, PartialMatch.second);
+  if (!CommonNode) {
+    LLVM_DEBUG(dbgs() << "  - No CommonNode identified\n");
+    return nullptr;
+  }
+
+  NodePtr UncommonNode = identifyNode(UncommonRealOp, UncommonImagOp);
+  if (!UncommonNode) {
+    LLVM_DEBUG(dbgs() << "  - No UncommonNode identified\n");
+    return nullptr;
+  }
+
+  NodePtr Node = prepareCompositeNode(
+      ComplexDeinterleavingOperation::CMulPartial, Real, Imag);
+  Node->Rotation = Rotation;
+  Node->addOperand(CommonNode);
+  Node->addOperand(UncommonNode);
+  Node->InternalInstructions.append(FNegs);
+  return submitCompositeNode(Node);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyPartialMul(Instruction *Real,
+                                               Instruction *Imag) {
+  LLVM_DEBUG(dbgs() << "identifyPartialMul " << *Real << " / " << *Imag
+                    << "\n");
+  // Determine rotation
+  ComplexDeinterleavingRotation Rotation;
+  if (Real->getOpcode() == Instruction::FAdd &&
+      Imag->getOpcode() == Instruction::FAdd)
+    Rotation = ComplexDeinterleavingRotation::Rotation_0;
+  else if (Real->getOpcode() == Instruction::FSub &&
+           Imag->getOpcode() == Instruction::FAdd)
+    Rotation = ComplexDeinterleavingRotation::Rotation_90;
+  else if (Real->getOpcode() == Instruction::FSub &&
+           Imag->getOpcode() == Instruction::FSub)
+    Rotation = ComplexDeinterleavingRotation::Rotation_180;
+  else if (Real->getOpcode() == Instruction::FAdd &&
+           Imag->getOpcode() == Instruction::FSub)
+    Rotation = ComplexDeinterleavingRotation::Rotation_270;
+  else {
+    LLVM_DEBUG(dbgs() << "  - Unhandled rotation.\n");
+    return nullptr;
+  }
+
+  if (!Real->getFastMathFlags().allowContract() ||
+      !Imag->getFastMathFlags().allowContract()) {
+    LLVM_DEBUG(dbgs() << "  - Contract is missing from the FastMath flags.\n");
+    return nullptr;
+  }
+
+  Value *CR = Real->getOperand(0);
+  Instruction *RealMulI = dyn_cast<Instruction>(Real->getOperand(1));
+  if (!RealMulI)
+    return nullptr;
+  Value *CI = Imag->getOperand(0);
+  Instruction *ImagMulI = dyn_cast<Instruction>(Imag->getOperand(1));
+  if (!ImagMulI)
+    return nullptr;
+
+  if (!RealMulI->hasOneUse() || !ImagMulI->hasOneUse()) {
+    LLVM_DEBUG(dbgs() << "  - Mul instruction has multiple uses\n");
+    return nullptr;
+  }
+
+  Instruction *R0 = dyn_cast<Instruction>(RealMulI->getOperand(0));
+  Instruction *R1 = dyn_cast<Instruction>(RealMulI->getOperand(1));
+  Instruction *I0 = dyn_cast<Instruction>(ImagMulI->getOperand(0));
+  Instruction *I1 = dyn_cast<Instruction>(ImagMulI->getOperand(1));
+  if (!R0 || !R1 || !I0 || !I1) {
+    LLVM_DEBUG(dbgs() << "  - Mul operand not Instruction\n");
+    return nullptr;
+  }
+
+  Instruction *CommonOperand;
+  Instruction *UncommonRealOp;
+  Instruction *UncommonImagOp;
+
+  if (R0 == I0 || R0 == I1) {
+    CommonOperand = R0;
+    UncommonRealOp = R1;
+  } else if (R1 == I0 || R1 == I1) {
+    CommonOperand = R1;
+    UncommonRealOp = R0;
+  } else {
+    LLVM_DEBUG(dbgs() << "  - No equal operand\n");
+    return nullptr;
+  }
+
+  UncommonImagOp = (CommonOperand == I0) ? I1 : I0;
+  if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
+      Rotation == ComplexDeinterleavingRotation::Rotation_270)
+    std::swap(UncommonRealOp, UncommonImagOp);
+
+  std::pair<Instruction *, Instruction *> PartialMatch(
+      (Rotation == ComplexDeinterleavingRotation::Rotation_0 ||
+       Rotation == ComplexDeinterleavingRotation::Rotation_180)
+          ? CommonOperand
+          : nullptr,
+      (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
+       Rotation == ComplexDeinterleavingRotation::Rotation_270)
+          ? CommonOperand
+          : nullptr);
+  NodePtr CNode = identifyNodeWithImplicitAdd(
+      cast<Instruction>(CR), cast<Instruction>(CI), PartialMatch);
+  if (!CNode) {
+    LLVM_DEBUG(dbgs() << "  - No cnode identified\n");
+    return nullptr;
+  }
+
+  NodePtr UncommonRes = identifyNode(UncommonRealOp, UncommonImagOp);
+  if (!UncommonRes) {
+    LLVM_DEBUG(dbgs() << "  - No UncommonRes identified\n");
+    return nullptr;
+  }
+
+  assert(PartialMatch.first && PartialMatch.second);
+  NodePtr CommonRes = identifyNode(PartialMatch.first, PartialMatch.second);
+  if (!CommonRes) {
+    LLVM_DEBUG(dbgs() << "  - No CommonRes identified\n");
+    return nullptr;
+  }
+
+  NodePtr Node = prepareCompositeNode(
+      ComplexDeinterleavingOperation::CMulPartial, Real, Imag);
+  Node->addInstruction(RealMulI);
+  Node->addInstruction(ImagMulI);
+  Node->Rotation = Rotation;
+  Node->addOperand(CommonRes);
+  Node->addOperand(UncommonRes);
+  Node->addOperand(CNode);
+  return submitCompositeNode(Node);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyAdd(Instruction *Real, Instruction *Imag) {
+  LLVM_DEBUG(dbgs() << "identifyAdd " << *Real << " / " << *Imag << "\n");
+
+  // Determine rotation
+  ComplexDeinterleavingRotation Rotation;
+  if (Real->getOpcode() == Instruction::FSub &&
+      Imag->getOpcode() == Instruction::FAdd)
+    Rotation = ComplexDeinterleavingRotation::Rotation_90;
+  else if (Real->getOpcode() == Instruction::FAdd &&
+           Imag->getOpcode() == Instruction::FSub)
+    Rotation = ComplexDeinterleavingRotation::Rotation_270;
+  else {
+    LLVM_DEBUG(dbgs() << " - Unhandled case, rotation is not assigned.\n");
+    return nullptr;
+  }
+
+  auto *AR = cast<Instruction>(Real->getOperand(0));
+  auto *BI = cast<Instruction>(Real->getOperand(1));
+  auto *AI = cast<Instruction>(Imag->getOperand(0));
+  auto *BR = cast<Instruction>(Imag->getOperand(1));
+
+  NodePtr ResA = identifyNode(AR, AI);
+  if (!ResA) {
+    LLVM_DEBUG(dbgs() << " - AR/AI is not identified as a composite node.\n");
+    return nullptr;
+  }
+  NodePtr ResB = identifyNode(BR, BI);
+  if (!ResB) {
+    LLVM_DEBUG(dbgs() << " - BR/BI is not identified as a composite node.\n");
+    return nullptr;
+  }
+
+  NodePtr Node =
+      prepareCompositeNode(ComplexDeinterleavingOperation::CAdd, Real, Imag);
+  Node->Rotation = Rotation;
+  Node->addOperand(ResA);
+  Node->addOperand(ResB);
+  return submitCompositeNode(Node);
+}
+
+static bool isInstructionPairAdd(Instruction *A, Instruction *B) {
+  unsigned OpcA = A->getOpcode();
+  unsigned OpcB = B->getOpcode();
+  return (OpcA == Instruction::FSub && OpcB == Instruction::FAdd) ||
+         (OpcA == Instruction::FAdd && OpcB == Instruction::FSub);
+}
+
+static bool isInstructionPairMul(Instruction *A, Instruction *B) {
+  auto Pattern =
+      m_BinOp(m_FMul(m_Value(), m_Value()), m_FMul(m_Value(), m_Value()));
+
+  return match(A, Pattern) && match(B, Pattern);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyNode(Instruction *Real, Instruction *Imag) {
+  LLVM_DEBUG(dbgs() << "identifyNode on " << *Real << " / " << *Imag << "\n");
+  if (NodePtr CN = getContainingComposite(Real, Imag)) {
+    LLVM_DEBUG(dbgs() << " - Folding to existing node\n");
+    return CN;
+  }
+
+  auto *RealShuffle = dyn_cast<ShuffleVectorInst>(Real);
+  auto *ImagShuffle = dyn_cast<ShuffleVectorInst>(Imag);
+  if (RealShuffle && ImagShuffle) {
+    Value *RealOp1 = RealShuffle->getOperand(1);
+    if (!isa<UndefValue>(RealOp1) && !isa<ConstantAggregateZero>(RealOp1)) {
+      LLVM_DEBUG(dbgs() << " - RealOp1 is not undef or zero.\n");
+      return nullptr;
+    }
+    Value *ImagOp1 = ImagShuffle->getOperand(1);
+    if (!isa<UndefValue>(ImagOp1) && !isa<ConstantAggregateZero>(ImagOp1)) {
+      LLVM_DEBUG(dbgs() << " - ImagOp1 is not undef or zero.\n");
+      return nullptr;
+    }
+
+    Value *RealOp0 = RealShuffle->getOperand(0);
+    Value *ImagOp0 = ImagShuffle->getOperand(0);
+
+    if (RealOp0 != ImagOp0) {
+      LLVM_DEBUG(dbgs() << " - Shuffle operands are not equal.\n");
+      return nullptr;
+    }
+
+    ArrayRef<int> RealMask = RealShuffle->getShuffleMask();
+    ArrayRef<int> ImagMask = ImagShuffle->getShuffleMask();
+    if (!isDeinterleavingMask(RealMask) || !isDeinterleavingMask(ImagMask)) {
+      LLVM_DEBUG(dbgs() << " - Masks are not deinterleaving.\n");
+      return nullptr;
+    }
+
+    if (RealMask[0] != 0 || ImagMask[0] != 1) {
+      LLVM_DEBUG(dbgs() << " - Masks do not have the correct initial value.\n");
+      return nullptr;
+    }
+
+    // Type checking: the shuffle type should be a vector type with the same
+    // scalar type, but half the size.
+    auto CheckType = [&](ShuffleVectorInst *Shuffle) {
+      Value *Op = Shuffle->getOperand(0);
+      auto *ShuffleTy = cast<FixedVectorType>(Shuffle->getType());
+      auto *OpTy = cast<FixedVectorType>(Op->getType());
+
+      if (OpTy->getScalarType() != ShuffleTy->getScalarType())
+        return false;
+      if ((ShuffleTy->getNumElements() * 2) != OpTy->getNumElements())
+        return false;
+
+      return true;
+    };
+
+    auto CheckDeinterleavingShuffle = [&](ShuffleVectorInst *Shuffle) -> bool {
+      if (!CheckType(Shuffle))
+        return false;
+
+      ArrayRef<int> Mask = Shuffle->getShuffleMask();
+      int Last = *Mask.rbegin();
+
+      Value *Op = Shuffle->getOperand(0);
+      auto *OpTy = cast<FixedVectorType>(Op->getType());
+      int NumElements = OpTy->getNumElements();
+
+      // Ensure that the deinterleaving shuffle only pulls from the first
+      // shuffle operand.
+      return Last < NumElements;
+    };
+
+    if (RealShuffle->getType() != ImagShuffle->getType()) {
+      LLVM_DEBUG(dbgs() << " - Shuffle types aren't equal.\n");
+      return nullptr;
+    }
+    if (!CheckDeinterleavingShuffle(RealShuffle)) {
+      LLVM_DEBUG(dbgs() << " - RealShuffle is invalid type.\n");
+      return nullptr;
+    }
+    if (!CheckDeinterleavingShuffle(ImagShuffle)) {
+      LLVM_DEBUG(dbgs() << " - ImagShuffle is invalid type.\n");
+      return nullptr;
+    }
+
+    NodePtr PlaceholderNode =
+        prepareCompositeNode(llvm::ComplexDeinterleavingOperation::Shuffle,
+                             RealShuffle, ImagShuffle);
+    PlaceholderNode->ReplacementNode = RealShuffle->getOperand(0);
+    return submitCompositeNode(PlaceholderNode);
+  }
+  if (RealShuffle || ImagShuffle)
+    return nullptr;
+
+  auto *VTy = cast<FixedVectorType>(Real->getType());
+  auto *NewVTy =
+      FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2);
+
+  if (TL->isComplexDeinterleavingOperationSupported(
+          ComplexDeinterleavingOperation::CMulPartial, NewVTy) &&
+      isInstructionPairMul(Real, Imag)) {
+    return identifyPartialMul(Real, Imag);
+  }
+
+  if (TL->isComplexDeinterleavingOperationSupported(
+          ComplexDeinterleavingOperation::CAdd, NewVTy) &&
+      isInstructionPairAdd(Real, Imag)) {
+    return identifyAdd(Real, Imag);
+  }
+
+  return nullptr;
+}
+
+bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
+  Instruction *Real;
+  Instruction *Imag;
+  if (!match(RootI, m_Shuffle(m_Instruction(Real), m_Instruction(Imag))))
+    return false;
+
+  RootValue = RootI;
+  AllInstructions.insert(RootI);
+  RootNode = identifyNode(Real, Imag);
+
+  LLVM_DEBUG({
+    Function *F = RootI->getFunction();
+    BasicBlock *B = RootI->getParent();
+    dbgs() << "Complex deinterleaving graph for " << F->getName()
+           << "::" << B->getName() << ".\n";
+    dump(dbgs());
+    dbgs() << "\n";
+  });
+
+  // Check that all instructions are only used within the identified nodes
+  for (const auto &Node : CompositeNodes) {
+    if (!Node->hasAllInternalUses(AllInstructions)) {
+      LLVM_DEBUG(dbgs() << "  - Invalid internal uses\n");
+      return false;
+    }
+  }
+  return RootNode != nullptr;
+}
+
+Value *ComplexDeinterleavingGraph::replaceNode(
+    ComplexDeinterleavingGraph::RawNodePtr Node) {
+  if (Node->ReplacementNode)
+    return Node->ReplacementNode;
+
+  Value *Input0 = replaceNode(Node->Operands[0]);
+  Value *Input1 = replaceNode(Node->Operands[1]);
+  Value *Accumulator =
+      Node->Operands.size() > 2 ? replaceNode(Node->Operands[2]) : nullptr;
+
+  assert(Input0->getType() == Input1->getType() &&
+         "Node inputs need to be of the same type");
+
+  Node->ReplacementNode = TL->createComplexDeinterleavingIR(
+      Node->Real, Node->Operation, Node->Rotation, Input0, Input1, Accumulator);
+
+  assert(Node->ReplacementNode && "Target failed to create Intrinsic call.");
+  NumComplexTransformations += 1;
+  return Node->ReplacementNode;
+}
+
+void ComplexDeinterleavingGraph::replaceNodes() {
+  Value *R = replaceNode(RootNode.get());
+  assert(R && "Unable to find replacement for RootValue");
+  RootValue->replaceAllUsesWith(R);
+}
+
+bool ComplexDeinterleavingCompositeNode::hasAllInternalUses(
+    SmallPtrSet<Instruction *, 16> &AllInstructions) {
+  if (Operation == ComplexDeinterleavingOperation::Shuffle)
+    return true;
+
+  for (auto *User : Real->users()) {
+    if (!AllInstructions.contains(cast<Instruction>(User)))
+      return false;
+  }
+  for (auto *User : Imag->users()) {
+    if (!AllInstructions.contains(cast<Instruction>(User)))
+      return false;
+  }
+  for (auto *I : InternalInstructions) {
+    for (auto *User : I->users()) {
+      if (!AllInstructions.contains(cast<Instruction>(User)))
+        return false;
+    }
+  }
+  return true;
+}
\ No newline at end of file

diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index b822f15ed193b..b90bc4bd3a741 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21833,3 +21833,97 @@ void ARMTargetLowering::finalizeLowering(MachineFunction &MF) const {
   MF.getFrameInfo().computeMaxCallFrameSize(MF);
   TargetLoweringBase::finalizeLowering(MF);
 }
+
+bool ARMTargetLowering::isComplexDeinterleavingSupported() const {
+  return Subtarget->hasMVEFloatOps();
+}
+
+bool ARMTargetLowering::isComplexDeinterleavingOperationSupported(
+    ComplexDeinterleavingOperation Operation, Type *Ty) const {
+  auto *VTy = dyn_cast<FixedVectorType>(Ty);
+  if (!VTy)
+    return false;
+
+  auto *ScalarTy = VTy->getScalarType();
+  unsigned NumElements = VTy->getNumElements();
+
+  unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
+  if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
+    return false;
+
+  // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
+  return ScalarTy->isHalfTy() || ScalarTy->isFloatTy();
+}
+
+Value *ARMTargetLowering::createComplexDeinterleavingIR(
+    Instruction *I, ComplexDeinterleavingOperation OperationType,
+    ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
+    Value *Accumulator) const {
+
+  FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());
+
+  IRBuilder<> B(I);
+
+  unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
+
+  assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
+
+  if (TyWidth > 128) {
+    int Stride = Ty->getNumElements() / 2;
+    auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
+    auto SplitSeqVec = llvm::to_vector(SplitSeq);
+    ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
+    ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
+
+    auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
+    auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
+    auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
+    auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
+    Value *LowerSplitAcc = nullptr;
+    Value *UpperSplitAcc = nullptr;
+
+    if (Accumulator) {
+      LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
+      UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
+    }
+
+    auto *LowerSplitInt = createComplexDeinterleavingIR(
+        I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
+    auto *UpperSplitInt = createComplexDeinterleavingIR(
+        I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
+
+    ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
+    return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
+  }
+
+  auto *IntTy = Type::getInt32Ty(B.getContext());
+
+  ConstantInt *ConstRotation = nullptr;
+  if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
+    ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
+
+    if (Accumulator)
+      return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
+                               {ConstRotation, Accumulator, InputB, InputA});
+    return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
+                             {ConstRotation, InputB, InputA});
+  }
+
+  if (OperationType == ComplexDeinterleavingOperation::CAdd) {
+    // 1 means the value is not halved.
+    auto *ConstHalving = ConstantInt::get(IntTy, 1);
+
+    if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
+      ConstRotation = ConstantInt::get(IntTy, 0);
+    else if (Rotation == ComplexDeinterleavingRotation::Rotation_270)
+      ConstRotation = ConstantInt::get(IntTy, 1);
+
+    if (!ConstRotation)
+      return nullptr; // Invalid rotation for arm_mve_vcaddq
+
+    return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
+                             {ConstHalving, ConstRotation, InputA, InputB});
+  }
+
+  return nullptr;
+}

diff  --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 1403e4c8c0a11..7c3349da82b0e 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -736,6 +736,15 @@ class VectorType;
 
     bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override;
 
+    bool isComplexDeinterleavingSupported() const override;
+    bool isComplexDeinterleavingOperationSupported(
+        ComplexDeinterleavingOperation Operation, Type *Ty) const override;
+
+    Value *createComplexDeinterleavingIR(
+        Instruction *I, ComplexDeinterleavingOperation OperationType,
+        ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
+        Value *Accumulator = nullptr) const override;
+
   protected:
     std::pair<const TargetRegisterClass *, uint8_t>
     findRepresentativeClass(const TargetRegisterInfo *TRI,

diff  --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index d95c21d6504b7..16489162bb8b7 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -426,9 +426,13 @@ void ARMPassConfig::addIRPasses() {
   TargetPassConfig::addIRPasses();
 
   // Run the parallel DSP pass.
-  if (getOptLevel() == CodeGenOpt::Aggressive) 
+  if (getOptLevel() == CodeGenOpt::Aggressive)
     addPass(createARMParallelDSPPass());
 
+  // Match complex arithmetic patterns.
+  if (TM->getOptLevel() >= CodeGenOpt::Default)
+    addPass(createComplexDeinterleavingPass(TM));
+
   // Match interleaved memory accesses to ldN/stN intrinsics.
   if (TM->getOptLevel() != CodeGenOpt::None)
     addPass(createInterleavedAccessPass());

diff  --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index f1a12254866c9..1f2e962579e7f 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -47,6 +47,7 @@
 ; CHECK-NEXT:      Basic Alias Analysis (stateless AA impl)
 ; CHECK-NEXT:      Function Alias Analysis Results
 ; CHECK-NEXT:      Transform functions to use DSP intrinsics
+; CHECK-NEXT:      Complex Deinterleaving Pass
 ; CHECK-NEXT:      Interleaved Access Pass
 ; CHECK-NEXT:      Type Promotion
 ; CHECK-NEXT:      CodeGen Prepare

diff  --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-add.ll
new file mode 100644
index 0000000000000..9920618a68d57
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-add.ll
@@ -0,0 +1,122 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s
+
+target triple = "thumbv8.1m.main-none-none-eabi"
+
+
+; Expected to not transform
+define arm_aapcs_vfpcc <2 x half> @complex_add_v2f16(<2 x half> %a, <2 x half> %b) {
+; CHECK-LABEL: complex_add_v2f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovx.f16 s2, s4
+; CHECK-NEXT:    vadd.f16 s2, s2, s0
+; CHECK-NEXT:    vmovx.f16 s0, s0
+; CHECK-NEXT:    vsub.f16 s0, s4, s0
+; CHECK-NEXT:    vins.f16 s0, s2
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> <i32 0>
+  %a.imag = shufflevector <2 x half> %a, <2 x half> zeroinitializer, <1 x i32> <i32 1>
+  %b.real = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> <i32 0>
+  %b.imag = shufflevector <2 x half> %b, <2 x half> zeroinitializer, <1 x i32> <i32 1>
+  %0 = fsub fast <1 x half> %b.real, %a.imag
+  %1 = fadd fast <1 x half> %b.imag, %a.real
+  %interleaved.vec = shufflevector <1 x half> %0, <1 x half> %1, <2 x i32> <i32 0, i32 1>
+  ret <2 x half> %interleaved.vec
+}
+
+; Expected to not transform
+define arm_aapcs_vfpcc <4 x half> @complex_add_v4f16(<4 x half> %a, <4 x half> %b) {
+; CHECK-LABEL: complex_add_v4f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovx.f16 s12, s4
+; CHECK-NEXT:    vmovx.f16 s2, s5
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vins.f16 s12, s2
+; CHECK-NEXT:    vmovx.f16 s2, s1
+; CHECK-NEXT:    vins.f16 s0, s1
+; CHECK-NEXT:    vins.f16 s8, s2
+; CHECK-NEXT:    vins.f16 s4, s5
+; CHECK-NEXT:    vadd.f16 q3, q3, q0
+; CHECK-NEXT:    vsub.f16 q0, q1, q2
+; CHECK-NEXT:    vmovx.f16 s1, s0
+; CHECK-NEXT:    vmovx.f16 s2, s12
+; CHECK-NEXT:    vins.f16 s0, s12
+; CHECK-NEXT:    vins.f16 s1, s2
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x half> %a, <4 x half> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x half> %b, <4 x half> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %0 = fsub fast <2 x half> %b.real, %a.imag
+  %1 = fadd fast <2 x half> %b.imag, %a.real
+  %interleaved.vec = shufflevector <2 x half> %0, <2 x half> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x half> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <8 x half> @complex_add_v8f16(<8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: complex_add_v8f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcadd.f16 q0, q1, q0, #90
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %a.imag = shufflevector <8 x half> %a, <8 x half> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %b.real = shufflevector <8 x half> %b, <8 x half> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b.imag = shufflevector <8 x half> %b, <8 x half> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fsub fast <4 x half> %b.real, %a.imag
+  %1 = fadd fast <4 x half> %b.imag, %a.real
+  %interleaved.vec = shufflevector <4 x half> %0, <4 x half> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x half> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <16 x half> @complex_add_v16f16(<16 x half> %a, <16 x half> %b) {
+; CHECK-LABEL: complex_add_v16f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcadd.f16 q0, q2, q0, #90
+; CHECK-NEXT:    vcadd.f16 q1, q3, q1, #90
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %a.imag = shufflevector <16 x half> %a, <16 x half> zeroinitializer, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %b.real = shufflevector <16 x half> %b, <16 x half> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %b.imag = shufflevector <16 x half> %b, <16 x half> zeroinitializer, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %0 = fsub fast <8 x half> %b.real, %a.imag
+  %1 = fadd fast <8 x half> %b.imag, %a.real
+  %interleaved.vec = shufflevector <8 x half> %0, <8 x half> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <16 x half> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <32 x half> @complex_add_v32f16(<32 x half> %a, <32 x half> %b) {
+; CHECK-LABEL: complex_add_v32f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    add r0, sp, #16
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    add r0, sp, #32
+; CHECK-NEXT:    vcadd.f16 q0, q4, q0, #90
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    add r0, sp, #48
+; CHECK-NEXT:    vcadd.f16 q1, q4, q1, #90
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    add r0, sp, #64
+; CHECK-NEXT:    vcadd.f16 q2, q4, q2, #90
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    vcadd.f16 q3, q4, q3, #90
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %a.imag = shufflevector <32 x half> %a, <32 x half> zeroinitializer, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %b.real = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %b.imag = shufflevector <32 x half> %b, <32 x half> zeroinitializer, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %0 = fsub fast <16 x half> %b.real, %a.imag
+  %1 = fadd fast <16 x half> %b.imag, %a.real
+  %interleaved.vec = shufflevector <16 x half> %0, <16 x half> %1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <32 x half> %interleaved.vec
+}

diff  --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll
new file mode 100644
index 0000000000000..11e604b5079b8
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f16-mul.ll
@@ -0,0 +1,168 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s
+
+target triple = "thumbv8.1m.main-none-none-eabi"
+
+; Expected to not transform
+define arm_aapcs_vfpcc <2 x half> @complex_mul_v2f16(<2 x half> %a, <2 x half> %b) {
+; CHECK-LABEL: complex_mul_v2f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmovx.f16 s2, s4
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vmul.f16 s6, s2, s0
+; CHECK-NEXT:    vfma.f16 s6, s4, s8
+; CHECK-NEXT:    vmul.f16 s8, s8, s2
+; CHECK-NEXT:    vfnms.f16 s8, s4, s0
+; CHECK-NEXT:    vins.f16 s8, s6
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real   = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> <i32 0>
+  %a.imag = shufflevector <2 x half> %a, <2 x half> poison, <1 x i32> <i32 1>
+  %b.real = shufflevector <2 x half> %b, <2 x half> poison, <1 x i32> <i32 0>
+  %b.imag = shufflevector <2 x half> %b, <2 x half> poison, <1 x i32> <i32 1>
+  %0 = fmul fast <1 x half> %b.imag, %a.real
+  %1 = fmul fast <1 x half> %b.real, %a.imag
+  %2 = fadd fast <1 x half> %1, %0
+  %3 = fmul fast <1 x half> %b.real, %a.real
+  %4 = fmul fast <1 x half> %a.imag, %b.imag
+  %5 = fsub fast <1 x half> %3, %4
+  %interleaved.vec = shufflevector <1 x half> %5, <1 x half> %2, <2 x i32> <i32 0, i32 1>
+  ret <2 x half> %interleaved.vec
+}
+
+; Expected to not transform
+define arm_aapcs_vfpcc <4 x half> @complex_mul_v4f16(<4 x half> %a, <4 x half> %b) {
+; CHECK-LABEL: complex_mul_v4f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vmovx.f16 s8, s0
+; CHECK-NEXT:    vmovx.f16 s2, s1
+; CHECK-NEXT:    vins.f16 s8, s2
+; CHECK-NEXT:    vmovx.f16 s12, s4
+; CHECK-NEXT:    vmovx.f16 s2, s5
+; CHECK-NEXT:    vins.f16 s0, s1
+; CHECK-NEXT:    vins.f16 s12, s2
+; CHECK-NEXT:    vins.f16 s4, s5
+; CHECK-NEXT:    vmul.f16 q4, q3, q0
+; CHECK-NEXT:    vfma.f16 q4, q1, q2
+; CHECK-NEXT:    vmul.f16 q2, q2, q3
+; CHECK-NEXT:    vneg.f16 q2, q2
+; CHECK-NEXT:    vfma.f16 q2, q1, q0
+; CHECK-NEXT:    vmovx.f16 s0, s16
+; CHECK-NEXT:    vmovx.f16 s9, s8
+; CHECK-NEXT:    vins.f16 s8, s16
+; CHECK-NEXT:    vins.f16 s9, s0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real   = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x half> %a, <4 x half> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x half> %b, <4 x half> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x half> %b.imag, %a.real
+  %1 = fmul fast <2 x half> %b.real, %a.imag
+  %2 = fadd fast <2 x half> %1, %0
+  %3 = fmul fast <2 x half> %b.real, %a.real
+  %4 = fmul fast <2 x half> %a.imag, %b.imag
+  %5 = fsub fast <2 x half> %3, %4
+  %interleaved.vec = shufflevector <2 x half> %5, <2 x half> %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x half> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <8 x half> @complex_mul_v8f16(<8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: complex_mul_v8f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcmul.f16 q2, q0, q1, #0
+; CHECK-NEXT:    vcmla.f16 q2, q0, q1, #90
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real   = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %a.imag = shufflevector <8 x half> %a, <8 x half> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %b.real = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b.imag = shufflevector <8 x half> %b, <8 x half> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x half> %b.imag, %a.real
+  %1 = fmul fast <4 x half> %b.real, %a.imag
+  %2 = fadd fast <4 x half> %1, %0
+  %3 = fmul fast <4 x half> %b.real, %a.real
+  %4 = fmul fast <4 x half> %a.imag, %b.imag
+  %5 = fsub fast <4 x half> %3, %4
+  %interleaved.vec = shufflevector <4 x half> %5, <4 x half> %2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x half> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <16 x half> @complex_mul_v16f16(<16 x half> %a, <16 x half> %b) {
+; CHECK-LABEL: complex_mul_v16f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vcmul.f16 q4, q0, q2, #0
+; CHECK-NEXT:    vcmla.f16 q4, q0, q2, #90
+; CHECK-NEXT:    vcmul.f16 q2, q1, q3, #0
+; CHECK-NEXT:    vcmla.f16 q2, q1, q3, #90
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vmov q1, q2
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real   = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %a.imag = shufflevector <16 x half> %a, <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %b.real = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %b.imag = shufflevector <16 x half> %b, <16 x half> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %0 = fmul fast <8 x half> %b.imag, %a.real
+  %1 = fmul fast <8 x half> %b.real, %a.imag
+  %2 = fadd fast <8 x half> %1, %0
+  %3 = fmul fast <8 x half> %b.real, %a.real
+  %4 = fmul fast <8 x half> %a.imag, %b.imag
+  %5 = fsub fast <8 x half> %3, %4
+  %interleaved.vec = shufflevector <8 x half> %5, <8 x half> %2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <16 x half> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <32 x half> @complex_mul_v32f16(<32 x half> %a, <32 x half> %b) {
+; CHECK-LABEL: complex_mul_v32f16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    add r0, sp, #48
+; CHECK-NEXT:    vmov q4, q0
+; CHECK-NEXT:    vldrw.u32 q5, [r0]
+; CHECK-NEXT:    add r0, sp, #64
+; CHECK-NEXT:    vcmul.f16 q0, q0, q5, #0
+; CHECK-NEXT:    vcmla.f16 q0, q4, q5, #90
+; CHECK-NEXT:    vldrw.u32 q5, [r0]
+; CHECK-NEXT:    add r0, sp, #80
+; CHECK-NEXT:    vcmul.f16 q4, q1, q5, #0
+; CHECK-NEXT:    vcmla.f16 q4, q1, q5, #90
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    add r0, sp, #96
+; CHECK-NEXT:    vcmul.f16 q5, q2, q1, #0
+; CHECK-NEXT:    vcmla.f16 q5, q2, q1, #90
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov q2, q5
+; CHECK-NEXT:    vcmul.f16 q6, q3, q1, #0
+; CHECK-NEXT:    vcmla.f16 q6, q3, q1, #90
+; CHECK-NEXT:    vmov q1, q4
+; CHECK-NEXT:    vmov q3, q6
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %a.imag = shufflevector <32 x half> %a, <32 x half> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %b.real = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %b.imag = shufflevector <32 x half> %b, <32 x half> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  %0 = fmul fast <16 x half> %b.imag, %a.real
+  %1 = fmul fast <16 x half> %b.real, %a.imag
+  %2 = fadd fast <16 x half> %1, %0
+  %3 = fmul fast <16 x half> %b.real, %a.real
+  %4 = fmul fast <16 x half> %a.imag, %b.imag
+  %5 = fsub fast <16 x half> %3, %4
+  %interleaved.vec = shufflevector <16 x half> %5, <16 x half> %2, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <32 x half> %interleaved.vec
+}

diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-add.ll
new file mode 100644
index 0000000000000..d9161ab370dd6
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-add.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s
+
+target triple = "thumbv8.1m.main-none-none-eabi"
+
+
+; Expected to not transform
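+; (the operands deinterleave to <1 x float> halves, too narrow for a 128-bit MVE vcadd)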
+define arm_aapcs_vfpcc <2 x float> @complex_add_v2f32(<2 x float> %a, <2 x float> %b) {
+; CHECK-LABEL: complex_add_v2f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vadd.f32 s5, s5, s0
+; CHECK-NEXT:    vsub.f32 s4, s4, s1
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> <i32 0>
+  %a.imag = shufflevector <2 x float> %a, <2 x float> zeroinitializer, <1 x i32> <i32 1>
+  %b.real = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> <i32 0>
+  %b.imag = shufflevector <2 x float> %b, <2 x float> zeroinitializer, <1 x i32> <i32 1>
+  %0 = fsub fast <1 x float> %b.real, %a.imag
+  %1 = fadd fast <1 x float> %b.imag, %a.real
+  %interleaved.vec = shufflevector <1 x float> %0, <1 x float> %1, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <4 x float> @complex_add_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: complex_add_v4f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcadd.f32 q2, q1, q0, #90
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %0 = fsub fast <2 x float> %b.real, %a.imag
+  %1 = fadd fast <2 x float> %b.imag, %a.real
+  %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <8 x float> @complex_add_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: complex_add_v8f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vcadd.f32 q4, q2, q0, #90
+; CHECK-NEXT:    vcadd.f32 q2, q3, q1, #90
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vmov q1, q2
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %a.imag = shufflevector <8 x float> %a, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %b.real = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b.imag = shufflevector <8 x float> %b, <8 x float> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fsub fast <4 x float> %b.real, %a.imag
+  %1 = fadd fast <4 x float> %b.imag, %a.real
+  %interleaved.vec = shufflevector <4 x float> %0, <4 x float> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x float> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <16 x float> @complex_add_v16f32(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: complex_add_v16f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    add r3, sp, #64
+; CHECK-NEXT:    add r2, sp, #80
+; CHECK-NEXT:    vldrw.u32 q5, [r3]
+; CHECK-NEXT:    add r1, sp, #96
+; CHECK-NEXT:    add r0, sp, #112
+; CHECK-NEXT:    vcadd.f32 q4, q5, q0, #90
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vcadd.f32 q5, q0, q1, #90
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vmov q1, q5
+; CHECK-NEXT:    vcadd.f32 q6, q0, q2, #90
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov q2, q6
+; CHECK-NEXT:    vcadd.f32 q7, q0, q3, #90
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vmov q3, q7
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %a.imag = shufflevector <16 x float> %a, <16 x float> zeroinitializer, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %b.real = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %b.imag = shufflevector <16 x float> %b, <16 x float> zeroinitializer, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %0 = fsub fast <8 x float> %b.real, %a.imag
+  %1 = fadd fast <8 x float> %b.imag, %a.real
+  %interleaved.vec = shufflevector <8 x float> %0, <8 x float> %1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <16 x float> %interleaved.vec
+}

diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-mul.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-mul.ll
new file mode 100644
index 0000000000000..8c7039f82b888
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f32-mul.ll
@@ -0,0 +1,124 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+mve.fp -o - | FileCheck %s
+
+target triple = "thumbv8.1m.main-none-none-eabi"
+
+; Expected to not transform
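+; (the operands deinterleave to <1 x float> halves, too narrow for a 128-bit MVE vcmul)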
+define arm_aapcs_vfpcc <2 x float> @complex_mul_v2f32(<2 x float> %a, <2 x float> %b) {
+; CHECK-LABEL: complex_mul_v2f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmul.f32 s9, s5, s0
+; CHECK-NEXT:    vmul.f32 s8, s1, s5
+; CHECK-NEXT:    vfma.f32 s9, s4, s1
+; CHECK-NEXT:    vfnms.f32 s8, s4, s0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> <i32 0>
+  %a.imag = shufflevector <2 x float> %a, <2 x float> poison, <1 x i32> <i32 1>
+  %b.real = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> <i32 0>
+  %b.imag = shufflevector <2 x float> %b, <2 x float> poison, <1 x i32> <i32 1>
+  %0 = fmul fast <1 x float> %b.imag, %a.real
+  %1 = fmul fast <1 x float> %b.real, %a.imag
+  %2 = fadd fast <1 x float> %1, %0
+  %3 = fmul fast <1 x float> %b.real, %a.real
+  %4 = fmul fast <1 x float> %a.imag, %b.imag
+  %5 = fsub fast <1 x float> %3, %4
+  %interleaved.vec = shufflevector <1 x float> %5, <1 x float> %2, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <4 x float> @complex_mul_v4f32(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: complex_mul_v4f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcmul.f32 q2, q0, q1, #0
+; CHECK-NEXT:    vcmla.f32 q2, q0, q1, #90
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %b.imag, %a.real
+  %1 = fmul fast <2 x float> %b.real, %a.imag
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %b.real, %a.real
+  %4 = fmul fast <2 x float> %a.imag, %b.imag
+  %5 = fsub fast <2 x float> %3, %4
+  %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <8 x float> @complex_mul_v8f32(<8 x float> %a, <8 x float> %b) {
+; CHECK-LABEL: complex_mul_v8f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vcmul.f32 q4, q0, q2, #0
+; CHECK-NEXT:    vcmla.f32 q4, q0, q2, #90
+; CHECK-NEXT:    vcmul.f32 q2, q1, q3, #0
+; CHECK-NEXT:    vcmla.f32 q2, q1, q3, #90
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vmov q1, q2
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %a.imag = shufflevector <8 x float> %a, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %b.real = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b.imag = shufflevector <8 x float> %b, <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x float> %b.imag, %a.real
+  %1 = fmul fast <4 x float> %b.real, %a.imag
+  %2 = fadd fast <4 x float> %1, %0
+  %3 = fmul fast <4 x float> %b.real, %a.real
+  %4 = fmul fast <4 x float> %a.imag, %b.imag
+  %5 = fsub fast <4 x float> %3, %4
+  %interleaved.vec = shufflevector <4 x float> %5, <4 x float> %2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x float> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <16 x float> @complex_mul_v16f32(<16 x float> %a, <16 x float> %b) {
+; CHECK-LABEL: complex_mul_v16f32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    add r3, sp, #64
+; CHECK-NEXT:    add r2, sp, #80
+; CHECK-NEXT:    vldrw.u32 q5, [r3]
+; CHECK-NEXT:    add r1, sp, #96
+; CHECK-NEXT:    add r0, sp, #112
+; CHECK-NEXT:    vcmul.f32 q4, q0, q5, #0
+; CHECK-NEXT:    vcmla.f32 q4, q0, q5, #90
+; CHECK-NEXT:    vldrw.u32 q0, [r2]
+; CHECK-NEXT:    vcmul.f32 q5, q1, q0, #0
+; CHECK-NEXT:    vcmla.f32 q5, q1, q0, #90
+; CHECK-NEXT:    vldrw.u32 q0, [r1]
+; CHECK-NEXT:    vmov q1, q5
+; CHECK-NEXT:    vcmul.f32 q6, q2, q0, #0
+; CHECK-NEXT:    vcmla.f32 q6, q2, q0, #90
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmov q2, q6
+; CHECK-NEXT:    vcmul.f32 q7, q3, q0, #0
+; CHECK-NEXT:    vcmla.f32 q7, q3, q0, #90
+; CHECK-NEXT:    vmov q0, q4
+; CHECK-NEXT:    vmov q3, q7
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %a.imag = shufflevector <16 x float> %a, <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %b.real = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %b.imag = shufflevector <16 x float> %b, <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  %0 = fmul fast <8 x float> %b.imag, %a.real
+  %1 = fmul fast <8 x float> %b.real, %a.imag
+  %2 = fadd fast <8 x float> %1, %0
+  %3 = fmul fast <8 x float> %b.real, %a.real
+  %4 = fmul fast <8 x float> %a.imag, %b.imag
+  %5 = fsub fast <8 x float> %3, %4
+  %interleaved.vec = shufflevector <8 x float> %5, <8 x float> %2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <16 x float> %interleaved.vec
+}

diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-add.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-add.ll
new file mode 100644
index 0000000000000..15859cd6fa182
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-add.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s
+
+target triple = "thumbv8.1m.main-none-none-eabi"
+
+
+; Expected to not transform
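+; (MVE vcadd only supports f16 and f32, so the f64 cases remain scalar adds)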
+define arm_aapcs_vfpcc <2 x double> @complex_add_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: complex_add_v2f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vadd.f64 d3, d3, d0
+; CHECK-NEXT:    vsub.f64 d2, d2, d1
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> <i32 0>
+  %a.imag = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <1 x i32> <i32 1>
+  %b.real = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> <i32 0>
+  %b.imag = shufflevector <2 x double> %b, <2 x double> zeroinitializer, <1 x i32> <i32 1>
+  %0 = fsub fast <1 x double> %b.real, %a.imag
+  %1 = fadd fast <1 x double> %b.imag, %a.real
+  %interleaved.vec = shufflevector <1 x double> %0, <1 x double> %1, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %interleaved.vec
+}
+
+; Expected to not transform
+define arm_aapcs_vfpcc <4 x double> @complex_add_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: complex_add_v4f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vadd.f64 d5, d5, d0
+; CHECK-NEXT:    vsub.f64 d4, d4, d1
+; CHECK-NEXT:    vadd.f64 d7, d7, d2
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    vsub.f64 d6, d6, d3
+; CHECK-NEXT:    vmov q1, q3
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x double> %a, <4 x double> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x double> %b, <4 x double> zeroinitializer, <2 x i32> <i32 1, i32 3>
+  %0 = fsub fast <2 x double> %b.real, %a.imag
+  %1 = fadd fast <2 x double> %b.imag, %a.real
+  %interleaved.vec = shufflevector <2 x double> %0, <2 x double> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+
+; Expected to not transform
+define arm_aapcs_vfpcc <8 x double> @complex_add_v8f64(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: complex_add_v8f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    add r0, sp, #32
+; CHECK-NEXT:    vmov q4, q1
+; CHECK-NEXT:    vmov q1, q0
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    add r0, sp, #48
+; CHECK-NEXT:    vadd.f64 d1, d1, d2
+; CHECK-NEXT:    vsub.f64 d0, d0, d3
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    add r0, sp, #64
+; CHECK-NEXT:    vadd.f64 d3, d3, d8
+; CHECK-NEXT:    vsub.f64 d2, d2, d9
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    add r0, sp, #80
+; CHECK-NEXT:    vadd.f64 d9, d9, d4
+; CHECK-NEXT:    vsub.f64 d8, d8, d5
+; CHECK-NEXT:    vldrw.u32 q2, [r0]
+; CHECK-NEXT:    vadd.f64 d11, d5, d6
+; CHECK-NEXT:    vsub.f64 d10, d4, d7
+; CHECK-NEXT:    vmov q2, q4
+; CHECK-NEXT:    vmov q3, q5
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %a.imag = shufflevector <8 x double> %a, <8 x double> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %b.real = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b.imag = shufflevector <8 x double> %b, <8 x double> zeroinitializer, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fsub fast <4 x double> %b.real, %a.imag
+  %1 = fadd fast <4 x double> %b.imag, %a.real
+  %interleaved.vec = shufflevector <4 x double> %0, <4 x double> %1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x double> %interleaved.vec
+}

diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-mul.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-mul.ll
new file mode 100644
index 0000000000000..5334e29214617
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-f64-mul.ll
@@ -0,0 +1,127 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s
+
+target triple = "thumbv8.1m.main-none-none-eabi"
+
+; Expected to not transform
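+; (MVE vcmul/vcmla only support f16 and f32, so the f64 cases remain scalar)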
+define arm_aapcs_vfpcc <2 x double> @complex_mul_v2f64(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: complex_mul_v2f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmul.f64 d5, d3, d0
+; CHECK-NEXT:    vmul.f64 d4, d1, d3
+; CHECK-NEXT:    vfma.f64 d5, d2, d1
+; CHECK-NEXT:    vfnms.f64 d4, d2, d0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> <i32 0>
+  %a.imag = shufflevector <2 x double> %a, <2 x double> poison, <1 x i32> <i32 1>
+  %b.real = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> <i32 0>
+  %b.imag = shufflevector <2 x double> %b, <2 x double> poison, <1 x i32> <i32 1>
+  %0 = fmul fast <1 x double> %b.imag, %a.real
+  %1 = fmul fast <1 x double> %b.real, %a.imag
+  %2 = fadd fast <1 x double> %1, %0
+  %3 = fmul fast <1 x double> %b.real, %a.real
+  %4 = fmul fast <1 x double> %a.imag, %b.imag
+  %5 = fsub fast <1 x double> %3, %4
+  %interleaved.vec = shufflevector <1 x double> %5, <1 x double> %2, <2 x i32> <i32 0, i32 1>
+  ret <2 x double> %interleaved.vec
+}
+
+; Expected to not transform
+define arm_aapcs_vfpcc <4 x double> @complex_mul_v4f64(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: complex_mul_v4f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vmul.f64 d9, d7, d2
+; CHECK-NEXT:    vmov q5, q0
+; CHECK-NEXT:    vmul.f64 d8, d3, d7
+; CHECK-NEXT:    vfma.f64 d9, d6, d3
+; CHECK-NEXT:    vfnms.f64 d8, d6, d2
+; CHECK-NEXT:    vmul.f64 d1, d5, d10
+; CHECK-NEXT:    vmov q1, q4
+; CHECK-NEXT:    vmul.f64 d0, d11, d5
+; CHECK-NEXT:    vfma.f64 d1, d4, d11
+; CHECK-NEXT:    vfnms.f64 d0, d4, d10
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x double> %b, <4 x double> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x double> %b.imag, %a.real
+  %1 = fmul fast <2 x double> %b.real, %a.imag
+  %2 = fadd fast <2 x double> %1, %0
+  %3 = fmul fast <2 x double> %b.real, %a.real
+  %4 = fmul fast <2 x double> %a.imag, %b.imag
+  %5 = fsub fast <2 x double> %3, %4
+  %interleaved.vec = shufflevector <2 x double> %5, <2 x double> %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x double> %interleaved.vec
+}
+
+; Expected to not transform
+define arm_aapcs_vfpcc <8 x double> @complex_mul_v8f64(<8 x double> %a, <8 x double> %b) {
+; CHECK-LABEL: complex_mul_v8f64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #64
+; CHECK-NEXT:    sub sp, #64
+; CHECK-NEXT:    add r0, sp, #128
+; CHECK-NEXT:    vmov q7, q1
+; CHECK-NEXT:    vldrw.u32 q4, [r0]
+; CHECK-NEXT:    add r0, sp, #160
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov q6, q0
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    add r0, sp, #176
+; CHECK-NEXT:    vmov q5, q3
+; CHECK-NEXT:    vstrw.32 q2, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vmul.f64 d5, d3, d0
+; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q5, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vmul.f64 d4, d1, d3
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    vmov q0, q5
+; CHECK-NEXT:    add r0, sp, #144
+; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmul.f64 d11, d3, d0
+; CHECK-NEXT:    vmul.f64 d10, d1, d3
+; CHECK-NEXT:    vldrw.u32 q0, [r0]
+; CHECK-NEXT:    vmul.f64 d7, d9, d12
+; CHECK-NEXT:    vmul.f64 d2, d15, d1
+; CHECK-NEXT:    vmul.f64 d3, d1, d14
+; CHECK-NEXT:    vmul.f64 d6, d13, d9
+; CHECK-NEXT:    vfma.f64 d7, d8, d13
+; CHECK-NEXT:    vfnms.f64 d6, d8, d12
+; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vfma.f64 d3, d0, d15
+; CHECK-NEXT:    vfnms.f64 d2, d0, d14
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vfma.f64 d5, d0, d9
+; CHECK-NEXT:    vfnms.f64 d4, d0, d8
+; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vfma.f64 d11, d0, d9
+; CHECK-NEXT:    vfnms.f64 d10, d0, d8
+; CHECK-NEXT:    vmov q0, q3
+; CHECK-NEXT:    vmov q3, q5
+; CHECK-NEXT:    add sp, #64
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %a.imag = shufflevector <8 x double> %a, <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %b.real = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %b.imag = shufflevector <8 x double> %b, <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  %0 = fmul fast <4 x double> %b.imag, %a.real
+  %1 = fmul fast <4 x double> %b.real, %a.imag
+  %2 = fadd fast <4 x double> %1, %0
+  %3 = fmul fast <4 x double> %b.real, %a.real
+  %4 = fmul fast <4 x double> %a.imag, %b.imag
+  %5 = fsub fast <4 x double> %3, %4
+  %interleaved.vec = shufflevector <4 x double> %5, <4 x double> %2, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  ret <8 x double> %interleaved.vec
+}

diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
new file mode 100644
index 0000000000000..b16b06bd45023
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
@@ -0,0 +1,387 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s
+
+target triple = "thumbv8.1m.main-none-none-eabi"
+
+; Expected to transform
+define arm_aapcs_vfpcc <4 x float> @mul_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_mul:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcmul.f32 q3, q0, q1, #0
+; CHECK-NEXT:    vcmla.f32 q3, q0, q1, #90
+; CHECK-NEXT:    vcmul.f32 q0, q3, q2, #0
+; CHECK-NEXT:    vcmla.f32 q0, q3, q2, #90
+; CHECK-NEXT:    bx lr
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec151 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec153 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec154 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec154, %strided.vec151
+  %1 = fmul fast <2 x float> %strided.vec153, %strided.vec
+  %2 = fmul fast <2 x float> %strided.vec154, %strided.vec
+  %3 = fmul fast <2 x float> %strided.vec153, %strided.vec151
+  %4 = fadd fast <2 x float> %3, %2
+  %5 = fsub fast <2 x float> %1, %0
+  %strided.vec156 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec157 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %6 = fmul fast <2 x float> %4, %strided.vec156
+  %7 = fmul fast <2 x float> %5, %strided.vec157
+  %8 = fadd fast <2 x float> %6, %7
+  %9 = fmul fast <2 x float> %strided.vec156, %5
+  %10 = fmul fast <2 x float> %4, %strided.vec157
+  %11 = fsub fast <2 x float> %9, %10
+  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform
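+; (the real/imag inputs are taken from full-width fsubs, so the multiply pattern is likely not matched)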
+define arm_aapcs_vfpcc <4 x float> @add_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: add_mul:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vsub.f32 q3, q1, q2
+; CHECK-NEXT:    vsub.f32 q0, q1, q0
+; CHECK-NEXT:    vmov.f32 s16, s9
+; CHECK-NEXT:    vmov.f32 s13, s14
+; CHECK-NEXT:    vmov.f32 s17, s11
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vmul.f32 q1, q3, q4
+; CHECK-NEXT:    vmov.f32 s1, s3
+; CHECK-NEXT:    vmov.f32 s9, s10
+; CHECK-NEXT:    vfma.f32 q1, q2, q0
+; CHECK-NEXT:    vmul.f32 q0, q4, q0
+; CHECK-NEXT:    vneg.f32 q4, q0
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vfma.f32 q4, q2, q3
+; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vmov.f32 s0, s16
+; CHECK-NEXT:    vmov.f32 s2, s17
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = fsub fast <4 x float> %b, %c
+  %1 = shufflevector <4 x float> %0, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec58 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec59 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %2 = fmul fast <2 x float> %1, %strided.vec59
+  %3 = fsub fast <4 x float> %b, %a
+  %4 = shufflevector <4 x float> %3, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %5 = fmul fast <2 x float> %strided.vec58, %4
+  %6 = fadd fast <2 x float> %5, %2
+  %7 = fmul fast <2 x float> %strided.vec58, %1
+  %8 = fmul fast <2 x float> %strided.vec59, %4
+  %9 = fsub fast <2 x float> %7, %8
+  %interleaved.vec = shufflevector <2 x float> %9, <2 x float> %6, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform
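+; (the inner b*c product is rotated by 270 degrees, a combination the pass presumably does not yet fold)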
+define arm_aapcs_vfpcc <4 x float> @mul_mul270_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_mul270_mul:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d12}
+; CHECK-NEXT:    vpush {d12}
+; CHECK-NEXT:    .vsave {d10}
+; CHECK-NEXT:    vpush {d10}
+; CHECK-NEXT:    .vsave {d8}
+; CHECK-NEXT:    vpush {d8}
+; CHECK-NEXT:    vmov.f32 s20, s4
+; CHECK-NEXT:    vmov.f32 s16, s8
+; CHECK-NEXT:    vmov.f32 s17, s10
+; CHECK-NEXT:    vmov.f32 s21, s6
+; CHECK-NEXT:    vmul.f32 q3, q5, q4
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vneg.f32 q3, q3
+; CHECK-NEXT:    vmov.f32 s24, s9
+; CHECK-NEXT:    vmov.f32 s25, s11
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vmul.f32 q2, q1, q4
+; CHECK-NEXT:    vmov.f32 s16, s0
+; CHECK-NEXT:    vfma.f32 q3, q1, q6
+; CHECK-NEXT:    vmov.f32 s17, s2
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vfma.f32 q2, q5, q6
+; CHECK-NEXT:    vmul.f32 q1, q3, q4
+; CHECK-NEXT:    vmov.f32 s1, s3
+; CHECK-NEXT:    vfma.f32 q1, q2, q0
+; CHECK-NEXT:    vmul.f32 q0, q3, q0
+; CHECK-NEXT:    vneg.f32 q3, q0
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vfma.f32 q3, q2, q4
+; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vmov.f32 s0, s12
+; CHECK-NEXT:    vmov.f32 s2, s13
+; CHECK-NEXT:    vpop {d8}
+; CHECK-NEXT:    vpop {d10}
+; CHECK-NEXT:    vpop {d12}
+; CHECK-NEXT:    bx lr
+entry:
+  %strided.vec = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec81 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec83 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec84 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec84, %strided.vec
+  %1 = fmul fast <2 x float> %strided.vec83, %strided.vec81
+  %2 = fadd fast <2 x float> %1, %0
+  %strided.vec86 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec87 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %3 = fmul fast <2 x float> %2, %strided.vec87
+  %4 = fmul fast <2 x float> %strided.vec84, %strided.vec81
+  %5 = fmul fast <2 x float> %strided.vec83, %strided.vec
+  %6 = fsub fast <2 x float> %4, %5
+  %7 = fmul fast <2 x float> %6, %strided.vec86
+  %8 = fadd fast <2 x float> %3, %7
+  %9 = fmul fast <2 x float> %2, %strided.vec86
+  %10 = fmul fast <2 x float> %6, %strided.vec87
+  %11 = fsub fast <2 x float> %9, %10
+  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; (a * b) * a
+; Expected to transform
+define arm_aapcs_vfpcc <4 x float> @mul_triangle(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: mul_triangle:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcmul.f32 q2, q1, q0, #0
+; CHECK-NEXT:    vcmla.f32 q2, q1, q0, #90
+; CHECK-NEXT:    vcmul.f32 q1, q0, q2, #0
+; CHECK-NEXT:    vcmla.f32 q1, q0, q2, #90
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec37, %strided.vec
+  %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35
+  %2 = fsub fast <2 x float> %0, %1
+  %3 = fmul fast <2 x float> %2, %strided.vec35
+  %4 = fmul fast <2 x float> %strided.vec38, %strided.vec
+  %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37
+  %6 = fadd fast <2 x float> %4, %5
+  %7 = fmul fast <2 x float> %6, %strided.vec
+  %8 = fadd fast <2 x float> %3, %7
+  %9 = fmul fast <2 x float> %2, %strided.vec
+  %10 = fmul fast <2 x float> %6, %strided.vec35
+  %11 = fsub fast <2 x float> %9, %10
+  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+
+; d * (b * a) * (c * a)
+; Expected to transform
+define arm_aapcs_vfpcc <4 x float> @mul_diamond(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d) {
+; CHECK-LABEL: mul_diamond:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vcmul.f32 q4, q1, q0, #0
+; CHECK-NEXT:    vcmla.f32 q4, q1, q0, #90
+; CHECK-NEXT:    vcmul.f32 q1, q4, q3, #0
+; CHECK-NEXT:    vcmla.f32 q1, q4, q3, #90
+; CHECK-NEXT:    vcmul.f32 q3, q2, q0, #0
+; CHECK-NEXT:    vcmla.f32 q3, q2, q0, #90
+; CHECK-NEXT:    vcmul.f32 q0, q3, q1, #0
+; CHECK-NEXT:    vcmla.f32 q0, q3, q1, #90
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %c.real = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %c.imag = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %d.real = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %d.imag = shufflevector <4 x float> %d, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %a.imag, %b.real
+  %1 = fmul fast <2 x float> %a.real, %b.imag
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %a.real, %b.real
+  %4 = fmul fast <2 x float> %b.imag, %a.imag
+  %5 = fsub fast <2 x float> %3, %4
+  %6 = fmul fast <2 x float> %d.real, %5
+  %7 = fmul fast <2 x float> %2, %d.imag
+  %8 = fmul fast <2 x float> %d.real, %2
+  %9 = fmul fast <2 x float> %5, %d.imag
+  %10 = fsub fast <2 x float> %6, %7
+  %11 = fadd fast <2 x float> %8, %9
+  %12 = fmul fast <2 x float> %c.real, %a.imag
+  %13 = fmul fast <2 x float> %c.imag, %a.real
+  %14 = fadd fast <2 x float> %13, %12
+  %15 = fmul fast <2 x float> %14, %10
+  %16 = fmul fast <2 x float> %c.real, %a.real
+  %17 = fmul fast <2 x float> %c.imag, %a.imag
+  %18 = fsub fast <2 x float> %16, %17
+  %19 = fmul fast <2 x float> %18, %11
+  %20 = fadd fast <2 x float> %15, %19
+  %21 = fmul fast <2 x float> %18, %10
+  %22 = fmul fast <2 x float> %14, %11
+  %23 = fsub fast <2 x float> %21, %22
+  %interleaved.vec = shufflevector <2 x float> %23, <2 x float> %20, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <4 x float> @mul_add90_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_add90_mul:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vcmul.f32 q3, q2, q0, #0
+; CHECK-NEXT:    vcmul.f32 q4, q1, q0, #0
+; CHECK-NEXT:    vcmla.f32 q4, q1, q0, #90
+; CHECK-NEXT:    vcmla.f32 q3, q2, q0, #90
+; CHECK-NEXT:    vcadd.f32 q0, q3, q4, #90
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %bi = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+
+  %i6 = fmul fast <2 x float> %br, %ar
+  %i7 = fmul fast <2 x float> %bi, %ai
+  %xr = fsub fast <2 x float> %i6, %i7
+  %i9 = fmul fast <2 x float> %bi, %ar
+  %i10 = fmul fast <2 x float> %br, %ai
+  %xi = fadd fast <2 x float> %i9, %i10
+
+  %j6 = fmul fast <2 x float> %cr, %ar
+  %j7 = fmul fast <2 x float> %ci, %ai
+  %yr = fsub fast <2 x float> %j6, %j7
+  %j9 = fmul fast <2 x float> %ci, %ar
+  %j10 = fmul fast <2 x float> %cr, %ai
+  %yi = fadd fast <2 x float> %j9, %j10
+
+  %zr = fsub fast <2 x float> %yr, %xi
+  %zi = fadd fast <2 x float> %yi, %xr
+  %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform
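+; (the partial products %i6 and %i9 are shared between both multiplies; see the commented-out lines below)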
+define arm_aapcs_vfpcc <4 x float> @mul_triangle_addmul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: mul_triangle_addmul:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vmov.f32 s16, s0
+; CHECK-NEXT:    vmov.f32 s20, s5
+; CHECK-NEXT:    vmov.f32 s17, s2
+; CHECK-NEXT:    vmov.f32 s21, s7
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmul.f32 q3, q5, q4
+; CHECK-NEXT:    vmul.f32 q4, q1, q4
+; CHECK-NEXT:    vmov.f32 s0, s1
+; CHECK-NEXT:    vmov.f32 s1, s3
+; CHECK-NEXT:    vmov q6, q4
+; CHECK-NEXT:    vfms.f32 q6, q5, q0
+; CHECK-NEXT:    vmov q7, q3
+; CHECK-NEXT:    vfma.f32 q3, q1, q0
+; CHECK-NEXT:    vmov.f32 s20, s8
+; CHECK-NEXT:    vmov.f32 s21, s10
+; CHECK-NEXT:    vmov.f32 s4, s9
+; CHECK-NEXT:    vfma.f32 q7, q5, q0
+; CHECK-NEXT:    vmov.f32 s5, s11
+; CHECK-NEXT:    vadd.f32 q5, q7, q6
+; CHECK-NEXT:    vfms.f32 q4, q1, q0
+; CHECK-NEXT:    vmov.f32 s1, s20
+; CHECK-NEXT:    vsub.f32 q1, q4, q3
+; CHECK-NEXT:    vmov.f32 s3, s21
+; CHECK-NEXT:    vmov.f32 s0, s4
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %ar = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %ai = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %br = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %bi = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %cr = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %ci = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+
+  %i6 = fmul fast <2 x float> %br, %ar
+  %i7 = fmul fast <2 x float> %bi, %ai
+  %xr = fsub fast <2 x float> %i6, %i7
+  %i9 = fmul fast <2 x float> %bi, %ar
+  %i10 = fmul fast <2 x float> %br, %ai
+  %xi = fadd fast <2 x float> %i9, %i10
+
+  ;%j6 = fmul fast <2 x float> %cr, %ar
+  %j7 = fmul fast <2 x float> %ci, %ai
+  %yr = fsub fast <2 x float> %i6, %j7
+  ;%j9 = fmul fast <2 x float> %ci, %ar
+  %j10 = fmul fast <2 x float> %cr, %ai
+  %yi = fadd fast <2 x float> %i9, %j10
+
+  %zr = fsub fast <2 x float> %yr, %xi
+  %zi = fadd fast <2 x float> %yi, %xr
+  %interleaved.vec = shufflevector <2 x float> %zr, <2 x float> %zi, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform
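+; (the intermediate products also feed the store to %p, so they cannot simply be replaced)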
+define arm_aapcs_vfpcc <4 x float> @mul_triangle_multiuses(<4 x float> %a, <4 x float> %b, ptr %p) {
+; CHECK-LABEL: mul_triangle_multiuses:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
+; CHECK-NEXT:    vmov.f32 s16, s4
+; CHECK-NEXT:    vmov.f32 s8, s1
+; CHECK-NEXT:    vmov.f32 s17, s6
+; CHECK-NEXT:    vmov.f32 s9, s3
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmul.f32 q3, q2, q4
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vfma.f32 q3, q1, q0
+; CHECK-NEXT:    vmul.f32 q1, q1, q2
+; CHECK-NEXT:    vneg.f32 q1, q1
+; CHECK-NEXT:    vfma.f32 q1, q4, q0
+; CHECK-NEXT:    vmov.f32 s18, s12
+; CHECK-NEXT:    vmov.f32 s16, s4
+; CHECK-NEXT:    vmov.f32 s17, s5
+; CHECK-NEXT:    vmov.f32 s19, s13
+; CHECK-NEXT:    vstrw.32 q4, [r0]
+; CHECK-NEXT:    vmul.f32 q4, q3, q0
+; CHECK-NEXT:    vfma.f32 q4, q1, q2
+; CHECK-NEXT:    vmul.f32 q2, q3, q2
+; CHECK-NEXT:    vneg.f32 q2, q2
+; CHECK-NEXT:    vfma.f32 q2, q1, q0
+; CHECK-NEXT:    vmov.f32 s1, s16
+; CHECK-NEXT:    vmov.f32 s0, s8
+; CHECK-NEXT:    vmov.f32 s2, s9
+; CHECK-NEXT:    vmov.f32 s3, s17
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    bx lr
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec35 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec37 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec38 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec37, %strided.vec
+  %1 = fmul fast <2 x float> %strided.vec38, %strided.vec35
+  %2 = fsub fast <2 x float> %0, %1
+  %3 = fmul fast <2 x float> %2, %strided.vec35
+  %4 = fmul fast <2 x float> %strided.vec38, %strided.vec
+  %5 = fmul fast <2 x float> %strided.vec35, %strided.vec37
+  %6 = fadd fast <2 x float> %4, %5
+  %otheruse = shufflevector <2 x float> %2, <2 x float> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  store <4 x float> %otheruse, ptr %p
+  %7 = fmul fast <2 x float> %6, %strided.vec
+  %8 = fadd fast <2 x float> %3, %7
+  %9 = fmul fast <2 x float> %2, %strided.vec
+  %10 = fmul fast <2 x float> %6, %strided.vec35
+  %11 = fsub fast <2 x float> %9, %10
+  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}

diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll
new file mode 100644
index 0000000000000..38c56c674267f
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll
@@ -0,0 +1,316 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+mve.fp,+fp64 -o - | FileCheck %s
+
+target triple = "thumbv8.1m.main-none-none-eabi"
+
+; Expected to transform
+define arm_aapcs_vfpcc <4 x float> @simple_mul(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: simple_mul:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcmul.f32 q2, q0, q1, #0
+; CHECK-NEXT:    vcmla.f32 q2, q0, q1, #90
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec20, %strided.vec
+  %1 = fmul fast <2 x float> %strided.vec19, %strided.vec17
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %strided.vec19, %strided.vec
+  %4 = fmul fast <2 x float> %strided.vec17, %strided.vec20
+  %5 = fsub fast <2 x float> %3, %4
+  %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform
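+; (the final fsub lacks the fast flag, so contracting it into a vcmla would not be sound)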
+define arm_aapcs_vfpcc <4 x float> @simple_mul_no_contract(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: simple_mul_no_contract:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11}
+; CHECK-NEXT:    vmov.f32 s8, s5
+; CHECK-NEXT:    vmov.f32 s12, s1
+; CHECK-NEXT:    vmov.f32 s9, s7
+; CHECK-NEXT:    vmov.f32 s13, s3
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vmul.f32 q4, q3, q2
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmul.f32 q2, q2, q0
+; CHECK-NEXT:    vmul.f32 q5, q1, q0
+; CHECK-NEXT:    vfma.f32 q2, q1, q3
+; CHECK-NEXT:    vsub.f32 q4, q5, q4
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vmov.f32 s0, s16
+; CHECK-NEXT:    vmov.f32 s2, s17
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    vpop {d8, d9, d10, d11}
+; CHECK-NEXT:    bx lr
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec20, %strided.vec
+  %1 = fmul fast <2 x float> %strided.vec19, %strided.vec17
+  %2 = fadd fast <2 x float> %1, %0
+  %3 = fmul fast <2 x float> %strided.vec19, %strided.vec
+  %4 = fmul fast <2 x float> %strided.vec17, %strided.vec20
+  %5 = fsub <2 x float> %3, %4
+  %interleaved.vec = shufflevector <2 x float> %5, <2 x float> %2, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <4 x float> @three_way_mul(<4 x float> %a, <4 x float> %b, <4 x float> %c) {
+; CHECK-LABEL: three_way_mul:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcmul.f32 q3, q1, q0, #0
+; CHECK-NEXT:    vcmla.f32 q3, q1, q0, #90
+; CHECK-NEXT:    vcmul.f32 q0, q2, q3, #0
+; CHECK-NEXT:    vcmla.f32 q0, q2, q3, #90
+; CHECK-NEXT:    bx lr
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec39 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec41 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec42 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec44 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec45 = shufflevector <4 x float> %c, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fmul fast <2 x float> %strided.vec41, %strided.vec
+  %1 = fmul fast <2 x float> %strided.vec42, %strided.vec39
+  %2 = fsub fast <2 x float> %0, %1
+  %3 = fmul fast <2 x float> %2, %strided.vec45
+  %4 = fmul fast <2 x float> %strided.vec42, %strided.vec
+  %5 = fmul fast <2 x float> %strided.vec39, %strided.vec41
+  %6 = fadd fast <2 x float> %4, %5
+  %7 = fmul fast <2 x float> %6, %strided.vec44
+  %8 = fadd fast <2 x float> %3, %7
+  %9 = fmul fast <2 x float> %2, %strided.vec44
+  %10 = fmul fast <2 x float> %6, %strided.vec45
+  %11 = fsub fast <2 x float> %9, %10
+  %interleaved.vec = shufflevector <2 x float> %11, <2 x float> %8, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <4 x float> @simple_add_90(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: simple_add_90:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcadd.f32 q2, q1, q0, #90
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fsub fast <2 x float> %strided.vec19, %strided.vec17
+  %1 = fadd fast <2 x float> %strided.vec20, %strided.vec
+  %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform: fadd commutativity is not yet implemented
+define arm_aapcs_vfpcc <4 x float> @simple_add_270_false(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: simple_add_270_false:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s12, s1
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s9, s6
+; CHECK-NEXT:    vmov.f32 s13, s3
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vsub.f32 q2, q3, q2
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vadd.f32 q1, q1, q0
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vmov.f32 s0, s4
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    bx lr
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fadd fast <2 x float> %strided.vec20, %strided.vec
+  %1 = fsub fast <2 x float> %strided.vec17, %strided.vec19
+  %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define arm_aapcs_vfpcc <4 x float> @simple_add_270_true(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: simple_add_270_true:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcadd.f32 q2, q0, q1, #270
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec17 = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %strided.vec19 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %strided.vec20 = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fadd fast <2 x float> %strided.vec, %strided.vec20
+  %1 = fsub fast <2 x float> %strided.vec17, %strided.vec19
+  %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform
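+; (%0 is reused by the trailing shufflevector, giving the partial result an external use)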
+define arm_aapcs_vfpcc <4 x float> @add_external_use(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: add_external_use:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmov.f32 s8, s4
+; CHECK-NEXT:    vmov.f32 s12, s1
+; CHECK-NEXT:    vmov.f32 s4, s5
+; CHECK-NEXT:    vmov.f32 s9, s6
+; CHECK-NEXT:    vmov.f32 s13, s3
+; CHECK-NEXT:    vmov.f32 s5, s7
+; CHECK-NEXT:    vadd.f32 q2, q3, q2
+; CHECK-NEXT:    vmov.f32 s1, s2
+; CHECK-NEXT:    vsub.f32 q1, q0, q1
+; CHECK-NEXT:    vmov.f32 s1, s8
+; CHECK-NEXT:    vmov.f32 s0, s4
+; CHECK-NEXT:    vmov.f32 s2, s5
+; CHECK-NEXT:    vmov.f32 s3, s9
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fsub fast <2 x float> %a.real, %b.imag
+  %1 = fadd fast <2 x float> %a.imag, %b.real
+  %interleaved.vec = shufflevector <2 x float> %0, <2 x float> %1, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  %dup = shufflevector <2 x float> %0, <2 x float> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+  %interleaved.vec2 = shufflevector <4 x float> %interleaved.vec, <4 x float> %dup, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %interleaved.vec2
+}
+
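+; Expected to transform (the fneg is absorbed into the #180/#270 rotations)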
+define arm_aapcs_vfpcc <4 x float> @mul_mul_with_fneg(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: mul_mul_with_fneg:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vcmul.f32 q2, q1, q0, #270
+; CHECK-NEXT:    vcmla.f32 q2, q1, q0, #180
+; CHECK-NEXT:    vmov q0, q2
+; CHECK-NEXT:    bx lr
+entry:
+  %a.real = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %a.imag = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %b.real = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 0, i32 2>
+  %b.imag = shufflevector <4 x float> %b, <4 x float> poison, <2 x i32> <i32 1, i32 3>
+  %0 = fneg fast <2 x float> %a.imag
+  %1 = fmul fast <2 x float> %b.real, %0
+  %2 = fmul fast <2 x float> %a.real, %b.imag
+  %3 = fsub fast <2 x float> %1, %2
+  %4 = fmul fast <2 x float> %b.imag, %a.imag
+  %5 = fmul fast <2 x float> %a.real, %b.real
+  %6 = fsub fast <2 x float> %4, %5
+  %interleaved.vec = shufflevector <2 x float> %6, <2 x float> %3, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+  ret <4 x float> %interleaved.vec
+}
+
+; Expected to not transform
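+; (the <12 x float> operands deinterleave to 6-element halves, which likely do not map evenly onto 128-bit MVE registers)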
+define arm_aapcs_vfpcc <12 x float> @abp90c12(<12 x float> %a, <12 x float> %b, <12 x float> %c) {
+; CHECK-LABEL: abp90c12:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    .pad #48
+; CHECK-NEXT:    sub sp, #48
+; CHECK-NEXT:    vldr s23, [sp, #124]
+; CHECK-NEXT:    vmov.f32 s20, s13
+; CHECK-NEXT:    vldr s22, [sp, #116]
+; CHECK-NEXT:    vmov.f32 s25, s11
+; CHECK-NEXT:    vmov.f32 s13, s10
+; CHECK-NEXT:    vldr s19, [sp, #120]
+; CHECK-NEXT:    vmov.f32 s11, s6
+; CHECK-NEXT:    vldr s18, [sp, #112]
+; CHECK-NEXT:    vmov.f32 s6, s5
+; CHECK-NEXT:    vldr s31, [sp, #172]
+; CHECK-NEXT:    vmov.f32 s10, s4
+; CHECK-NEXT:    vldr s30, [sp, #164]
+; CHECK-NEXT:    vmov.f32 s21, s15
+; CHECK-NEXT:    vldr s29, [sp, #156]
+; CHECK-NEXT:    vmov.f32 s5, s3
+; CHECK-NEXT:    vldr s28, [sp, #148]
+; CHECK-NEXT:    vmov.f32 s4, s1
+; CHECK-NEXT:    vmov.f32 s24, s9
+; CHECK-NEXT:    vmov.f32 s16, s12
+; CHECK-NEXT:    vstrw.32 q6, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmov.f32 s12, s8
+; CHECK-NEXT:    vldr s27, [sp, #168]
+; CHECK-NEXT:    vmov.f32 s17, s14
+; CHECK-NEXT:    vldr s26, [sp, #160]
+; CHECK-NEXT:    vmov.f32 s9, s2
+; CHECK-NEXT:    vldr s25, [sp, #152]
+; CHECK-NEXT:    vmov.f32 s8, s0
+; CHECK-NEXT:    vmul.f32 q0, q5, q1
+; CHECK-NEXT:    vmul.f32 q1, q4, q1
+; CHECK-NEXT:    vneg.f32 q0, q0
+; CHECK-NEXT:    vldr s24, [sp, #144]
+; CHECK-NEXT:    vfma.f32 q1, q5, q2
+; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vstrw.32 q3, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT:    vsub.f32 q6, q6, q1
+; CHECK-NEXT:    vldrw.u32 q1, [sp] @ 16-byte Reload
+; CHECK-NEXT:    vldr s13, [sp, #140]
+; CHECK-NEXT:    vfma.f32 q1, q4, q2
+; CHECK-NEXT:    vldr s12, [sp, #132]
+; CHECK-NEXT:    vadd.f32 q1, q7, q1
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldr s1, [sp, #136]
+; CHECK-NEXT:    vstrw.32 q3, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT:    vmul.f32 q2, q3, q7
+; CHECK-NEXT:    vldr s0, [sp, #128]
+; CHECK-NEXT:    vldrw.u32 q3, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vneg.f32 q2, q2
+; CHECK-NEXT:    vldr s21, [sp, #184]
+; CHECK-NEXT:    vfma.f32 q2, q0, q3
+; CHECK-NEXT:    vmul.f32 q0, q0, q7
+; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vldr s20, [sp, #176]
+; CHECK-NEXT:    vldr s17, [sp, #188]
+; CHECK-NEXT:    vldr s16, [sp, #180]
+; CHECK-NEXT:    vfma.f32 q0, q7, q3
+; CHECK-NEXT:    vsub.f32 q3, q5, q0
+; CHECK-NEXT:    vmov.f32 s1, s4
+; CHECK-NEXT:    vadd.f32 q4, q4, q2
+; CHECK-NEXT:    vmov.f32 s3, s5
+; CHECK-NEXT:    vmov.f32 s5, s6
+; CHECK-NEXT:    vmov.f32 s0, s24
+; CHECK-NEXT:    vmov.f32 s2, s25
+; CHECK-NEXT:    vmov.f32 s4, s26
+; CHECK-NEXT:    vmov.f32 s6, s27
+; CHECK-NEXT:    vmov.f32 s8, s12
+; CHECK-NEXT:    vmov.f32 s9, s16
+; CHECK-NEXT:    vmov.f32 s10, s13
+; CHECK-NEXT:    vmov.f32 s11, s17
+; CHECK-NEXT:    add sp, #48
+; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT:    bx lr
+entry:
+  %ar = shufflevector <12 x float> %a, <12 x float> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10>
+  %ai = shufflevector <12 x float> %a, <12 x float> poison, <6 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
+  %br = shufflevector <12 x float> %b, <12 x float> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10>
+  %bi = shufflevector <12 x float> %b, <12 x float> poison, <6 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
+  %cr = shufflevector <12 x float> %c, <12 x float> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10>
+  %ci = shufflevector <12 x float> %c, <12 x float> poison, <6 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11>
+
+  %i6 = fmul fast <6 x float> %br, %ar
+  %i7 = fmul fast <6 x float> %bi, %ai
+  %xr = fsub fast <6 x float> %i6, %i7
+  %i9 = fmul fast <6 x float> %bi, %ar
+  %i10 = fmul fast <6 x float> %br, %ai
+  %xi = fadd fast <6 x float> %i9, %i10
+
+  %zr = fsub fast <6 x float> %cr, %xi
+  %zi = fadd fast <6 x float> %ci, %xr
+  %interleaved.vec = shufflevector <6 x float> %zr, <6 x float> %zi, <12 x i32> <i32 0, i32 6, i32 1, i32 7, i32 2, i32 8, i32 3, i32 9, i32 4, i32 10, i32 5, i32 11>
+  ret <12 x float> %interleaved.vec
+}
