[llvm] 6850bc3 - [CodeGen] Enable AArch64 SVE FCMLA/FCADD instruction generation in ComplexDeinterleaving
Igor Kirillov via llvm-commits
llvm-commits at lists.llvm.org
Fri Apr 21 03:29:33 PDT 2023
Author: Igor Kirillov
Date: 2023-04-21T09:58:35Z
New Revision: 6850bc35c6b5420451ad96188870f069a6d35784
URL: https://github.com/llvm/llvm-project/commit/6850bc35c6b5420451ad96188870f069a6d35784
DIFF: https://github.com/llvm/llvm-project/commit/6850bc35c6b5420451ad96188870f069a6d35784.diff
LOG: [CodeGen] Enable AArch64 SVE FCMLA/FCADD instruction generation in ComplexDeinterleaving
This commit adds support for scalable vector types in the ComplexDeinterleaving
pass, allowing it to recognize and handle `llvm.vector.interleave2` and
`llvm.vector.deinterleave2` intrinsics for both fixed and scalable vectors.
Differential Revision: https://reviews.llvm.org/D147451
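For illustration, this is the scalable-vector IR shape the pass now matches (a
minimal sketch adapted from the added complex-deinterleaving-f32-add-scalable.ll
test; the function and value names are illustrative, not part of the commit):

define <vscale x 4 x float> @example_complex_add(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
entry:
  ; Deinterleave each operand into its real and imaginary lanes.
  %a.de = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %a)
  %a.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.de, 0
  %a.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.de, 1
  %b.de = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %b)
  %b.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.de, 0
  %b.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.de, 1
  ; Complex add with a 90-degree rotation: re = b.re - a.im, im = b.im + a.re.
  %re = fsub fast <vscale x 2 x float> %b.real, %a.imag
  %im = fadd fast <vscale x 2 x float> %b.imag, %a.real
  ; Re-interleave the result; with +sve this whole sequence lowers to one FCADD.
  %res = tail call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %re, <vscale x 2 x float> %im)
  ret <vscale x 4 x float> %res
}

declare { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)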
Added:
llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
Modified:
llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
index 3d11bf3651a36..d9b8cc2258314 100644
--- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -38,7 +38,7 @@ enum class ComplexDeinterleavingOperation {
CMulPartial,
// The following 'operations' are used to represent internal states. Backends
// are not expected to try and support these in any capacity.
- Shuffle,
+ Deinterleave,
Symmetric
};
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index b7fc502b9a2ee..f2599fc05b478 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -18,6 +18,11 @@
// pairs. Validity of each node is expected to be done upon creation, and any
// validation errors should halt traversal and prevent further graph
// construction.
+// Instead of relying on Shuffle operations, vector interleaving and
+// deinterleaving can be represented by vector.interleave2 and
+// vector.deinterleave2 intrinsics. Scalable vectors can be represented only by
+// these intrinsics, whereas fixed-width vectors are recognized in both
+// shufflevector and intrinsic form.
//
// Replacement:
// This step traverses the graph built up by identification, delegating to the
@@ -250,6 +255,17 @@ class ComplexDeinterleavingGraph {
NodePtr identifyNode(Instruction *I, Instruction *J);
+ NodePtr identifyRoot(Instruction *I);
+
+ /// Identifies the Deinterleave operation applied to a vector containing
+ /// complex numbers. There are two ways to represent the Deinterleave
+ /// operation:
+ /// * Using two shufflevectors with even indices for the \p Real instruction
+ /// and odd indices for the \p Imag instruction (only for fixed-width vectors)
+ /// * Using two extractvalue instructions applied to `vector.deinterleave2`
+ /// intrinsic (for both fixed and scalable vectors)
+ NodePtr identifyDeinterleave(Instruction *Real, Instruction *Imag);
+
Value *replaceNode(RawNodePtr Node);
public:
@@ -365,19 +381,8 @@ static bool isDeinterleavingMask(ArrayRef<int> Mask) {
bool ComplexDeinterleaving::evaluateBasicBlock(BasicBlock *B) {
ComplexDeinterleavingGraph Graph(TL, TLI);
-
- for (auto &I : *B) {
- auto *SVI = dyn_cast<ShuffleVectorInst>(&I);
- if (!SVI)
- continue;
-
- // Look for a shufflevector that takes separate vectors of the real and
- // imaginary components and recombines them into a single vector.
- if (!isInterleavingMask(SVI->getShuffleMask()))
- continue;
-
- Graph.identifyNodes(SVI);
- }
+ for (auto &I : *B)
+ Graph.identifyNodes(&I);
if (Graph.checkNodes()) {
Graph.replaceNodes();
@@ -748,100 +753,12 @@ ComplexDeinterleavingGraph::identifyNode(Instruction *Real, Instruction *Imag) {
return CN;
}
- auto *RealShuffle = dyn_cast<ShuffleVectorInst>(Real);
- auto *ImagShuffle = dyn_cast<ShuffleVectorInst>(Imag);
- if (RealShuffle && ImagShuffle) {
- Value *RealOp1 = RealShuffle->getOperand(1);
- if (!isa<UndefValue>(RealOp1) && !isa<ConstantAggregateZero>(RealOp1)) {
- LLVM_DEBUG(dbgs() << " - RealOp1 is not undef or zero.\n");
- return nullptr;
- }
- Value *ImagOp1 = ImagShuffle->getOperand(1);
- if (!isa<UndefValue>(ImagOp1) && !isa<ConstantAggregateZero>(ImagOp1)) {
- LLVM_DEBUG(dbgs() << " - ImagOp1 is not undef or zero.\n");
- return nullptr;
- }
-
- Value *RealOp0 = RealShuffle->getOperand(0);
- Value *ImagOp0 = ImagShuffle->getOperand(0);
-
- if (RealOp0 != ImagOp0) {
- LLVM_DEBUG(dbgs() << " - Shuffle operands are not equal.\n");
- return nullptr;
- }
-
- ArrayRef<int> RealMask = RealShuffle->getShuffleMask();
- ArrayRef<int> ImagMask = ImagShuffle->getShuffleMask();
- if (!isDeinterleavingMask(RealMask) || !isDeinterleavingMask(ImagMask)) {
- LLVM_DEBUG(dbgs() << " - Masks are not deinterleaving.\n");
- return nullptr;
- }
-
- if (RealMask[0] != 0 || ImagMask[0] != 1) {
- LLVM_DEBUG(dbgs() << " - Masks do not have the correct initial value.\n");
- return nullptr;
- }
-
- // Type checking, the shuffle type should be a vector type of the same
- // scalar type, but half the size
- auto CheckType = [&](ShuffleVectorInst *Shuffle) {
- Value *Op = Shuffle->getOperand(0);
- auto *ShuffleTy = cast<FixedVectorType>(Shuffle->getType());
- auto *OpTy = cast<FixedVectorType>(Op->getType());
-
- if (OpTy->getScalarType() != ShuffleTy->getScalarType())
- return false;
- if ((ShuffleTy->getNumElements() * 2) != OpTy->getNumElements())
- return false;
-
- return true;
- };
-
- auto CheckDeinterleavingShuffle = [&](ShuffleVectorInst *Shuffle) -> bool {
- if (!CheckType(Shuffle))
- return false;
-
- ArrayRef<int> Mask = Shuffle->getShuffleMask();
- int Last = *Mask.rbegin();
-
- Value *Op = Shuffle->getOperand(0);
- auto *OpTy = cast<FixedVectorType>(Op->getType());
- int NumElements = OpTy->getNumElements();
-
- // Ensure that the deinterleaving shuffle only pulls from the first
- // shuffle operand.
- return Last < NumElements;
- };
-
- if (RealShuffle->getType() != ImagShuffle->getType()) {
- LLVM_DEBUG(dbgs() << " - Shuffle types aren't equal.\n");
- return nullptr;
- }
- if (!CheckDeinterleavingShuffle(RealShuffle)) {
- LLVM_DEBUG(dbgs() << " - RealShuffle is invalid type.\n");
- return nullptr;
- }
- if (!CheckDeinterleavingShuffle(ImagShuffle)) {
- LLVM_DEBUG(dbgs() << " - ImagShuffle is invalid type.\n");
- return nullptr;
- }
-
- NodePtr PlaceholderNode =
- prepareCompositeNode(llvm::ComplexDeinterleavingOperation::Shuffle,
- RealShuffle, ImagShuffle);
- PlaceholderNode->ReplacementNode = RealShuffle->getOperand(0);
- FinalInstructions.insert(RealShuffle);
- FinalInstructions.insert(ImagShuffle);
- return submitCompositeNode(PlaceholderNode);
- }
- if (RealShuffle || ImagShuffle) {
- LLVM_DEBUG(dbgs() << " - There's a shuffle where there shouldn't be.\n");
- return nullptr;
- }
+ NodePtr Node = identifyDeinterleave(Real, Imag);
+ if (Node)
+ return Node;
- auto *VTy = cast<FixedVectorType>(Real->getType());
- auto *NewVTy =
- FixedVectorType::get(VTy->getScalarType(), VTy->getNumElements() * 2);
+ auto *VTy = cast<VectorType>(Real->getType());
+ auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
if (TL->isComplexDeinterleavingOperationSupported(
ComplexDeinterleavingOperation::CMulPartial, NewVTy) &&
@@ -862,13 +779,10 @@ ComplexDeinterleavingGraph::identifyNode(Instruction *Real, Instruction *Imag) {
}
bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
- Instruction *Real;
- Instruction *Imag;
- if (!match(RootI, m_Shuffle(m_Instruction(Real), m_Instruction(Imag))))
+ auto RootNode = identifyRoot(RootI);
+ if (!RootNode)
return false;
- auto RootNode = identifyNode(Real, Imag);
-
LLVM_DEBUG({
Function *F = RootI->getFunction();
BasicBlock *B = RootI->getParent();
@@ -877,14 +791,9 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
dump(dbgs());
dbgs() << "\n";
});
-
- if (RootNode) {
- RootToNode[RootI] = RootNode;
- OrderedRoots.push_back(RootI);
- return true;
- }
-
- return false;
+ RootToNode[RootI] = RootNode;
+ OrderedRoots.push_back(RootI);
+ return true;
}
bool ComplexDeinterleavingGraph::checkNodes() {
@@ -960,6 +869,147 @@ bool ComplexDeinterleavingGraph::checkNodes() {
return !RootToNode.empty();
}
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyRoot(Instruction *RootI) {
+ if (auto *Intrinsic = dyn_cast<IntrinsicInst>(RootI)) {
+ if (Intrinsic->getIntrinsicID() !=
+ Intrinsic::experimental_vector_interleave2)
+ return nullptr;
+
+ auto *Real = dyn_cast<Instruction>(Intrinsic->getOperand(0));
+ auto *Imag = dyn_cast<Instruction>(Intrinsic->getOperand(1));
+ if (!Real || !Imag)
+ return nullptr;
+
+ return identifyNode(Real, Imag);
+ }
+
+ auto *SVI = dyn_cast<ShuffleVectorInst>(RootI);
+ if (!SVI)
+ return nullptr;
+
+ // Look for a shufflevector that takes separate vectors of the real and
+ // imaginary components and recombines them into a single vector.
+ if (!isInterleavingMask(SVI->getShuffleMask()))
+ return nullptr;
+
+ Instruction *Real;
+ Instruction *Imag;
+ if (!match(RootI, m_Shuffle(m_Instruction(Real), m_Instruction(Imag))))
+ return nullptr;
+
+ return identifyNode(Real, Imag);
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real,
+ Instruction *Imag) {
+ Instruction *I = nullptr;
+ Value *FinalValue = nullptr;
+ if (match(Real, m_ExtractValue<0>(m_Instruction(I))) &&
+ match(Imag, m_ExtractValue<1>(m_Specific(I))) &&
+ match(I, m_Intrinsic<Intrinsic::experimental_vector_deinterleave2>(
+ m_Value(FinalValue)))) {
+ NodePtr PlaceholderNode = prepareCompositeNode(
+ llvm::ComplexDeinterleavingOperation::Deinterleave, Real, Imag);
+ PlaceholderNode->ReplacementNode = FinalValue;
+ FinalInstructions.insert(Real);
+ FinalInstructions.insert(Imag);
+ return submitCompositeNode(PlaceholderNode);
+ }
+
+ auto *RealShuffle = dyn_cast<ShuffleVectorInst>(Real);
+ auto *ImagShuffle = dyn_cast<ShuffleVectorInst>(Imag);
+ if (!RealShuffle || !ImagShuffle) {
+ if (RealShuffle || ImagShuffle)
+ LLVM_DEBUG(dbgs() << " - There's a shuffle where there shouldn't be.\n");
+ return nullptr;
+ }
+
+ Value *RealOp1 = RealShuffle->getOperand(1);
+ if (!isa<UndefValue>(RealOp1) && !isa<ConstantAggregateZero>(RealOp1)) {
+ LLVM_DEBUG(dbgs() << " - RealOp1 is not undef or zero.\n");
+ return nullptr;
+ }
+ Value *ImagOp1 = ImagShuffle->getOperand(1);
+ if (!isa<UndefValue>(ImagOp1) && !isa<ConstantAggregateZero>(ImagOp1)) {
+ LLVM_DEBUG(dbgs() << " - ImagOp1 is not undef or zero.\n");
+ return nullptr;
+ }
+
+ Value *RealOp0 = RealShuffle->getOperand(0);
+ Value *ImagOp0 = ImagShuffle->getOperand(0);
+
+ if (RealOp0 != ImagOp0) {
+ LLVM_DEBUG(dbgs() << " - Shuffle operands are not equal.\n");
+ return nullptr;
+ }
+
+ ArrayRef<int> RealMask = RealShuffle->getShuffleMask();
+ ArrayRef<int> ImagMask = ImagShuffle->getShuffleMask();
+ if (!isDeinterleavingMask(RealMask) || !isDeinterleavingMask(ImagMask)) {
+ LLVM_DEBUG(dbgs() << " - Masks are not deinterleaving.\n");
+ return nullptr;
+ }
+
+ if (RealMask[0] != 0 || ImagMask[0] != 1) {
+ LLVM_DEBUG(dbgs() << " - Masks do not have the correct initial value.\n");
+ return nullptr;
+ }
+
+ // Type checking, the shuffle type should be a vector type of the same
+ // scalar type, but half the size
+ auto CheckType = [&](ShuffleVectorInst *Shuffle) {
+ Value *Op = Shuffle->getOperand(0);
+ auto *ShuffleTy = cast<FixedVectorType>(Shuffle->getType());
+ auto *OpTy = cast<FixedVectorType>(Op->getType());
+
+ if (OpTy->getScalarType() != ShuffleTy->getScalarType())
+ return false;
+ if ((ShuffleTy->getNumElements() * 2) != OpTy->getNumElements())
+ return false;
+
+ return true;
+ };
+
+ auto CheckDeinterleavingShuffle = [&](ShuffleVectorInst *Shuffle) -> bool {
+ if (!CheckType(Shuffle))
+ return false;
+
+ ArrayRef<int> Mask = Shuffle->getShuffleMask();
+ int Last = *Mask.rbegin();
+
+ Value *Op = Shuffle->getOperand(0);
+ auto *OpTy = cast<FixedVectorType>(Op->getType());
+ int NumElements = OpTy->getNumElements();
+
+ // Ensure that the deinterleaving shuffle only pulls from the first
+ // shuffle operand.
+ return Last < NumElements;
+ };
+
+ if (RealShuffle->getType() != ImagShuffle->getType()) {
+ LLVM_DEBUG(dbgs() << " - Shuffle types aren't equal.\n");
+ return nullptr;
+ }
+ if (!CheckDeinterleavingShuffle(RealShuffle)) {
+ LLVM_DEBUG(dbgs() << " - RealShuffle is invalid type.\n");
+ return nullptr;
+ }
+ if (!CheckDeinterleavingShuffle(ImagShuffle)) {
+ LLVM_DEBUG(dbgs() << " - ImagShuffle is invalid type.\n");
+ return nullptr;
+ }
+
+ NodePtr PlaceholderNode =
+ prepareCompositeNode(llvm::ComplexDeinterleavingOperation::Deinterleave,
+ RealShuffle, ImagShuffle);
+ PlaceholderNode->ReplacementNode = RealShuffle->getOperand(0);
+ FinalInstructions.insert(RealShuffle);
+ FinalInstructions.insert(ImagShuffle);
+ return submitCompositeNode(PlaceholderNode);
+}
+
static Value *replaceSymmetricNode(ComplexDeinterleavingGraph::RawNodePtr Node,
Value *InputA, Value *InputB) {
Instruction *I = Node->Real;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 3939f4de416bb..39799ee5d278d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -24625,20 +24625,30 @@ bool AArch64TargetLowering::isConstantUnsignedBitfieldExtractLegal(
}
bool AArch64TargetLowering::isComplexDeinterleavingSupported() const {
- return Subtarget->hasComplxNum();
+ return Subtarget->hasSVE() || Subtarget->hasComplxNum();
}
bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
ComplexDeinterleavingOperation Operation, Type *Ty) const {
- auto *VTy = dyn_cast<FixedVectorType>(Ty);
+ auto *VTy = dyn_cast<VectorType>(Ty);
if (!VTy)
return false;
+ // If the vector is scalable, SVE is enabled, implying support for complex
+ // numbers. Otherwise, we need to ensure complex number support is available.
+ if (!VTy->isScalableTy() && !Subtarget->hasComplxNum())
+ return false;
+
auto *ScalarTy = VTy->getScalarType();
- unsigned NumElements = VTy->getNumElements();
+ unsigned NumElements = VTy->getElementCount().getKnownMinValue();
+ // We can only process vectors that have a bit size of 128 or higher (with an
+ // additional 64 bits for Neon). Additionally, these vectors must have a
+ // power-of-2 size, as we later split them into the smallest supported size
+ // and merge them back together after applying the complex operation.
unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
- if ((VTyWidth < 128 && VTyWidth != 64) || !llvm::isPowerOf2_32(VTyWidth))
+ if ((VTyWidth < 128 && (VTy->isScalableTy() || VTyWidth != 64)) ||
+ !llvm::isPowerOf2_32(VTyWidth))
return false;
return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
@@ -24649,57 +24659,75 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
Instruction *I, ComplexDeinterleavingOperation OperationType,
ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
Value *Accumulator) const {
- FixedVectorType *Ty = cast<FixedVectorType>(InputA->getType());
+ VectorType *Ty = cast<VectorType>(InputA->getType());
+ bool IsScalable = Ty->isScalableTy();
IRBuilder<> B(I);
- unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
+ unsigned TyWidth =
+ Ty->getScalarSizeInBits() * Ty->getElementCount().getKnownMinValue();
assert(((TyWidth >= 128 && llvm::isPowerOf2_32(TyWidth)) || TyWidth == 64) &&
"Vector type must be either 64 or a power of 2 that is at least 128");
if (TyWidth > 128) {
- int Stride = Ty->getNumElements() / 2;
- auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
- auto SplitSeqVec = llvm::to_vector(SplitSeq);
- ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
- ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
-
- auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
- auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
- auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
- auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
+ int Stride = Ty->getElementCount().getKnownMinValue() / 2;
+ auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
+ auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
+ auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
+ auto *UpperSplitA =
+ B.CreateExtractVector(HalfTy, InputA, B.getInt64(Stride));
+ auto *UpperSplitB =
+ B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
Value *LowerSplitAcc = nullptr;
Value *UpperSplitAcc = nullptr;
-
if (Accumulator) {
- LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
- UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
+ LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
+ UpperSplitAcc =
+ B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
}
-
auto *LowerSplitInt = createComplexDeinterleavingIR(
I, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
auto *UpperSplitInt = createComplexDeinterleavingIR(
I, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
- ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
- return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
+ auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
+ B.getInt64(0));
+ return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
}
if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
+ if (Accumulator == nullptr)
+ Accumulator = ConstantFP::get(Ty, 0);
+
+ if (IsScalable) {
+ auto *Mask = B.CreateVectorSplat(Ty->getElementCount(), B.getInt1(true));
+ return B.CreateIntrinsic(
+ Intrinsic::aarch64_sve_fcmla, Ty,
+ {Mask, Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
+ }
+
Intrinsic::ID IdMap[4] = {Intrinsic::aarch64_neon_vcmla_rot0,
Intrinsic::aarch64_neon_vcmla_rot90,
Intrinsic::aarch64_neon_vcmla_rot180,
Intrinsic::aarch64_neon_vcmla_rot270};
- if (Accumulator == nullptr)
- Accumulator = ConstantFP::get(Ty, 0);
return B.CreateIntrinsic(IdMap[(int)Rotation], Ty,
{Accumulator, InputB, InputA});
}
if (OperationType == ComplexDeinterleavingOperation::CAdd) {
+ if (IsScalable) {
+ auto *Mask = B.CreateVectorSplat(Ty->getElementCount(), B.getInt1(true));
+ if (Rotation == ComplexDeinterleavingRotation::Rotation_90 ||
+ Rotation == ComplexDeinterleavingRotation::Rotation_270)
+ return B.CreateIntrinsic(
+ Intrinsic::aarch64_sve_fcadd, Ty,
+ {Mask, InputA, InputB, B.getInt32((int)Rotation * 90)});
+ return nullptr;
+ }
+
Intrinsic::ID IntId = Intrinsic::not_intrinsic;
if (Rotation == ComplexDeinterleavingRotation::Rotation_90)
IntId = Intrinsic::aarch64_neon_vcadd_rot90;
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
new file mode 100644
index 0000000000000..b2a4c5c3ef3ad
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add-scalable.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to not transform
+define <vscale x 4 x half> @complex_add_v4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
+; CHECK-LABEL: complex_add_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: uunpkhi z3.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp1 z4.d, z0.d, z2.d
+; CHECK-NEXT: uzp2 z0.d, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z1.d, z3.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z3.d
+; CHECK-NEXT: fsubr z0.h, p0/m, z0.h, z1.h
+; CHECK-NEXT: movprfx z1, z2
+; CHECK-NEXT: fadd z1.h, p0/m, z1.h, z4.h
+; CHECK-NEXT: zip2 z2.d, z0.d, z1.d
+; CHECK-NEXT: zip1 z0.d, z0.d, z1.d
+; CHECK-NEXT: uzp1 z0.s, z0.s, z2.s
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)
+ %a.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %b)
+ %b.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 1
+ %0 = fsub fast <vscale x 2 x half> %b.real, %a.imag
+ %1 = fadd fast <vscale x 2 x half> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %0, <vscale x 2 x half> %1)
+ ret <vscale x 4 x half> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 8 x half> @complex_add_v8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: complex_add_v8f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fcadd z1.h, p0/m, z1.h, z0.h, #90
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %a)
+ %a.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %b)
+ %b.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 1
+ %0 = fsub fast <vscale x 4 x half> %b.real, %a.imag
+ %1 = fadd fast <vscale x 4 x half> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %0, <vscale x 4 x half> %1)
+ ret <vscale x 8 x half> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 16 x half> @complex_add_v16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b) {
+; CHECK-LABEL: complex_add_v16f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fcadd z2.h, p0/m, z2.h, z0.h, #90
+; CHECK-NEXT: fcadd z3.h, p0/m, z3.h, z1.h, #90
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
+ %a.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %b)
+ %b.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 1
+ %0 = fsub fast <vscale x 8 x half> %b.real, %a.imag
+ %1 = fadd fast <vscale x 8 x half> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %0, <vscale x 8 x half> %1)
+ ret <vscale x 16 x half> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 32 x half> @complex_add_v32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b) {
+; CHECK-LABEL: complex_add_v32f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fcadd z6.h, p0/m, z6.h, z2.h, #90
+; CHECK-NEXT: fcadd z4.h, p0/m, z4.h, z0.h, #90
+; CHECK-NEXT: fcadd z5.h, p0/m, z5.h, z1.h, #90
+; CHECK-NEXT: fcadd z7.h, p0/m, z7.h, z3.h, #90
+; CHECK-NEXT: mov z0.d, z4.d
+; CHECK-NEXT: mov z1.d, z5.d
+; CHECK-NEXT: mov z2.d, z6.d
+; CHECK-NEXT: mov z3.d, z7.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %a)
+ %a.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %b)
+ %b.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 1
+ %0 = fsub fast <vscale x 16 x half> %b.real, %a.imag
+ %1 = fadd fast <vscale x 16 x half> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half> %0, <vscale x 16 x half> %1)
+ ret <vscale x 32 x half> %interleaved.vec
+}
+
+declare { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+
+declare { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+
+declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+
+declare { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half>)
+declare <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half>, <vscale x 16 x half>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
index 64a436b4031d8..407a2bb347a16 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-add.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16 -o - | FileCheck %s
+; RUN: llc < %s --mattr=+complxnum,+neon,+fullfp16,+sve -o - | FileCheck %s
target triple = "aarch64-arm-none-eabi"
@@ -98,3 +99,70 @@ entry:
%interleaved.vec = shufflevector <16 x half> %0, <16 x half> %1, <32 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
ret <32 x half> %interleaved.vec
}
+
+; Expected to transform
+define <4 x half> @complex_add_v4f16_with_intrinsic(<4 x half> %a, <4 x half> %b) {
+; CHECK-LABEL: complex_add_v4f16_with_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcadd v0.4h, v1.4h, v0.4h, #90
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %a)
+ %a.real = extractvalue { <2 x half>, <2 x half> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <2 x half>, <2 x half> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half> %b)
+ %b.real = extractvalue { <2 x half>, <2 x half> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <2 x half>, <2 x half> } %b.deinterleaved, 1
+ %0 = fsub fast <2 x half> %b.real, %a.imag
+ %1 = fadd fast <2 x half> %b.imag, %a.real
+ %interleaved.vec = tail call <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half> %0, <2 x half> %1)
+ ret <4 x half> %interleaved.vec
+}
+
+; Expected to transform
+define <8 x half> @complex_add_v8f16_with_intrinsic(<8 x half> %a, <8 x half> %b) {
+; CHECK-LABEL: complex_add_v8f16_with_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcadd v0.8h, v1.8h, v0.8h, #90
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %a)
+ %a.real = extractvalue { <4 x half>, <4 x half> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <4 x half>, <4 x half> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <4 x half>, <4 x half> } @llvm.experimental.vector.deinterleave2.v8f16(<8 x half> %b)
+ %b.real = extractvalue { <4 x half>, <4 x half> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <4 x half>, <4 x half> } %b.deinterleaved, 1
+ %0 = fsub fast <4 x half> %b.real, %a.imag
+ %1 = fadd fast <4 x half> %b.imag, %a.real
+ %interleaved.vec = tail call <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half> %0, <4 x half> %1)
+ ret <8 x half> %interleaved.vec
+}
+
+; Expected to transform
+define <16 x half> @complex_add_v16f16_with_intrinsic(<16 x half> %a, <16 x half> %b) {
+; CHECK-LABEL: complex_add_v16f16_with_intrinsic:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: fcadd v0.8h, v2.8h, v0.8h, #90
+; CHECK-NEXT: fcadd v1.8h, v3.8h, v1.8h, #90
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %a)
+ %a.real = extractvalue { <8 x half>, <8 x half> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <8 x half>, <8 x half> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half> %b)
+ %b.real = extractvalue { <8 x half>, <8 x half> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <8 x half>, <8 x half> } %b.deinterleaved, 1
+ %0 = fsub fast <8 x half> %b.real, %a.imag
+ %1 = fadd fast <8 x half> %b.imag, %a.real
+ %interleaved.vec = tail call <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half> %0, <8 x half> %1)
+ ret <16 x half> %interleaved.vec
+}
+
+declare { <2 x half>, <2 x half> } @llvm.experimental.vector.deinterleave2.v4f16(<4 x half>)
+declare <4 x half> @llvm.experimental.vector.interleave2.v4f16(<2 x half>, <2 x half>)
+
+declare { <4 x half>, <4 x half> } @llvm.experimental.vector.deinterleave2.v8f16(<8 x half>)
+declare <8 x half> @llvm.experimental.vector.interleave2.v8f16(<4 x half>, <4 x half>)
+
+declare { <8 x half>, <8 x half> } @llvm.experimental.vector.deinterleave2.v16f16(<16 x half>)
+declare <16 x half> @llvm.experimental.vector.interleave2.v16f16(<8 x half>, <8 x half>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
new file mode 100644
index 0000000000000..70f2c5a582b6c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f16-mul-scalable.ll
@@ -0,0 +1,153 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to transform
+define <vscale x 4 x half> @complex_mul_v4f16(<vscale x 4 x half> %a, <vscale x 4 x half> %b) {
+; CHECK-LABEL: complex_mul_v4f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: uunpkhi z2.d, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: uunpkhi z3.d, z1.s
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uzp2 z4.d, z0.d, z2.d
+; CHECK-NEXT: uzp1 z0.d, z0.d, z2.d
+; CHECK-NEXT: uzp2 z2.d, z1.d, z3.d
+; CHECK-NEXT: uzp1 z1.d, z1.d, z3.d
+; CHECK-NEXT: movprfx z3, z2
+; CHECK-NEXT: fmul z3.h, p0/m, z3.h, z0.h
+; CHECK-NEXT: fmla z3.h, p0/m, z1.h, z4.h
+; CHECK-NEXT: fmul z2.h, p0/m, z2.h, z4.h
+; CHECK-NEXT: fnmsb z0.h, p0/m, z1.h, z2.h
+; CHECK-NEXT: zip2 z1.d, z0.d, z3.d
+; CHECK-NEXT: zip1 z0.d, z0.d, z3.d
+; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %a)
+ %a.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half> %b)
+ %b.real = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 2 x half>, <vscale x 2 x half> } %b.deinterleaved, 1
+ %0 = fmul fast <vscale x 2 x half> %b.imag, %a.real
+ %1 = fmul fast <vscale x 2 x half> %b.real, %a.imag
+ %2 = fadd fast <vscale x 2 x half> %1, %0
+ %3 = fmul fast <vscale x 2 x half> %b.real, %a.real
+ %4 = fmul fast <vscale x 2 x half> %a.imag, %b.imag
+ %5 = fsub fast <vscale x 2 x half> %3, %4
+ %interleaved.vec = tail call <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half> %5, <vscale x 2 x half> %2)
+ ret <vscale x 4 x half> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 8 x half> @complex_mul_v8f16(<vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: complex_mul_v8f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z2.h, #0 // =0x0
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fcmla z2.h, p0/m, z1.h, z0.h, #0
+; CHECK-NEXT: fcmla z2.h, p0/m, z1.h, z0.h, #90
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %a)
+ %a.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half> %b)
+ %b.real = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 4 x half>, <vscale x 4 x half> } %b.deinterleaved, 1
+ %0 = fmul fast <vscale x 4 x half> %b.imag, %a.real
+ %1 = fmul fast <vscale x 4 x half> %b.real, %a.imag
+ %2 = fadd fast <vscale x 4 x half> %1, %0
+ %3 = fmul fast <vscale x 4 x half> %b.real, %a.real
+ %4 = fmul fast <vscale x 4 x half> %a.imag, %b.imag
+ %5 = fsub fast <vscale x 4 x half> %3, %4
+ %interleaved.vec = tail call <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half> %5, <vscale x 4 x half> %2)
+ ret <vscale x 8 x half> %interleaved.vec
+}
+; Expected to transform
+define <vscale x 16 x half> @complex_mul_v16f16(<vscale x 16 x half> %a, <vscale x 16 x half> %b) {
+; CHECK-LABEL: complex_mul_v16f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z4.h, #0 // =0x0
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov z5.d, z4.d
+; CHECK-NEXT: fcmla z4.h, p0/m, z3.h, z1.h, #0
+; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #0
+; CHECK-NEXT: fcmla z4.h, p0/m, z3.h, z1.h, #90
+; CHECK-NEXT: fcmla z5.h, p0/m, z2.h, z0.h, #90
+; CHECK-NEXT: mov z1.d, z4.d
+; CHECK-NEXT: mov z0.d, z5.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %a)
+ %a.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half> %b)
+ %b.real = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 8 x half>, <vscale x 8 x half> } %b.deinterleaved, 1
+ %0 = fmul fast <vscale x 8 x half> %b.imag, %a.real
+ %1 = fmul fast <vscale x 8 x half> %b.real, %a.imag
+ %2 = fadd fast <vscale x 8 x half> %1, %0
+ %3 = fmul fast <vscale x 8 x half> %b.real, %a.real
+ %4 = fmul fast <vscale x 8 x half> %a.imag, %b.imag
+ %5 = fsub fast <vscale x 8 x half> %3, %4
+ %interleaved.vec = tail call <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half> %5, <vscale x 8 x half> %2)
+ ret <vscale x 16 x half> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 32 x half> @complex_mul_v32f16(<vscale x 32 x half> %a, <vscale x 32 x half> %b) {
+; CHECK-LABEL: complex_mul_v32f16:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z24.h, #0 // =0x0
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: mov z25.d, z24.d
+; CHECK-NEXT: mov z26.d, z24.d
+; CHECK-NEXT: mov z27.d, z24.d
+; CHECK-NEXT: fcmla z25.h, p0/m, z4.h, z0.h, #0
+; CHECK-NEXT: fcmla z26.h, p0/m, z5.h, z1.h, #0
+; CHECK-NEXT: fcmla z27.h, p0/m, z6.h, z2.h, #0
+; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #0
+; CHECK-NEXT: fcmla z25.h, p0/m, z4.h, z0.h, #90
+; CHECK-NEXT: fcmla z26.h, p0/m, z5.h, z1.h, #90
+; CHECK-NEXT: fcmla z27.h, p0/m, z6.h, z2.h, #90
+; CHECK-NEXT: fcmla z24.h, p0/m, z7.h, z3.h, #90
+; CHECK-NEXT: mov z0.d, z25.d
+; CHECK-NEXT: mov z1.d, z26.d
+; CHECK-NEXT: mov z2.d, z27.d
+; CHECK-NEXT: mov z3.d, z24.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %a)
+ %a.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half> %b)
+ %b.real = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 16 x half>, <vscale x 16 x half> } %b.deinterleaved, 1
+ %0 = fmul fast <vscale x 16 x half> %b.imag, %a.real
+ %1 = fmul fast <vscale x 16 x half> %b.real, %a.imag
+ %2 = fadd fast <vscale x 16 x half> %1, %0
+ %3 = fmul fast <vscale x 16 x half> %b.real, %a.real
+ %4 = fmul fast <vscale x 16 x half> %a.imag, %b.imag
+ %5 = fsub fast <vscale x 16 x half> %3, %4
+ %interleaved.vec = tail call <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half> %5, <vscale x 16 x half> %2)
+ ret <vscale x 32 x half> %interleaved.vec
+}
+
+declare { <vscale x 2 x half>, <vscale x 2 x half> } @llvm.experimental.vector.deinterleave2.nxv4f16(<vscale x 4 x half>)
+declare <vscale x 4 x half> @llvm.experimental.vector.interleave2.nxv4f16(<vscale x 2 x half>, <vscale x 2 x half>)
+
+declare { <vscale x 4 x half>, <vscale x 4 x half> } @llvm.experimental.vector.deinterleave2.nxv8f16(<vscale x 8 x half>)
+declare <vscale x 8 x half> @llvm.experimental.vector.interleave2.nxv8f16(<vscale x 4 x half>, <vscale x 4 x half>)
+
+declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.experimental.vector.deinterleave2.nxv16f16(<vscale x 16 x half>)
+declare <vscale x 16 x half> @llvm.experimental.vector.interleave2.nxv16f16(<vscale x 8 x half>, <vscale x 8 x half>)
+
+declare { <vscale x 16 x half>, <vscale x 16 x half> } @llvm.experimental.vector.deinterleave2.nxv32f16(<vscale x 32 x half>)
+declare <vscale x 32 x half> @llvm.experimental.vector.interleave2.nxv32f16(<vscale x 16 x half>, <vscale x 16 x half>)
+
+
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
new file mode 100644
index 0000000000000..205df040362f5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-add-scalable.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to transform
+define <vscale x 4 x float> @complex_add_v4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: complex_add_v4f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcadd z1.s, p0/m, z1.s, z0.s, #90
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %a)
+ %a.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %b)
+ %b.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 1
+ %0 = fsub fast <vscale x 2 x float> %b.real, %a.imag
+ %1 = fadd fast <vscale x 2 x float> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %0, <vscale x 2 x float> %1)
+ ret <vscale x 4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 8 x float> @complex_add_v8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b) {
+; CHECK-LABEL: complex_add_v8f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcadd z2.s, p0/m, z2.s, z0.s, #90
+; CHECK-NEXT: fcadd z3.s, p0/m, z3.s, z1.s, #90
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
+ %a.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %b)
+ %b.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 1
+ %0 = fsub fast <vscale x 4 x float> %b.real, %a.imag
+ %1 = fadd fast <vscale x 4 x float> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %0, <vscale x 4 x float> %1)
+ ret <vscale x 8 x float> %interleaved.vec
+}
+; Expected to transform
+define <vscale x 16 x float> @complex_add_v16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b) {
+; CHECK-LABEL: complex_add_v16f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcadd z6.s, p0/m, z6.s, z2.s, #90
+; CHECK-NEXT: fcadd z4.s, p0/m, z4.s, z0.s, #90
+; CHECK-NEXT: fcadd z5.s, p0/m, z5.s, z1.s, #90
+; CHECK-NEXT: fcadd z7.s, p0/m, z7.s, z3.s, #90
+; CHECK-NEXT: mov z0.d, z4.d
+; CHECK-NEXT: mov z1.d, z5.d
+; CHECK-NEXT: mov z2.d, z6.d
+; CHECK-NEXT: mov z3.d, z7.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %a)
+ %a.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %b)
+ %b.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 1
+ %0 = fsub fast <vscale x 8 x float> %b.real, %a.imag
+ %1 = fadd fast <vscale x 8 x float> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float> %0, <vscale x 8 x float> %1)
+ ret <vscale x 16 x float> %interleaved.vec
+}
+
+declare { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+
+declare { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float>)
+declare <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float>, <vscale x 8 x float>)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
new file mode 100644
index 0000000000000..b3fdfe28f47e9
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f32-mul-scalable.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to transform
+define <vscale x 4 x float> @complex_mul_v4f32(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
+; CHECK-LABEL: complex_mul_v4f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z2.s, #0 // =0x0
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcmla z2.s, p0/m, z1.s, z0.s, #0
+; CHECK-NEXT: fcmla z2.s, p0/m, z1.s, z0.s, #90
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %a)
+ %a.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float> %b)
+ %b.real = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 2 x float>, <vscale x 2 x float> } %b.deinterleaved, 1
+ %0 = fmul fast <vscale x 2 x float> %b.imag, %a.real
+ %1 = fmul fast <vscale x 2 x float> %b.real, %a.imag
+ %2 = fadd fast <vscale x 2 x float> %1, %0
+ %3 = fmul fast <vscale x 2 x float> %b.real, %a.real
+ %4 = fmul fast <vscale x 2 x float> %a.imag, %b.imag
+ %5 = fsub fast <vscale x 2 x float> %3, %4
+ %interleaved.vec = tail call <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float> %5, <vscale x 2 x float> %2)
+ ret <vscale x 4 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 8 x float> @complex_mul_v8f32(<vscale x 8 x float> %a, <vscale x 8 x float> %b) {
+; CHECK-LABEL: complex_mul_v8f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z4.s, #0 // =0x0
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov z5.d, z4.d
+; CHECK-NEXT: fcmla z4.s, p0/m, z3.s, z1.s, #0
+; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #0
+; CHECK-NEXT: fcmla z4.s, p0/m, z3.s, z1.s, #90
+; CHECK-NEXT: fcmla z5.s, p0/m, z2.s, z0.s, #90
+; CHECK-NEXT: mov z1.d, z4.d
+; CHECK-NEXT: mov z0.d, z5.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %a)
+ %a.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %b)
+ %b.real = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %b.deinterleaved, 1
+ %0 = fmul fast <vscale x 4 x float> %b.imag, %a.real
+ %1 = fmul fast <vscale x 4 x float> %b.real, %a.imag
+ %2 = fadd fast <vscale x 4 x float> %1, %0
+ %3 = fmul fast <vscale x 4 x float> %b.real, %a.real
+ %4 = fmul fast <vscale x 4 x float> %a.imag, %b.imag
+ %5 = fsub fast <vscale x 4 x float> %3, %4
+ %interleaved.vec = tail call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %5, <vscale x 4 x float> %2)
+ ret <vscale x 8 x float> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 16 x float> @complex_mul_v16f32(<vscale x 16 x float> %a, <vscale x 16 x float> %b) {
+; CHECK-LABEL: complex_mul_v16f32:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z24.s, #0 // =0x0
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: mov z25.d, z24.d
+; CHECK-NEXT: mov z26.d, z24.d
+; CHECK-NEXT: mov z27.d, z24.d
+; CHECK-NEXT: fcmla z25.s, p0/m, z4.s, z0.s, #0
+; CHECK-NEXT: fcmla z26.s, p0/m, z5.s, z1.s, #0
+; CHECK-NEXT: fcmla z27.s, p0/m, z6.s, z2.s, #0
+; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #0
+; CHECK-NEXT: fcmla z25.s, p0/m, z4.s, z0.s, #90
+; CHECK-NEXT: fcmla z26.s, p0/m, z5.s, z1.s, #90
+; CHECK-NEXT: fcmla z27.s, p0/m, z6.s, z2.s, #90
+; CHECK-NEXT: fcmla z24.s, p0/m, z7.s, z3.s, #90
+; CHECK-NEXT: mov z0.d, z25.d
+; CHECK-NEXT: mov z1.d, z26.d
+; CHECK-NEXT: mov z2.d, z27.d
+; CHECK-NEXT: mov z3.d, z24.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %a)
+ %a.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float> %b)
+ %b.real = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 8 x float>, <vscale x 8 x float> } %b.deinterleaved, 1
+ %0 = fmul fast <vscale x 8 x float> %b.imag, %a.real
+ %1 = fmul fast <vscale x 8 x float> %b.real, %a.imag
+ %2 = fadd fast <vscale x 8 x float> %1, %0
+ %3 = fmul fast <vscale x 8 x float> %b.real, %a.real
+ %4 = fmul fast <vscale x 8 x float> %a.imag, %b.imag
+ %5 = fsub fast <vscale x 8 x float> %3, %4
+ %interleaved.vec = tail call <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float> %5, <vscale x 8 x float> %2)
+ ret <vscale x 16 x float> %interleaved.vec
+}
+
+declare { <vscale x 2 x float>, <vscale x 2 x float> } @llvm.experimental.vector.deinterleave2.nxv4f32(<vscale x 4 x float>)
+declare <vscale x 4 x float> @llvm.experimental.vector.interleave2.nxv4f32(<vscale x 2 x float>, <vscale x 2 x float>)
+
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+
+declare { <vscale x 8 x float>, <vscale x 8 x float> } @llvm.experimental.vector.deinterleave2.nxv16f32(<vscale x 16 x float>)
+declare <vscale x 16 x float> @llvm.experimental.vector.interleave2.nxv16f32(<vscale x 8 x float>, <vscale x 8 x float>)
+
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
new file mode 100644
index 0000000000000..42d805f23c63b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-add-scalable.ll
@@ -0,0 +1,84 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to transform
+define <vscale x 2 x double> @complex_add_v2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: complex_add_v2f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcadd z1.d, p0/m, z1.d, z0.d, #90
+; CHECK-NEXT: mov z0.d, z1.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %a)
+ %a.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %b)
+ %b.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 1
+ %0 = fsub fast <vscale x 1 x double> %b.real, %a.imag
+ %1 = fadd fast <vscale x 1 x double> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double> %0, <vscale x 1 x double> %1)
+ ret <vscale x 2 x double> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 4 x double> @complex_add_v4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b) {
+; CHECK-LABEL: complex_add_v4f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcadd z2.d, p0/m, z2.d, z0.d, #90
+; CHECK-NEXT: fcadd z3.d, p0/m, z3.d, z1.d, #90
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: mov z1.d, z3.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %a.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %b.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 1
+ %0 = fsub fast <vscale x 2 x double> %b.real, %a.imag
+ %1 = fadd fast <vscale x 2 x double> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %0, <vscale x 2 x double> %1)
+ ret <vscale x 4 x double> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 8 x double> @complex_add_v8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b) {
+; CHECK-LABEL: complex_add_v8f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcadd z6.d, p0/m, z6.d, z2.d, #90
+; CHECK-NEXT: fcadd z4.d, p0/m, z4.d, z0.d, #90
+; CHECK-NEXT: fcadd z5.d, p0/m, z5.d, z1.d, #90
+; CHECK-NEXT: fcadd z7.d, p0/m, z7.d, z3.d, #90
+; CHECK-NEXT: mov z0.d, z4.d
+; CHECK-NEXT: mov z1.d, z5.d
+; CHECK-NEXT: mov z2.d, z6.d
+; CHECK-NEXT: mov z3.d, z7.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %a)
+ %a.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %b)
+ %b.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 1
+ %0 = fsub fast <vscale x 4 x double> %b.real, %a.imag
+ %1 = fadd fast <vscale x 4 x double> %b.imag, %a.real
+ %interleaved.vec = tail call <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double> %0, <vscale x 4 x double> %1)
+ ret <vscale x 8 x double> %interleaved.vec
+}
+
+declare { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double>, <vscale x 1 x double>)
+
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+
+declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
+declare <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
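For readers following the add tests above: each complex_add_* function deinterleaves two vectors of interleaved complex doubles, computes out.real = b.real - a.imag and out.imag = b.imag + a.real, and re-interleaves the result. That is the add-with-90-degree-rotation pattern that SVE FCADD #90 implements, which is why the CHECK lines expect one predicated fcadd per register pair. Below is a minimal scalar sketch of the same computation; it is a hypothetical illustration, not code from the patch, and the function name and use of std::complex are assumptions.

    #include <complex>
    #include <cstddef>
    #include <vector>

    // Scalar reference for the pattern the complex_add_* tests encode:
    // out = b + i*a, i.e. add with the second operand rotated by 90 degrees.
    std::vector<std::complex<double>>
    complex_add_rot90(const std::vector<std::complex<double>> &a,
                      const std::vector<std::complex<double>> &b) {
      std::vector<std::complex<double>> out(a.size());
      for (std::size_t i = 0; i < a.size(); ++i)
        out[i] = std::complex<double>(
            b[i].real() - a[i].imag(),   // fsub fast %b.real, %a.imag
            b[i].imag() + a[i].real());  // fadd fast %b.imag, %a.real
      return out;
    }
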
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
new file mode 100644
index 0000000000000..b3a0baf081245
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-f64-mul-scalable.ll
@@ -0,0 +1,110 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s --mattr=+sve -o - | FileCheck %s
+
+target triple = "aarch64-arm-none-eabi"
+
+; Expected to transform
+define <vscale x 2 x double> @complex_mul_v2f64(<vscale x 2 x double> %a, <vscale x 2 x double> %b) {
+; CHECK-LABEL: complex_mul_v2f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z2.d, #0 // =0x0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcmla z2.d, p0/m, z1.d, z0.d, #0
+; CHECK-NEXT: fcmla z2.d, p0/m, z1.d, z0.d, #90
+; CHECK-NEXT: mov z0.d, z2.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %a)
+ %a.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double> %b)
+ %b.real = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 1 x double>, <vscale x 1 x double> } %b.deinterleaved, 1
+ %0 = fmul fast <vscale x 1 x double> %b.imag, %a.real
+ %1 = fmul fast <vscale x 1 x double> %b.real, %a.imag
+ %2 = fadd fast <vscale x 1 x double> %1, %0
+ %3 = fmul fast <vscale x 1 x double> %b.real, %a.real
+ %4 = fmul fast <vscale x 1 x double> %a.imag, %b.imag
+ %5 = fsub fast <vscale x 1 x double> %3, %4
+ %interleaved.vec = tail call <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double> %5, <vscale x 1 x double> %2)
+ ret <vscale x 2 x double> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 4 x double> @complex_mul_v4f64(<vscale x 4 x double> %a, <vscale x 4 x double> %b) {
+; CHECK-LABEL: complex_mul_v4f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z4.d, #0 // =0x0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z5.d, z4.d
+; CHECK-NEXT: fcmla z4.d, p0/m, z3.d, z1.d, #0
+; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, #0
+; CHECK-NEXT: fcmla z4.d, p0/m, z3.d, z1.d, #90
+; CHECK-NEXT: fcmla z5.d, p0/m, z2.d, z0.d, #90
+; CHECK-NEXT: mov z1.d, z4.d
+; CHECK-NEXT: mov z0.d, z5.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
+ %a.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %b)
+ %b.real = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %b.deinterleaved, 1
+ %0 = fmul fast <vscale x 2 x double> %b.imag, %a.real
+ %1 = fmul fast <vscale x 2 x double> %b.real, %a.imag
+ %2 = fadd fast <vscale x 2 x double> %1, %0
+ %3 = fmul fast <vscale x 2 x double> %b.real, %a.real
+ %4 = fmul fast <vscale x 2 x double> %a.imag, %b.imag
+ %5 = fsub fast <vscale x 2 x double> %3, %4
+ %interleaved.vec = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %5, <vscale x 2 x double> %2)
+ ret <vscale x 4 x double> %interleaved.vec
+}
+
+; Expected to transform
+define <vscale x 8 x double> @complex_mul_v8f64(<vscale x 8 x double> %a, <vscale x 8 x double> %b) {
+; CHECK-LABEL: complex_mul_v8f64:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov z24.d, #0 // =0x0
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: mov z25.d, z24.d
+; CHECK-NEXT: mov z26.d, z24.d
+; CHECK-NEXT: mov z27.d, z24.d
+; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z0.d, #0
+; CHECK-NEXT: fcmla z26.d, p0/m, z5.d, z1.d, #0
+; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z2.d, #0
+; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #0
+; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z0.d, #90
+; CHECK-NEXT: fcmla z26.d, p0/m, z5.d, z1.d, #90
+; CHECK-NEXT: fcmla z27.d, p0/m, z6.d, z2.d, #90
+; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z3.d, #90
+; CHECK-NEXT: mov z0.d, z25.d
+; CHECK-NEXT: mov z1.d, z26.d
+; CHECK-NEXT: mov z2.d, z27.d
+; CHECK-NEXT: mov z3.d, z24.d
+; CHECK-NEXT: ret
+entry:
+ %a.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %a)
+ %a.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %a.deinterleaved, 1
+ %b.deinterleaved = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %b)
+ %b.real = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %b.deinterleaved, 1
+ %0 = fmul fast <vscale x 4 x double> %b.imag, %a.real
+ %1 = fmul fast <vscale x 4 x double> %b.real, %a.imag
+ %2 = fadd fast <vscale x 4 x double> %1, %0
+ %3 = fmul fast <vscale x 4 x double> %b.real, %a.real
+ %4 = fmul fast <vscale x 4 x double> %a.imag, %b.imag
+ %5 = fsub fast <vscale x 4 x double> %3, %4
+ %interleaved.vec = tail call <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double> %5, <vscale x 4 x double> %2)
+ ret <vscale x 8 x double> %interleaved.vec
+}
+
+declare { <vscale x 1 x double>, <vscale x 1 x double> } @llvm.experimental.vector.deinterleave2.nxv2f64(<vscale x 2 x double>)
+declare <vscale x 2 x double> @llvm.experimental.vector.interleave2.nxv2f64(<vscale x 1 x double>, <vscale x 1 x double>)
+
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+
+declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
+declare <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
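The complex_mul_* tests encode an ordinary complex product, out.real = a.real*b.real - a.imag*b.imag and out.imag = a.real*b.imag + a.imag*b.real, built from the deinterleaved halves. The pass matches this to a zeroed accumulator followed by two FCMLA instructions with rotations #0 and #90, each accumulating half of the partial products, as the CHECK lines show. Below is a minimal scalar sketch of that accumulation; it is hypothetical, not from the patch, and the helper name and the exact assignment of partial products to each rotation are assumptions.

    #include <complex>
    #include <cstddef>
    #include <vector>

    // Scalar reference for the pattern the complex_mul_* tests encode:
    // an ordinary complex multiply, split into two accumulation steps the
    // way two FCMLA instructions would contribute to a zeroed destination.
    std::vector<std::complex<double>>
    complex_mul(const std::vector<std::complex<double>> &a,
                const std::vector<std::complex<double>> &b) {
      std::vector<std::complex<double>> out(a.size());
      for (std::size_t i = 0; i < a.size(); ++i) {
        std::complex<double> acc(0.0, 0.0);             // mov z.d, #0
        acc += std::complex<double>(                    // first FCMLA step:
            a[i].real() * b[i].real(),                  //   real*real
            a[i].real() * b[i].imag());                 //   real*imag
        acc += std::complex<double>(                    // second FCMLA step:
            -a[i].imag() * b[i].imag(),                 //   -imag*imag
            a[i].imag() * b[i].real());                 //   imag*real
        out[i] = acc;
      }
      return out;
    }
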