[llvm] 1a1e761 - [CodeGen] Improve handling -Ofast generated code by ComplexDeinterleaving pass
Igor Kirillov via llvm-commits
llvm-commits at lists.llvm.org
Wed May 31 11:32:35 PDT 2023
Author: Igor Kirillov
Date: 2023-05-31T18:31:38Z
New Revision: 1a1e76100e3f99c2bf0babcab52da333c12631e2
URL: https://github.com/llvm/llvm-project/commit/1a1e76100e3f99c2bf0babcab52da333c12631e2
DIFF: https://github.com/llvm/llvm-project/commit/1a1e76100e3f99c2bf0babcab52da333c12631e2.diff
LOG: [CodeGen] Improve handling -Ofast generated code by ComplexDeinterleaving pass
Code generated with -Ofast can differ significantly from code generated
with -O3 -ffp-contract=fast (adding -ffinite-math-only to enable
vectorization). Code compiled with -O3 can be deinterleaved by pattern
matching, as the instruction order is preserved. With -Ofast, however,
the computation sequence can be reshaped in multiple ways, and the real
and imaginary parts may not even be calculated in parallel.
For more details, refer to
llvm/test/CodeGen/AArch64/complex-deinterleaving-*-fast.ll and
llvm/test/CodeGen/AArch64/complex-deinterleaving-*-contract.ll tests.
This patch implements a more general approach and enables handling most
-Ofast cases.
Differential Revision: https://reviews.llvm.org/D148558
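To make the motivation concrete, here is a hypothetical source-level kernel
(an editorial sketch, not taken from the patch or its tests). Compiled with
-O3 -ffp-contract=fast, the real and imaginary updates keep mirrored
instruction sequences that the pass can pattern-match; with -Ofast the
additions carry the reassoc flag and may be regrouped, which is the
situation the new identifyReassocNodes path targets:

    // Hypothetical complex multiply-accumulate over interleaved float data.
    void cmla(float *out, const float *a, const float *b, int n) {
      for (int i = 0; i < 2 * n; i += 2) {
        float xr = a[i], xi = a[i + 1]; // real/imag parts of a
        float yr = b[i], yi = b[i + 1]; // real/imag parts of b
        out[i]     += xr * yr - xi * yi; // real part of a*b
        out[i + 1] += xr * yi + xi * yr; // imaginary part of a*b
      }
    }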
Added:
Modified:
llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll
llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll
llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 4351d68ebc87c..ec7abb298d9f9 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -143,6 +143,11 @@ struct ComplexDeinterleavingCompositeNode {
Instruction *Real;
Instruction *Imag;
+ // These two members are required exclusively for generating
+ // ComplexDeinterleavingOperation::Symmetric operations.
+ unsigned Opcode;
+ FastMathFlags Flags;
+
ComplexDeinterleavingRotation Rotation =
ComplexDeinterleavingRotation::Rotation_0;
SmallVector<RawNodePtr> Operands;
@@ -186,8 +191,26 @@ struct ComplexDeinterleavingCompositeNode {
class ComplexDeinterleavingGraph {
public:
+ struct Product {
+ Instruction *Multiplier;
+ Instruction *Multiplicand;
+ bool IsPositive;
+ };
+
+ using Addend = std::pair<Instruction *, bool>;
using NodePtr = ComplexDeinterleavingCompositeNode::NodePtr;
using RawNodePtr = ComplexDeinterleavingCompositeNode::RawNodePtr;
+
+ // Helper struct for holding info about potential partial multiplication
+ // candidates
+ struct PartialMulCandidate {
+ Instruction *Common;
+ NodePtr Node;
+ unsigned RealIdx;
+ unsigned ImagIdx;
+ bool IsNodeInverted;
+ };
+
explicit ComplexDeinterleavingGraph(const TargetLowering *TL,
const TargetLibraryInfo *TLI)
: TL(TL), TLI(TLI) {}
@@ -256,6 +279,40 @@ class ComplexDeinterleavingGraph {
NodePtr identifyNode(Instruction *I, Instruction *J);
+ /// Determine if a sum of complex numbers can be formed from \p RealAddends
+ /// and \p ImagAddends. If \p Accumulator is not null, add the result to it.
+ /// Return nullptr if it is not possible to construct a complex number.
+ /// \p Flags are needed to generate symmetric Add and Sub operations.
+ NodePtr identifyAdditions(std::list<Addend> &RealAddends,
+ std::list<Addend> &ImagAddends, FastMathFlags Flags,
+ NodePtr Accumulator);
+
+ /// Extract one addend that has both real and imaginary parts positive.
+ NodePtr extractPositiveAddend(std::list<Addend> &RealAddends,
+ std::list<Addend> &ImagAddends);
+
+ /// Determine if a sum of multiplications of complex numbers can be formed from
+ /// \p RealMuls and \p ImagMuls. If \p Accumulator is not null, add the result
+ /// to it. Return nullptr if it is not possible to construct a complex number.
+ NodePtr identifyMultiplications(std::vector<Product> &RealMuls,
+ std::vector<Product> &ImagMuls,
+ NodePtr Accumulator);
+
+ /// Go through pairs of multiplications (one real and one imaginary) and find
+ /// all possible candidates for partial multiplication and put them into
+ /// \p Candidates. Returns true if every product has a pair with a common
+ /// operand.
+ bool collectPartialMuls(const std::vector<Product> &RealMuls,
+ const std::vector<Product> &ImagMuls,
+ std::vector<PartialMulCandidate> &Candidates);
+
+ /// If the code is compiled with -Ofast or expressions have the `reassoc`
+ /// flag, the order of complex computation operations may be significantly
+ /// altered, and the real and imaginary parts may not be executed in
+ /// parallel. This
+ /// function takes this into consideration and employs a more general approach
+ /// to identify complex computations. Initially, it gathers all the addends
+ /// and multiplicands and then constructs a complex expression from them.
+ NodePtr identifyReassocNodes(Instruction *I, Instruction *J);
+
NodePtr identifyRoot(Instruction *I);
/// Identifies the Deinterleave operation applied to a vector containing
@@ -737,8 +794,16 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
return nullptr;
}
+ if (isa<FPMathOperator>(Real) &&
+ Real->getFastMathFlags() != Imag->getFastMathFlags())
+ return nullptr;
+
auto Node = prepareCompositeNode(ComplexDeinterleavingOperation::Symmetric,
Real, Imag);
+ Node->Opcode = Real->getOpcode();
+ if (isa<FPMathOperator>(Real))
+ Node->Flags = Real->getFastMathFlags();
+
Node->addOperand(Op0);
if (Real->isBinaryOp())
Node->addOperand(Op1);
@@ -754,29 +819,477 @@ ComplexDeinterleavingGraph::identifyNode(Instruction *Real, Instruction *Imag) {
return CN;
}
- NodePtr Node = identifyDeinterleave(Real, Imag);
- if (Node)
- return Node;
+ if (NodePtr CN = identifyDeinterleave(Real, Imag))
+ return CN;
auto *VTy = cast<VectorType>(Real->getType());
auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
- if (TL->isComplexDeinterleavingOperationSupported(
- ComplexDeinterleavingOperation::CMulPartial, NewVTy) &&
- isInstructionPairMul(Real, Imag)) {
- return identifyPartialMul(Real, Imag);
+ bool HasCMulSupport = TL->isComplexDeinterleavingOperationSupported(
+ ComplexDeinterleavingOperation::CMulPartial, NewVTy);
+ bool HasCAddSupport = TL->isComplexDeinterleavingOperationSupported(
+ ComplexDeinterleavingOperation::CAdd, NewVTy);
+
+ if (HasCMulSupport && isInstructionPairMul(Real, Imag)) {
+ if (NodePtr CN = identifyPartialMul(Real, Imag))
+ return CN;
+ }
+
+ if (HasCAddSupport && isInstructionPairAdd(Real, Imag)) {
+ if (NodePtr CN = identifyAdd(Real, Imag))
+ return CN;
+ }
+
+ if (HasCMulSupport && HasCAddSupport) {
+ if (NodePtr CN = identifyReassocNodes(Real, Imag))
+ return CN;
+ }
+
+ if (NodePtr CN = identifySymmetricOperation(Real, Imag))
+ return CN;
+
+ LLVM_DEBUG(dbgs() << " - Not recognised as a valid pattern.\n");
+ return nullptr;
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyReassocNodes(Instruction *Real,
+ Instruction *Imag) {
+ if ((Real->getOpcode() != Instruction::FAdd &&
+ Real->getOpcode() != Instruction::FSub &&
+ Real->getOpcode() != Instruction::FNeg) ||
+ (Imag->getOpcode() != Instruction::FAdd &&
+ Imag->getOpcode() != Instruction::FSub &&
+ Imag->getOpcode() != Instruction::FNeg))
+ return nullptr;
+
+ if (Real->getFastMathFlags() != Imag->getFastMathFlags()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "The flags in Real and Imaginary instructions are not identical\n");
+ return nullptr;
+ }
+
+ FastMathFlags Flags = Real->getFastMathFlags();
+ if (!Flags.allowReassoc()) {
+ LLVM_DEBUG(
+ dbgs() << "the 'Reassoc' attribute is missing in the FastMath flags\n");
+ return nullptr;
+ }
+
+ // Collect multiplications and addend instructions from the given instruction
+ // while traversing its operands. Additionally, verify that all instructions
+ // have the same fast math flags.
+ auto Collect = [&Flags](Instruction *Insn, std::vector<Product> &Muls,
+ std::list<Addend> &Addends) -> bool {
+ SmallVector<PointerIntPair<Value *, 1, bool>> Worklist = {{Insn, true}};
+ SmallPtrSet<Value *, 8> Visited;
+ while (!Worklist.empty()) {
+ auto [V, IsPositive] = Worklist.back();
+ Worklist.pop_back();
+ if (!Visited.insert(V).second)
+ continue;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ // If an instruction has more than one user, it indicates that it either
+ // has an external user, which will be later checked by the checkNodes
+ // function, or it is a subexpression utilized by multiple expressions. In
+ // the latter case, we will attempt to separately identify the complex
+ // operation from here in order to create a shared
+ // ComplexDeinterleavingCompositeNode.
+ if (I != Insn && I->getNumUses() > 1) {
+ LLVM_DEBUG(dbgs() << "Found potential sub-expression: " << *I << "\n");
+ Addends.emplace_back(I, IsPositive);
+ continue;
+ }
+
+ if (I->getOpcode() == Instruction::FAdd) {
+ Worklist.emplace_back(I->getOperand(1), IsPositive);
+ Worklist.emplace_back(I->getOperand(0), IsPositive);
+ } else if (I->getOpcode() == Instruction::FSub) {
+ Worklist.emplace_back(I->getOperand(1), !IsPositive);
+ Worklist.emplace_back(I->getOperand(0), IsPositive);
+ } else if (I->getOpcode() == Instruction::FMul) {
+ auto *A = dyn_cast<Instruction>(I->getOperand(0));
+ if (A && A->getOpcode() == Instruction::FNeg) {
+ A = dyn_cast<Instruction>(A->getOperand(0));
+ IsPositive = !IsPositive;
+ }
+ if (!A)
+ return false;
+ auto *B = dyn_cast<Instruction>(I->getOperand(1));
+ if (B && B->getOpcode() == Instruction::FNeg) {
+ B = dyn_cast<Instruction>(B->getOperand(0));
+ IsPositive = !IsPositive;
+ }
+ if (!B)
+ return false;
+ Muls.push_back(Product{A, B, IsPositive});
+ } else if (I->getOpcode() == Instruction::FNeg) {
+ Worklist.emplace_back(I->getOperand(0), !IsPositive);
+ } else {
+ Addends.emplace_back(I, IsPositive);
+ continue;
+ }
+
+ if (I->getFastMathFlags() != Flags) {
+ LLVM_DEBUG(dbgs() << "The instruction's fast math flags are "
+ "inconsistent with the root instructions' flags: "
+ << *I << "\n");
+ return false;
+ }
+ }
+ return true;
+ };
+
+ std::vector<Product> RealMuls, ImagMuls;
+ std::list<Addend> RealAddends, ImagAddends;
+ if (!Collect(Real, RealMuls, RealAddends) ||
+ !Collect(Imag, ImagMuls, ImagAddends))
+ return nullptr;
+
+ if (RealAddends.size() != ImagAddends.size())
+ return nullptr;
+
+ NodePtr FinalNode;
+ if (!RealMuls.empty() || !ImagMuls.empty()) {
+ // If there are multiplications, extract a positive addend and use it as an
+ // accumulator.
+ FinalNode = extractPositiveAddend(RealAddends, ImagAddends);
+ FinalNode = identifyMultiplications(RealMuls, ImagMuls, FinalNode);
+ if (!FinalNode)
+ return nullptr;
}
- if (TL->isComplexDeinterleavingOperationSupported(
- ComplexDeinterleavingOperation::CAdd, NewVTy) &&
- isInstructionPairAdd(Real, Imag)) {
- return identifyAdd(Real, Imag);
+ // Identify and process remaining additions
+ if (!RealAddends.empty() || !ImagAddends.empty()) {
+ FinalNode = identifyAdditions(RealAddends, ImagAddends, Flags, FinalNode);
+ if (!FinalNode)
+ return nullptr;
}
- auto Symmetric = identifySymmetricOperation(Real, Imag);
- LLVM_DEBUG(if (Symmetric == nullptr) dbgs()
- << " - Not recognised as a valid pattern.\n");
- return Symmetric;
+ // Set the Real and Imag fields of the final node and submit it
+ FinalNode->Real = Real;
+ FinalNode->Imag = Imag;
+ submitCompositeNode(FinalNode);
+ return FinalNode;
+}
+
+bool ComplexDeinterleavingGraph::collectPartialMuls(
+ const std::vector<Product> &RealMuls, const std::vector<Product> &ImagMuls,
+ std::vector<PartialMulCandidate> &PartialMulCandidates) {
+ // Helper function to extract a common operand from two products
+ auto FindCommonInstruction = [](const Product &Real,
+ const Product &Imag) -> Instruction * {
+ if (Real.Multiplicand == Imag.Multiplicand ||
+ Real.Multiplicand == Imag.Multiplier)
+ return Real.Multiplicand;
+
+ if (Real.Multiplier == Imag.Multiplicand ||
+ Real.Multiplier == Imag.Multiplier)
+ return Real.Multiplier;
+
+ return nullptr;
+ };
+
+ // Iterate over real and imaginary multiplications to find common operands.
+ // If a common operand is found, a partial multiplication candidate is created
+ // and added to the candidates vector. The function returns false if a common
+ // operand cannot be found for some product.
+ for (unsigned i = 0; i < RealMuls.size(); ++i) {
+ bool FoundCommon = false;
+ for (unsigned j = 0; j < ImagMuls.size(); ++j) {
+ auto *Common = FindCommonInstruction(RealMuls[i], ImagMuls[j]);
+ if (!Common)
+ continue;
+
+ auto *A = RealMuls[i].Multiplicand == Common ? RealMuls[i].Multiplier
+ : RealMuls[i].Multiplicand;
+ auto *B = ImagMuls[j].Multiplicand == Common ? ImagMuls[j].Multiplier
+ : ImagMuls[j].Multiplicand;
+
+ bool Inverted = false;
+ auto Node = identifyNode(A, B);
+ if (!Node) {
+ std::swap(A, B);
+ Inverted = true;
+ Node = identifyNode(A, B);
+ }
+ if (!Node)
+ continue;
+
+ FoundCommon = true;
+ PartialMulCandidates.push_back({Common, Node, i, j, Inverted});
+ }
+ if (!FoundCommon)
+ return false;
+ }
+ return true;
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyMultiplications(
+ std::vector<Product> &RealMuls, std::vector<Product> &ImagMuls,
+ NodePtr Accumulator = nullptr) {
+ if (RealMuls.size() != ImagMuls.size())
+ return nullptr;
+
+ std::vector<PartialMulCandidate> Info;
+ if (!collectPartialMuls(RealMuls, ImagMuls, Info))
+ return nullptr;
+
+ // Map from common instructions to node pointers
+ std::map<Instruction *, NodePtr> CommonToNode;
+ std::vector<bool> Processed(Info.size(), false);
+ for (unsigned I = 0; I < Info.size(); ++I) {
+ if (Processed[I])
+ continue;
+
+ PartialMulCandidate &InfoA = Info[I];
+ for (unsigned J = I + 1; J < Info.size(); ++J) {
+ if (Processed[J])
+ continue;
+
+ PartialMulCandidate &InfoB = Info[J];
+ auto *InfoReal = &InfoA;
+ auto *InfoImag = &InfoB;
+
+ auto NodeFromCommon = identifyNode(InfoReal->Common, InfoImag->Common);
+ if (!NodeFromCommon) {
+ std::swap(InfoReal, InfoImag);
+ NodeFromCommon = identifyNode(InfoReal->Common, InfoImag->Common);
+ }
+ if (!NodeFromCommon)
+ continue;
+
+ CommonToNode[InfoReal->Common] = NodeFromCommon;
+ CommonToNode[InfoImag->Common] = NodeFromCommon;
+ Processed[I] = true;
+ Processed[J] = true;
+ }
+ }
+
+ std::vector<bool> ProcessedReal(RealMuls.size(), false);
+ std::vector<bool> ProcessedImag(ImagMuls.size(), false);
+ NodePtr Result = Accumulator;
+ for (auto &PMI : Info) {
+ if (ProcessedReal[PMI.RealIdx] || ProcessedImag[PMI.ImagIdx])
+ continue;
+
+ auto It = CommonToNode.find(PMI.Common);
+ // TODO: Process independent complex multiplications. Cases like this:
+ // A.real() * B where both A and B are complex numbers.
+ if (It == CommonToNode.end()) {
+ LLVM_DEBUG({
+ dbgs() << "Unprocessed independent partial multiplication:\n";
+ for (auto *Mul : {&RealMuls[PMI.RealIdx], &ImagMuls[PMI.ImagIdx]})
+ dbgs().indent(4) << (Mul->IsPositive ? "+" : "-") << *Mul->Multiplier
+ << " multiplied by " << *Mul->Multiplicand << "\n";
+ });
+ return nullptr;
+ }
+
+ auto &RealMul = RealMuls[PMI.RealIdx];
+ auto &ImagMul = ImagMuls[PMI.ImagIdx];
+
+ auto NodeA = It->second;
+ auto NodeB = PMI.Node;
+ auto IsMultiplicandReal = PMI.Common == NodeA->Real;
+ // The following table illustrates the relationship between multiplications
+ // and rotations. If we consider the multiplication (X + iY) * (U + iV), we
+ // can see:
+ //
+ // Rotation | Real | Imag |
+ // ---------+--------+--------+
+ // 0 | x * u | x * v |
+ // 90 | -y * v | y * u |
+ // 180 | -x * u | -x * v |
+ // 270 | y * v | -y * u |
+ //
+ // Check if the candidate can indeed be represented by a partial
+ // multiplication.
+ // TODO: Add support for multiplication by complex one
+ if ((IsMultiplicandReal && PMI.IsNodeInverted) ||
+ (!IsMultiplicandReal && !PMI.IsNodeInverted))
+ continue;
+
+ // Determine the rotation based on the multiplications
+ ComplexDeinterleavingRotation Rotation;
+ if (IsMultiplicandReal) {
+ // Detect 0 and 180 degrees rotation
+ if (RealMul.IsPositive && ImagMul.IsPositive)
+ Rotation = llvm::ComplexDeinterleavingRotation::Rotation_0;
+ else if (!RealMul.IsPositive && !ImagMul.IsPositive)
+ Rotation = llvm::ComplexDeinterleavingRotation::Rotation_180;
+ else
+ continue;
+
+ } else {
+ // Detect 90 and 270 degrees rotation
+ if (!RealMul.IsPositive && ImagMul.IsPositive)
+ Rotation = llvm::ComplexDeinterleavingRotation::Rotation_90;
+ else if (RealMul.IsPositive && !ImagMul.IsPositive)
+ Rotation = llvm::ComplexDeinterleavingRotation::Rotation_270;
+ else
+ continue;
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "Identified partial multiplication (X, Y) * (U, V):\n";
+ dbgs().indent(4) << "X: " << *NodeA->Real << "\n";
+ dbgs().indent(4) << "Y: " << *NodeA->Imag << "\n";
+ dbgs().indent(4) << "U: " << *NodeB->Real << "\n";
+ dbgs().indent(4) << "V: " << *NodeB->Imag << "\n";
+ dbgs().indent(4) << "Rotation - " << (int)Rotation * 90 << "\n";
+ });
+
+ NodePtr NodeMul = prepareCompositeNode(
+ ComplexDeinterleavingOperation::CMulPartial, nullptr, nullptr);
+ NodeMul->Rotation = Rotation;
+ NodeMul->addOperand(NodeA);
+ NodeMul->addOperand(NodeB);
+ if (Result)
+ NodeMul->addOperand(Result);
+ submitCompositeNode(NodeMul);
+ Result = NodeMul;
+ ProcessedReal[PMI.RealIdx] = true;
+ ProcessedImag[PMI.ImagIdx] = true;
+ }
+
+ // Ensure all products have been processed; if not, return nullptr.
+ if (!all_of(ProcessedReal, [](bool V) { return V; }) ||
+ !all_of(ProcessedImag, [](bool V) { return V; })) {
+
+ // Dump debug information about which partial multiplications are not
+ // processed.
+ LLVM_DEBUG({
+ dbgs() << "Unprocessed products (Real):\n";
+ for (size_t i = 0; i < ProcessedReal.size(); ++i) {
+ if (!ProcessedReal[i])
+ dbgs().indent(4) << (RealMuls[i].IsPositive ? "+" : "-")
+ << *RealMuls[i].Multiplier << " multiplied by "
+ << *RealMuls[i].Multiplicand << "\n";
+ }
+ dbgs() << "Unprocessed products (Imag):\n";
+ for (size_t i = 0; i < ProcessedImag.size(); ++i) {
+ if (!ProcessedImag[i])
+ dbgs().indent(4) << (ImagMuls[i].IsPositive ? "+" : "-")
+ << *ImagMuls[i].Multiplier << " multiplied by "
+ << *ImagMuls[i].Multiplicand << "\n";
+ }
+ });
+ return nullptr;
+ }
+
+ return Result;
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyAdditions(std::list<Addend> &RealAddends,
+ std::list<Addend> &ImagAddends,
+ FastMathFlags Flags,
+ NodePtr Accumulator = nullptr) {
+ if (RealAddends.size() != ImagAddends.size())
+ return nullptr;
+
+ NodePtr Result;
+ // If we have an accumulator, use it as the first addend.
+ if (Accumulator)
+ Result = Accumulator;
+ // Otherwise find an element with both positive real and imaginary parts.
+ else
+ Result = extractPositiveAddend(RealAddends, ImagAddends);
+
+ if (!Result)
+ return nullptr;
+
+ while (!RealAddends.empty()) {
+ auto ItR = RealAddends.begin();
+ auto [R, IsPositiveR] = *ItR;
+
+ bool FoundImag = false;
+ for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) {
+ auto [I, IsPositiveI] = *ItI;
+ ComplexDeinterleavingRotation Rotation;
+ if (IsPositiveR && IsPositiveI)
+ Rotation = ComplexDeinterleavingRotation::Rotation_0;
+ else if (!IsPositiveR && IsPositiveI)
+ Rotation = ComplexDeinterleavingRotation::Rotation_90;
+ else if (!IsPositiveR && !IsPositiveI)
+ Rotation = ComplexDeinterleavingRotation::Rotation_180;
+ else
+ Rotation = ComplexDeinterleavingRotation::Rotation_270;
+
+ NodePtr AddNode;
+ if (Rotation == ComplexDeinterleavingRotation::Rotation_0 ||
+ Rotation == ComplexDeinterleavingRotation::Rotation_180) {
+ AddNode = identifyNode(R, I);
+ } else {
+ AddNode = identifyNode(I, R);
+ }
+ if (AddNode) {
+ LLVM_DEBUG({
+ dbgs() << "Identified addition:\n";
+ dbgs().indent(4) << "X: " << *R << "\n";
+ dbgs().indent(4) << "Y: " << *I << "\n";
+ dbgs().indent(4) << "Rotation - " << (int)Rotation * 90 << "\n";
+ });
+
+ NodePtr TmpNode;
+ if (Rotation == llvm::ComplexDeinterleavingRotation::Rotation_0) {
+ TmpNode = prepareCompositeNode(
+ ComplexDeinterleavingOperation::Symmetric, nullptr, nullptr);
+ TmpNode->Opcode = Instruction::FAdd;
+ TmpNode->Flags = Flags;
+ } else if (Rotation ==
+ llvm::ComplexDeinterleavingRotation::Rotation_180) {
+ TmpNode = prepareCompositeNode(
+ ComplexDeinterleavingOperation::Symmetric, nullptr, nullptr);
+ TmpNode->Opcode = Instruction::FSub;
+ TmpNode->Flags = Flags;
+ } else {
+ TmpNode = prepareCompositeNode(ComplexDeinterleavingOperation::CAdd,
+ nullptr, nullptr);
+ TmpNode->Rotation = Rotation;
+ }
+
+ TmpNode->addOperand(Result);
+ TmpNode->addOperand(AddNode);
+ submitCompositeNode(TmpNode);
+ Result = TmpNode;
+ RealAddends.erase(ItR);
+ ImagAddends.erase(ItI);
+ FoundImag = true;
+ break;
+ }
+ }
+ if (!FoundImag)
+ return nullptr;
+ }
+ return Result;
+}
+
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::extractPositiveAddend(
+ std::list<Addend> &RealAddends, std::list<Addend> &ImagAddends) {
+ for (auto ItR = RealAddends.begin(); ItR != RealAddends.end(); ++ItR) {
+ for (auto ItI = ImagAddends.begin(); ItI != ImagAddends.end(); ++ItI) {
+ auto [R, IsPositiveR] = *ItR;
+ auto [I, IsPositiveI] = *ItI;
+ if (IsPositiveR && IsPositiveI) {
+ auto Result = identifyNode(R, I);
+ if (Result) {
+ RealAddends.erase(ItR);
+ ImagAddends.erase(ItI);
+ return Result;
+ }
+ }
+ }
+ }
+ return nullptr;
}
bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
@@ -1011,29 +1524,28 @@ ComplexDeinterleavingGraph::identifyDeinterleave(Instruction *Real,
return submitCompositeNode(PlaceholderNode);
}
-static Value *replaceSymmetricNode(IRBuilderBase &B,
- ComplexDeinterleavingGraph::RawNodePtr Node,
- Value *InputA, Value *InputB) {
- Instruction *I = Node->Real;
- if (I->isUnaryOp())
- assert(!InputB &&
- "Unary symmetric operations need one input, but two were provided.");
- else if (I->isBinaryOp())
- assert(InputB && "Binary symmetric operations need two inputs, only one "
- "was provided.");
-
- switch (I->getOpcode()) {
+static Value *replaceSymmetricNode(IRBuilderBase &B, unsigned Opcode,
+ FastMathFlags Flags, Value *InputA,
+ Value *InputB) {
+ Value *I;
+ switch (Opcode) {
case Instruction::FNeg:
- return B.CreateFNegFMF(InputA, I);
+ I = B.CreateFNeg(InputA);
+ break;
case Instruction::FAdd:
- return B.CreateFAddFMF(InputA, InputB, I);
+ I = B.CreateFAdd(InputA, InputB);
+ break;
case Instruction::FSub:
- return B.CreateFSubFMF(InputA, InputB, I);
+ I = B.CreateFSub(InputA, InputB);
+ break;
case Instruction::FMul:
- return B.CreateFMulFMF(InputA, InputB, I);
+ I = B.CreateFMul(InputA, InputB);
+ break;
+ default:
+ llvm_unreachable("Incorrect symmetric opcode");
}
-
- return nullptr;
+ cast<Instruction>(I)->setFastMathFlags(Flags);
+ return I;
}
Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
@@ -1048,13 +1560,13 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
Value *Accumulator = Node->Operands.size() > 2
? replaceNode(Builder, Node->Operands[2])
: nullptr;
-
if (Input1)
assert(Input0->getType() == Input1->getType() &&
"Node inputs need to be of the same type");
if (Node->Operation == ComplexDeinterleavingOperation::Symmetric)
- Node->ReplacementNode = replaceSymmetricNode(Builder, Node, Input0, Input1);
+ Node->ReplacementNode = replaceSymmetricNode(Builder, Node->Opcode,
+ Node->Flags, Input0, Input1);
else
Node->ReplacementNode = TL->createComplexDeinterleavingIR(
Builder, Node->Operation, Node->Rotation, Input0, Input1, Accumulator);
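As a reading aid for the rotation table in identifyMultiplications above,
the sign-to-rotation mapping can be condensed into a standalone sketch (an
editorial illustration with hypothetical names; the pass itself works on
graph nodes and also accounts for operand inversion):

    #include <optional>

    enum class Rot { R0, R90, R180, R270 };

    // For (X + iY) * (U + iV): MultiplicandReal says whether the common
    // operand is the real part X; RealPos/ImagPos are the signs of the
    // real-part and imaginary-part products.
    std::optional<Rot> classifyRotation(bool MultiplicandReal, bool RealPos,
                                        bool ImagPos) {
      if (MultiplicandReal) {
        if (RealPos && ImagPos)   return Rot::R0;   //  x*u,  x*v
        if (!RealPos && !ImagPos) return Rot::R180; // -x*u, -x*v
      } else {
        if (!RealPos && ImagPos)  return Rot::R90;  // -y*v,  y*u
        if (RealPos && !ImagPos)  return Rot::R270; //  y*v, -y*u
      }
      return std::nullopt; // mixed signs: no single partial multiplication
    }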
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll
index 577c3ce8d95e1..76e90e92433b3 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-fixed-fast.ll
@@ -7,18 +7,12 @@ target triple = "aarch64-arm-none-eabi"
define <4 x double> @mull_add(<4 x double> %a, <4 x double> %b, <4 x double> %c) {
; CHECK-LABEL: mull_add:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: zip2 v6.2d, v4.2d, v5.2d
-; CHECK-NEXT: zip1 v7.2d, v0.2d, v1.2d
-; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: zip1 v1.2d, v4.2d, v5.2d
-; CHECK-NEXT: zip1 v4.2d, v2.2d, v3.2d
-; CHECK-NEXT: zip2 v2.2d, v2.2d, v3.2d
-; CHECK-NEXT: fmla v6.2d, v0.2d, v4.2d
-; CHECK-NEXT: fmla v1.2d, v7.2d, v4.2d
-; CHECK-NEXT: fmla v6.2d, v7.2d, v2.2d
-; CHECK-NEXT: fmls v1.2d, v0.2d, v2.2d
-; CHECK-NEXT: zip1 v0.2d, v1.2d, v6.2d
-; CHECK-NEXT: zip2 v1.2d, v1.2d, v6.2d
+; CHECK-NEXT: fcmla v4.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT: fcmla v5.2d, v3.2d, v1.2d, #0
+; CHECK-NEXT: fcmla v4.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT: fcmla v5.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT: mov v0.16b, v4.16b
+; CHECK-NEXT: mov v1.16b, v5.16b
; CHECK-NEXT: ret
entry:
%strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
@@ -43,25 +37,18 @@ entry:
define <4 x double> @mul_add_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
; CHECK-LABEL: mul_add_mull:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: zip1 v16.2d, v2.2d, v3.2d
-; CHECK-NEXT: zip1 v17.2d, v0.2d, v1.2d
-; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d
-; CHECK-NEXT: zip1 v2.2d, v4.2d, v5.2d
-; CHECK-NEXT: zip2 v3.2d, v4.2d, v5.2d
-; CHECK-NEXT: fmul v4.2d, v16.2d, v0.2d
-; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d
-; CHECK-NEXT: zip2 v6.2d, v6.2d, v7.2d
-; CHECK-NEXT: fmul v0.2d, v1.2d, v0.2d
-; CHECK-NEXT: fmul v7.2d, v16.2d, v17.2d
-; CHECK-NEXT: fmla v4.2d, v17.2d, v1.2d
-; CHECK-NEXT: fmla v0.2d, v3.2d, v6.2d
-; CHECK-NEXT: fmla v7.2d, v2.2d, v5.2d
-; CHECK-NEXT: fmla v4.2d, v3.2d, v5.2d
-; CHECK-NEXT: fsub v1.2d, v7.2d, v0.2d
-; CHECK-NEXT: fmla v4.2d, v2.2d, v6.2d
-; CHECK-NEXT: zip1 v0.2d, v1.2d, v4.2d
-; CHECK-NEXT: zip2 v1.2d, v1.2d, v4.2d
+; CHECK-NEXT: movi v16.2d, #0000000000000000
+; CHECK-NEXT: movi v17.2d, #0000000000000000
+; CHECK-NEXT: fcmla v16.2d, v4.2d, v6.2d, #0
+; CHECK-NEXT: fcmla v17.2d, v5.2d, v7.2d, #0
+; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #0
+; CHECK-NEXT: fcmla v16.2d, v4.2d, v6.2d, #90
+; CHECK-NEXT: fcmla v17.2d, v5.2d, v7.2d, #90
+; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT: mov v0.16b, v16.16b
+; CHECK-NEXT: mov v1.16b, v17.16b
; CHECK-NEXT: ret
entry:
%strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
@@ -94,26 +81,18 @@ entry:
define <4 x double> @mul_sub_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
; CHECK-LABEL: mul_sub_mull:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: zip1 v17.2d, v2.2d, v3.2d
-; CHECK-NEXT: zip1 v18.2d, v0.2d, v1.2d
-; CHECK-NEXT: zip2 v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: zip2 v1.2d, v2.2d, v3.2d
-; CHECK-NEXT: zip2 v2.2d, v4.2d, v5.2d
-; CHECK-NEXT: zip1 v3.2d, v6.2d, v7.2d
-; CHECK-NEXT: zip1 v16.2d, v4.2d, v5.2d
-; CHECK-NEXT: fmul v4.2d, v17.2d, v0.2d
-; CHECK-NEXT: fmul v5.2d, v17.2d, v18.2d
-; CHECK-NEXT: fmul v0.2d, v1.2d, v0.2d
-; CHECK-NEXT: zip2 v6.2d, v6.2d, v7.2d
-; CHECK-NEXT: fmul v7.2d, v3.2d, v2.2d
-; CHECK-NEXT: fmla v4.2d, v18.2d, v1.2d
-; CHECK-NEXT: fmla v0.2d, v16.2d, v3.2d
-; CHECK-NEXT: fmla v5.2d, v2.2d, v6.2d
-; CHECK-NEXT: fmla v7.2d, v16.2d, v6.2d
-; CHECK-NEXT: fsub v1.2d, v5.2d, v0.2d
-; CHECK-NEXT: fsub v2.2d, v4.2d, v7.2d
-; CHECK-NEXT: zip1 v0.2d, v1.2d, v2.2d
-; CHECK-NEXT: zip2 v1.2d, v1.2d, v2.2d
+; CHECK-NEXT: movi v16.2d, #0000000000000000
+; CHECK-NEXT: movi v17.2d, #0000000000000000
+; CHECK-NEXT: fcmla v16.2d, v4.2d, v6.2d, #270
+; CHECK-NEXT: fcmla v17.2d, v5.2d, v7.2d, #270
+; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #0
+; CHECK-NEXT: fcmla v16.2d, v4.2d, v6.2d, #180
+; CHECK-NEXT: fcmla v17.2d, v5.2d, v7.2d, #180
+; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT: mov v0.16b, v16.16b
+; CHECK-NEXT: mov v1.16b, v17.16b
; CHECK-NEXT: ret
entry:
%strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
@@ -146,25 +125,18 @@ entry:
define <4 x double> @mul_conj_mull(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) {
; CHECK-LABEL: mul_conj_mull:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: zip2 v16.2d, v2.2d, v3.2d
-; CHECK-NEXT: zip2 v17.2d, v0.2d, v1.2d
-; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d
-; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: fmul v3.2d, v16.2d, v17.2d
-; CHECK-NEXT: fmul v1.2d, v2.2d, v17.2d
-; CHECK-NEXT: zip1 v17.2d, v4.2d, v5.2d
-; CHECK-NEXT: zip2 v4.2d, v4.2d, v5.2d
-; CHECK-NEXT: fneg v3.2d, v3.2d
-; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d
-; CHECK-NEXT: fmla v1.2d, v0.2d, v16.2d
-; CHECK-NEXT: fmla v3.2d, v0.2d, v2.2d
-; CHECK-NEXT: zip2 v0.2d, v6.2d, v7.2d
-; CHECK-NEXT: fmls v1.2d, v4.2d, v5.2d
-; CHECK-NEXT: fmla v3.2d, v17.2d, v5.2d
-; CHECK-NEXT: fmla v1.2d, v17.2d, v0.2d
-; CHECK-NEXT: fmla v3.2d, v4.2d, v0.2d
-; CHECK-NEXT: zip1 v0.2d, v3.2d, v1.2d
-; CHECK-NEXT: zip2 v1.2d, v3.2d, v1.2d
+; CHECK-NEXT: movi v16.2d, #0000000000000000
+; CHECK-NEXT: movi v17.2d, #0000000000000000
+; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #0
+; CHECK-NEXT: fcmla v16.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT: fcmla v17.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT: fcmla v16.2d, v6.2d, v4.2d, #0
+; CHECK-NEXT: fcmla v17.2d, v7.2d, v5.2d, #0
+; CHECK-NEXT: fcmla v16.2d, v6.2d, v4.2d, #270
+; CHECK-NEXT: fcmla v17.2d, v7.2d, v5.2d, #270
+; CHECK-NEXT: mov v0.16b, v16.16b
+; CHECK-NEXT: mov v1.16b, v17.16b
; CHECK-NEXT: ret
entry:
%strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
index f801a1bfd7e0a..0576475c8168a 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-add-mull-scalable-fast.ll
@@ -7,21 +7,13 @@ target triple = "aarch64-arm-none-eabi"
define <vscale x 4 x double> @mull_add(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c) {
; CHECK-LABEL: mull_add:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uzp2 z6.d, z4.d, z5.d
-; CHECK-NEXT: uzp1 z7.d, z0.d, z1.d
-; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d
-; CHECK-NEXT: uzp1 z1.d, z4.d, z5.d
-; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: fmla z1.d, p0/m, z4.d, z7.d
-; CHECK-NEXT: uzp2 z2.d, z2.d, z3.d
-; CHECK-NEXT: movprfx z5, z6
-; CHECK-NEXT: fmla z5.d, p0/m, z4.d, z0.d
-; CHECK-NEXT: movprfx z3, z5
-; CHECK-NEXT: fmla z3.d, p0/m, z2.d, z7.d
-; CHECK-NEXT: fmls z1.d, p0/m, z2.d, z0.d
-; CHECK-NEXT: zip1 z0.d, z1.d, z3.d
-; CHECK-NEXT: zip2 z1.d, z1.d, z3.d
+; CHECK-NEXT: fcmla z4.d, p0/m, z0.d, z2.d, #0
+; CHECK-NEXT: fcmla z5.d, p0/m, z1.d, z3.d, #0
+; CHECK-NEXT: fcmla z4.d, p0/m, z0.d, z2.d, #90
+; CHECK-NEXT: fcmla z5.d, p0/m, z1.d, z3.d, #90
+; CHECK-NEXT: mov z0.d, z4.d
+; CHECK-NEXT: mov z1.d, z5.d
; CHECK-NEXT: ret
entry:
%strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -49,26 +41,19 @@ entry:
define <vscale x 4 x double> @mul_add_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
; CHECK-LABEL: mul_add_mull:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d
-; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d
-; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d
-; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d
-; CHECK-NEXT: fmul z2.d, z1.d, z0.d
+; CHECK-NEXT: mov z24.d, #0 // =0x0
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d
-; CHECK-NEXT: uzp1 z26.d, z6.d, z7.d
-; CHECK-NEXT: fmul z1.d, z1.d, z25.d
-; CHECK-NEXT: fmul z0.d, z24.d, z0.d
-; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d
-; CHECK-NEXT: uzp2 z5.d, z6.d, z7.d
-; CHECK-NEXT: fmla z1.d, p0/m, z26.d, z4.d
-; CHECK-NEXT: fmla z2.d, p0/m, z26.d, z3.d
-; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z3.d
-; CHECK-NEXT: fmla z2.d, p0/m, z5.d, z4.d
-; CHECK-NEXT: fsub z1.d, z1.d, z0.d
-; CHECK-NEXT: zip1 z0.d, z1.d, z2.d
-; CHECK-NEXT: zip2 z1.d, z1.d, z2.d
+; CHECK-NEXT: mov z25.d, z24.d
+; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #0
+; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #0
+; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0
+; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0
+; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #90
+; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #90
+; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90
+; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90
+; CHECK-NEXT: mov z1.d, z24.d
+; CHECK-NEXT: mov z0.d, z25.d
; CHECK-NEXT: ret
entry:
%strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -105,27 +90,19 @@ entry:
define <vscale x 4 x double> @mul_sub_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
; CHECK-LABEL: mul_sub_mull:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d
-; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d
-; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d
+; CHECK-NEXT: mov z24.d, #0 // =0x0
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d
-; CHECK-NEXT: fmul z2.d, z1.d, z0.d
-; CHECK-NEXT: fmul z1.d, z1.d, z25.d
-; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d
-; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d
-; CHECK-NEXT: uzp1 z5.d, z6.d, z7.d
-; CHECK-NEXT: uzp2 z6.d, z6.d, z7.d
-; CHECK-NEXT: fmul z0.d, z24.d, z0.d
-; CHECK-NEXT: fmla z1.d, p0/m, z6.d, z3.d
-; CHECK-NEXT: fmul z3.d, z5.d, z3.d
-; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z4.d
-; CHECK-NEXT: fmla z3.d, p0/m, z6.d, z4.d
-; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEXT: fsub z1.d, z1.d, z0.d
-; CHECK-NEXT: fsub z2.d, z2.d, z3.d
-; CHECK-NEXT: zip1 z0.d, z1.d, z2.d
-; CHECK-NEXT: zip2 z1.d, z1.d, z2.d
+; CHECK-NEXT: mov z25.d, z24.d
+; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #270
+; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #270
+; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0
+; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0
+; CHECK-NEXT: fcmla z24.d, p0/m, z7.d, z5.d, #180
+; CHECK-NEXT: fcmla z25.d, p0/m, z6.d, z4.d, #180
+; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90
+; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90
+; CHECK-NEXT: mov z1.d, z24.d
+; CHECK-NEXT: mov z0.d, z25.d
; CHECK-NEXT: ret
entry:
%strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
@@ -162,26 +139,19 @@ entry:
define <vscale x 4 x double> @mul_conj_mull(<vscale x 4 x double> %a, <vscale x 4 x double> %b, <vscale x 4 x double> %c, <vscale x 4 x double> %d) {
; CHECK-LABEL: mul_conj_mull:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: uzp2 z24.d, z2.d, z3.d
-; CHECK-NEXT: uzp1 z25.d, z0.d, z1.d
-; CHECK-NEXT: uzp2 z0.d, z0.d, z1.d
-; CHECK-NEXT: uzp1 z1.d, z2.d, z3.d
-; CHECK-NEXT: fmul z2.d, z1.d, z0.d
+; CHECK-NEXT: mov z24.d, #0 // =0x0
; CHECK-NEXT: ptrue p0.d
-; CHECK-NEXT: fmul z0.d, z24.d, z0.d
-; CHECK-NEXT: fmla z2.d, p0/m, z24.d, z25.d
-; CHECK-NEXT: uzp2 z3.d, z4.d, z5.d
-; CHECK-NEXT: uzp1 z4.d, z4.d, z5.d
-; CHECK-NEXT: uzp1 z5.d, z6.d, z7.d
-; CHECK-NEXT: fnmls z0.d, p0/m, z1.d, z25.d
-; CHECK-NEXT: fmla z0.d, p0/m, z5.d, z4.d
-; CHECK-NEXT: movprfx z1, z2
-; CHECK-NEXT: fmls z1.d, p0/m, z5.d, z3.d
-; CHECK-NEXT: uzp2 z2.d, z6.d, z7.d
-; CHECK-NEXT: fmla z1.d, p0/m, z2.d, z4.d
-; CHECK-NEXT: fmad z3.d, p0/m, z2.d, z0.d
-; CHECK-NEXT: zip1 z0.d, z3.d, z1.d
-; CHECK-NEXT: zip2 z1.d, z3.d, z1.d
+; CHECK-NEXT: mov z25.d, z24.d
+; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #0
+; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #0
+; CHECK-NEXT: fcmla z24.d, p0/m, z1.d, z3.d, #90
+; CHECK-NEXT: fcmla z25.d, p0/m, z0.d, z2.d, #90
+; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #0
+; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #0
+; CHECK-NEXT: fcmla z24.d, p0/m, z5.d, z7.d, #270
+; CHECK-NEXT: fcmla z25.d, p0/m, z4.d, z6.d, #270
+; CHECK-NEXT: mov z1.d, z24.d
+; CHECK-NEXT: mov z0.d, z25.d
; CHECK-NEXT: ret
entry:
%strided.vec = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %a)
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
index 65012899c97e3..c6cc42d6a45d3 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-mixed-cases.ll
@@ -484,9 +484,9 @@ define <4 x float> @mul_negequal(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: mul_negequal:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: movi v2.2d, #0000000000000000
-; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #0
-; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #90
-; CHECK-NEXT: fneg v0.4s, v2.4s
+; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #180
+; CHECK-NEXT: fcmla v2.4s, v0.4s, v1.4s, #270
+; CHECK-NEXT: mov v0.16b, v2.16b
; CHECK-NEXT: ret
entry:
%strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll
index 9409bb9530e0e..4d6dad1945bde 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-multiuses.ll
@@ -299,50 +299,34 @@ entry:
define void @mul_add_common_mul_add_mul(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d, <4 x double> %e, <4 x double> %f, <4 x double> %g, <4 x double> %h, ptr %p1, ptr %p2) {
; CHECK-LABEL: mul_add_common_mul_add_mul:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldp q17, q16, [sp, #96]
-; CHECK-NEXT: zip2 v20.2d, v4.2d, v5.2d
-; CHECK-NEXT: zip2 v21.2d, v6.2d, v7.2d
-; CHECK-NEXT: zip1 v4.2d, v4.2d, v5.2d
-; CHECK-NEXT: zip1 v5.2d, v6.2d, v7.2d
-; CHECK-NEXT: ldp q19, q18, [sp, #64]
-; CHECK-NEXT: zip2 v23.2d, v17.2d, v16.2d
-; CHECK-NEXT: fmul v6.2d, v21.2d, v20.2d
-; CHECK-NEXT: zip1 v16.2d, v17.2d, v16.2d
-; CHECK-NEXT: zip2 v22.2d, v19.2d, v18.2d
-; CHECK-NEXT: zip1 v18.2d, v19.2d, v18.2d
-; CHECK-NEXT: fneg v6.2d, v6.2d
-; CHECK-NEXT: fmul v20.2d, v5.2d, v20.2d
-; CHECK-NEXT: fmul v7.2d, v22.2d, v23.2d
-; CHECK-NEXT: fmla v6.2d, v4.2d, v5.2d
-; CHECK-NEXT: zip2 v5.2d, v2.2d, v3.2d
-; CHECK-NEXT: fneg v7.2d, v7.2d
-; CHECK-NEXT: zip1 v2.2d, v2.2d, v3.2d
-; CHECK-NEXT: fmla v7.2d, v18.2d, v16.2d
-; CHECK-NEXT: fadd v19.2d, v7.2d, v6.2d
-; CHECK-NEXT: fmla v20.2d, v4.2d, v21.2d
-; CHECK-NEXT: zip2 v4.2d, v0.2d, v1.2d
-; CHECK-NEXT: ldp q7, q6, [sp]
-; CHECK-NEXT: zip1 v0.2d, v0.2d, v1.2d
-; CHECK-NEXT: fmla v20.2d, v18.2d, v23.2d
-; CHECK-NEXT: fmul v1.2d, v2.2d, v4.2d
-; CHECK-NEXT: fmla v20.2d, v22.2d, v16.2d
-; CHECK-NEXT: mov v3.16b, v19.16b
-; CHECK-NEXT: fmla v1.2d, v0.2d, v5.2d
-; CHECK-NEXT: fmla v3.2d, v4.2d, v5.2d
-; CHECK-NEXT: ldp q16, q4, [sp, #32]
-; CHECK-NEXT: fneg v17.2d, v3.2d
-; CHECK-NEXT: zip1 v3.2d, v7.2d, v6.2d
-; CHECK-NEXT: zip2 v6.2d, v7.2d, v6.2d
-; CHECK-NEXT: zip1 v5.2d, v16.2d, v4.2d
-; CHECK-NEXT: fmla v17.2d, v0.2d, v2.2d
-; CHECK-NEXT: fsub v18.2d, v1.2d, v20.2d
-; CHECK-NEXT: zip2 v0.2d, v16.2d, v4.2d
-; CHECK-NEXT: fmla v19.2d, v3.2d, v5.2d
-; CHECK-NEXT: st2 { v17.2d, v18.2d }, [x0]
-; CHECK-NEXT: fmls v19.2d, v6.2d, v0.2d
-; CHECK-NEXT: fmla v20.2d, v6.2d, v5.2d
-; CHECK-NEXT: fmla v20.2d, v3.2d, v0.2d
-; CHECK-NEXT: st2 { v19.2d, v20.2d }, [x1]
+; CHECK-NEXT: ldp q17, q16, [sp, #64]
+; CHECK-NEXT: movi v20.2d, #0000000000000000
+; CHECK-NEXT: movi v21.2d, #0000000000000000
+; CHECK-NEXT: movi v24.2d, #0000000000000000
+; CHECK-NEXT: movi v25.2d, #0000000000000000
+; CHECK-NEXT: ldp q19, q18, [sp, #96]
+; CHECK-NEXT: fcmla v24.2d, v2.2d, v0.2d, #0
+; CHECK-NEXT: fcmla v25.2d, v3.2d, v1.2d, #0
+; CHECK-NEXT: fcmla v20.2d, v19.2d, v17.2d, #0
+; CHECK-NEXT: fcmla v24.2d, v2.2d, v0.2d, #90
+; CHECK-NEXT: fcmla v21.2d, v18.2d, v16.2d, #0
+; CHECK-NEXT: ldp q23, q22, [sp, #32]
+; CHECK-NEXT: fcmla v20.2d, v19.2d, v17.2d, #90
+; CHECK-NEXT: fcmla v25.2d, v3.2d, v1.2d, #90
+; CHECK-NEXT: fcmla v21.2d, v18.2d, v16.2d, #90
+; CHECK-NEXT: fcmla v20.2d, v6.2d, v4.2d, #0
+; CHECK-NEXT: ldp q1, q0, [sp]
+; CHECK-NEXT: fcmla v21.2d, v7.2d, v5.2d, #0
+; CHECK-NEXT: fcmla v20.2d, v6.2d, v4.2d, #90
+; CHECK-NEXT: fcmla v21.2d, v7.2d, v5.2d, #90
+; CHECK-NEXT: fsub v2.2d, v24.2d, v20.2d
+; CHECK-NEXT: fcmla v20.2d, v1.2d, v23.2d, #0
+; CHECK-NEXT: fsub v3.2d, v25.2d, v21.2d
+; CHECK-NEXT: fcmla v21.2d, v0.2d, v22.2d, #0
+; CHECK-NEXT: fcmla v20.2d, v1.2d, v23.2d, #90
+; CHECK-NEXT: stp q2, q3, [x0]
+; CHECK-NEXT: fcmla v21.2d, v0.2d, v22.2d, #90
+; CHECK-NEXT: stp q20, q21, [x1]
; CHECK-NEXT: ret
entry:
%strided.vec = shufflevector <4 x double> %a, <4 x double> poison, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
index d9a279d1a79e7..3a1d909b9d8c7 100644
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-uniform-cases.ll
@@ -115,15 +115,7 @@ entry:
define <4 x float> @simple_add_270_false(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: simple_add_270_false:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ext v2.16b, v0.16b, v0.16b, #8
-; CHECK-NEXT: ext v3.16b, v1.16b, v1.16b, #8
-; CHECK-NEXT: zip1 v4.2s, v0.2s, v2.2s
-; CHECK-NEXT: zip2 v0.2s, v0.2s, v2.2s
-; CHECK-NEXT: zip1 v2.2s, v1.2s, v3.2s
-; CHECK-NEXT: zip2 v1.2s, v1.2s, v3.2s
-; CHECK-NEXT: fadd v1.2s, v1.2s, v4.2s
-; CHECK-NEXT: fsub v0.2s, v0.2s, v2.2s
-; CHECK-NEXT: zip1 v0.4s, v1.4s, v0.4s
+; CHECK-NEXT: fcadd v0.4s, v0.4s, v1.4s, #270
; CHECK-NEXT: ret
entry:
%strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
index a7211d196bf3b..2f09c98891d03 100644
--- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-mixed-cases.ll
@@ -553,11 +553,10 @@ define <4 x float> @mul_negequal(<4 x float> %a, <4 x float> %b) {
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: vmov d1, r2, r3
-; CHECK-NEXT: vcmul.f32 q2, q0, q1, #0
-; CHECK-NEXT: vcmla.f32 q2, q0, q1, #90
-; CHECK-NEXT: vneg.f32 q0, q2
-; CHECK-NEXT: vmov r0, r1, d0
-; CHECK-NEXT: vmov r2, r3, d1
+; CHECK-NEXT: vcmul.f32 q2, q0, q1, #180
+; CHECK-NEXT: vcmla.f32 q2, q0, q1, #270
+; CHECK-NEXT: vmov r0, r1, d4
+; CHECK-NEXT: vmov r2, r3, d5
; CHECK-NEXT: bx lr
entry:
%strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
diff --git a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll
index 38c56c674267f..93d9797a38a1f 100644
--- a/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-complex-deinterleaving-uniform-cases.ll
@@ -118,19 +118,8 @@ entry:
define arm_aapcs_vfpcc <4 x float> @simple_add_270_false(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: simple_add_270_false:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vmov.f32 s8, s4
-; CHECK-NEXT: vmov.f32 s12, s1
-; CHECK-NEXT: vmov.f32 s4, s5
-; CHECK-NEXT: vmov.f32 s9, s6
-; CHECK-NEXT: vmov.f32 s13, s3
-; CHECK-NEXT: vmov.f32 s1, s2
-; CHECK-NEXT: vsub.f32 q2, q3, q2
-; CHECK-NEXT: vmov.f32 s5, s7
-; CHECK-NEXT: vadd.f32 q1, q1, q0
-; CHECK-NEXT: vmov.f32 s1, s8
-; CHECK-NEXT: vmov.f32 s0, s4
-; CHECK-NEXT: vmov.f32 s2, s5
-; CHECK-NEXT: vmov.f32 s3, s9
+; CHECK-NEXT: vcadd.f32 q2, q0, q1, #270
+; CHECK-NEXT: vmov q0, q2
; CHECK-NEXT: bx lr
entry:
%strided.vec = shufflevector <4 x float> %a, <4 x float> poison, <2 x i32> <i32 0, i32 2>
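Finally, when reading the fcmla/vcmla CHECK lines above, a scalar model of
one rotated partial multiplication may help: each rotation contributes half
of a complex product, so a full multiply-accumulate lowers to a #0/#90 pair
and a conjugate multiply to a #0/#270 pair. The sketch below follows the
pass's rotation table; treating its a and b as the instruction's two source
registers is an assumption of this illustration, not something stated in
the patch:

    #include <complex>

    // One complex lane of a rotated partial multiply-accumulate,
    // acc += partial(a * b, rot), per the pass's rotation table.
    std::complex<double> partialMulAcc(std::complex<double> acc,
                                       std::complex<double> a,
                                       std::complex<double> b, int rot) {
      switch (rot) {
      case 0:   return {acc.real() + a.real() * b.real(),
                        acc.imag() + a.real() * b.imag()};
      case 90:  return {acc.real() - a.imag() * b.imag(),
                        acc.imag() + a.imag() * b.real()};
      case 180: return {acc.real() - a.real() * b.real(),
                        acc.imag() - a.real() * b.imag()};
      case 270: return {acc.real() + a.imag() * b.imag(),
                        acc.imag() - a.imag() * b.real()};
      default:  return acc;
      }
    }
    // partialMulAcc(partialMulAcc(acc, a, b, 0), a, b, 90) == acc + a * b,
    // up to floating-point rounding.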