[llvm] 76714be - Revert "Add support for single reductions in ComplexDeinterleavingPass (#112875)"
Florian Hahn via llvm-commits
llvm-commits at lists.llvm.org
Wed Dec 18 07:07:14 PST 2024
Author: Florian Hahn
Date: 2024-12-18T15:06:52Z
New Revision: 76714be5fd4ace66dd9e19ce706c2e2149dd5716
URL: https://github.com/llvm/llvm-project/commit/76714be5fd4ace66dd9e19ce706c2e2149dd5716
DIFF: https://github.com/llvm/llvm-project/commit/76714be5fd4ace66dd9e19ce706c2e2149dd5716.diff
LOG: Revert "Add support for single reductions in ComplexDeinterleavingPass (#112875)"
This reverts commit b3eede5e1fa7ab742b86e9be22db7bccd2505b8a.
This has been breaking most AArch64 stage2 builds for 4+ hours,
reverting to get the bots back to green.
https://lab.llvm.org/buildbot/#/builders/41/builds/4172
https://lab.llvm.org/buildbot/#/builders/4/builds/4281
https://lab.llvm.org/buildbot/#/builders/199/builds/263
https://lab.llvm.org/buildbot/#/builders/198/builds/334
https://lab.llvm.org/buildbot/#/builders/143/builds/4276
https://lab.llvm.org/buildbot/#/builders/17/builds/4725
Added:
Modified:
llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Removed:
llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
################################################################################
diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
index 4383249658e606..84a2673fecb5bf 100644
--- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -35,7 +35,6 @@ struct ComplexDeinterleavingPass
enum class ComplexDeinterleavingOperation {
CAdd,
CMulPartial,
- CDot,
// The following 'operations' are used to represent internal states. Backends
// are not expected to try and support these in any capacity.
Deinterleave,
@@ -44,7 +43,6 @@ enum class ComplexDeinterleavingOperation {
ReductionPHI,
ReductionOperation,
ReductionSelect,
- ReductionSingle
};
enum class ComplexDeinterleavingRotation {
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 3111354addacd1..f3f7ea9407b46f 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -108,13 +108,6 @@ static bool isNeg(Value *V);
static Value *getNegOperand(Value *V);
namespace {
-template <typename T, typename IterT>
-std::optional<T> findCommonBetweenCollections(IterT A, IterT B) {
- auto Common = llvm::find_if(A, [B](T I) { return llvm::is_contained(B, I); });
- if (Common != A.end())
- return std::make_optional(*Common);
- return std::nullopt;
-}
class ComplexDeinterleavingLegacyPass : public FunctionPass {
public:
@@ -151,7 +144,6 @@ struct ComplexDeinterleavingCompositeNode {
friend class ComplexDeinterleavingGraph;
using NodePtr = std::shared_ptr<ComplexDeinterleavingCompositeNode>;
using RawNodePtr = ComplexDeinterleavingCompositeNode *;
- bool OperandsValid = true;
public:
ComplexDeinterleavingOperation Operation;
@@ -168,11 +160,7 @@ struct ComplexDeinterleavingCompositeNode {
SmallVector<RawNodePtr> Operands;
Value *ReplacementNode = nullptr;
- void addOperand(NodePtr Node) {
- if (!Node || !Node.get())
- OperandsValid = false;
- Operands.push_back(Node.get());
- }
+ void addOperand(NodePtr Node) { Operands.push_back(Node.get()); }
void dump() { dump(dbgs()); }
void dump(raw_ostream &OS) {
@@ -206,8 +194,6 @@ struct ComplexDeinterleavingCompositeNode {
PrintNodeRef(Op);
}
}
-
- bool areOperandsValid() { return OperandsValid; }
};
class ComplexDeinterleavingGraph {
@@ -307,7 +293,7 @@ class ComplexDeinterleavingGraph {
NodePtr submitCompositeNode(NodePtr Node) {
CompositeNodes.push_back(Node);
- if (Node->Real)
+ if (Node->Real && Node->Imag)
CachedResult[{Node->Real, Node->Imag}] = Node;
return Node;
}
@@ -341,8 +327,6 @@ class ComplexDeinterleavingGraph {
/// i: ai - br
NodePtr identifyAdd(Instruction *Real, Instruction *Imag);
NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag);
- NodePtr identifyPartialReduction(Value *R, Value *I);
- NodePtr identifyDotProduct(Value *Inst);
NodePtr identifyNode(Value *R, Value *I);
@@ -412,7 +396,6 @@ class ComplexDeinterleavingGraph {
/// * Deinterleave the final value outside of the loop and repurpose original
/// reduction users
void processReductionOperation(Value *OperationReplacement, RawNodePtr Node);
- void processReductionSingle(Value *OperationReplacement, RawNodePtr Node);
public:
void dump() { dump(dbgs()); }
@@ -907,164 +890,18 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
return submitCompositeNode(Node);
}
-ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifyDotProduct(Value *V) {
-
- if (!TL->isComplexDeinterleavingOperationSupported(
- ComplexDeinterleavingOperation::CDot, V->getType())) {
- LLVM_DEBUG(dbgs() << "Target doesn't support complex deinterleaving "
- "operation CDot with the type "
- << *V->getType() << "\n");
- return nullptr;
- }
-
- auto *Inst = cast<Instruction>(V);
- auto *RealUser = cast<Instruction>(*Inst->user_begin());
-
- NodePtr CN =
- prepareCompositeNode(ComplexDeinterleavingOperation::CDot, Inst, nullptr);
-
- NodePtr ANode;
-
- const Intrinsic::ID PartialReduceInt =
- Intrinsic::experimental_vector_partial_reduce_add;
-
- Value *AReal = nullptr;
- Value *AImag = nullptr;
- Value *BReal = nullptr;
- Value *BImag = nullptr;
- Value *Phi = nullptr;
-
- auto UnwrapCast = [](Value *V) -> Value * {
- if (auto *CI = dyn_cast<CastInst>(V))
- return CI->getOperand(0);
- return V;
- };
-
- auto PatternRot0 = m_Intrinsic<PartialReduceInt>(
- m_Intrinsic<PartialReduceInt>(m_Value(Phi),
- m_Mul(m_Value(BReal), m_Value(AReal))),
- m_Neg(m_Mul(m_Value(BImag), m_Value(AImag))));
-
- auto PatternRot270 = m_Intrinsic<PartialReduceInt>(
- m_Intrinsic<PartialReduceInt>(
- m_Value(Phi), m_Neg(m_Mul(m_Value(BReal), m_Value(AImag)))),
- m_Mul(m_Value(BImag), m_Value(AReal)));
-
- if (match(Inst, PatternRot0)) {
- CN->Rotation = ComplexDeinterleavingRotation::Rotation_0;
- } else if (match(Inst, PatternRot270)) {
- CN->Rotation = ComplexDeinterleavingRotation::Rotation_270;
- } else {
- Value *A0, *A1;
- // The rotations 90 and 180 share the same operation pattern, so inspect the
- // order of the operands, identifying where the real and imaginary
- // components of A go, to discern between the aforementioned rotations.
- auto PatternRot90Rot180 = m_Intrinsic<PartialReduceInt>(
- m_Intrinsic<PartialReduceInt>(m_Value(Phi),
- m_Mul(m_Value(BReal), m_Value(A0))),
- m_Mul(m_Value(BImag), m_Value(A1)));
-
- if (!match(Inst, PatternRot90Rot180))
- return nullptr;
-
- A0 = UnwrapCast(A0);
- A1 = UnwrapCast(A1);
-
- // Test if A0 is real/A1 is imag
- ANode = identifyNode(A0, A1);
- if (!ANode) {
- // Test if A0 is imag/A1 is real
- ANode = identifyNode(A1, A0);
- // Unable to identify operand components, thus unable to identify rotation
- if (!ANode)
- return nullptr;
- CN->Rotation = ComplexDeinterleavingRotation::Rotation_90;
- AReal = A1;
- AImag = A0;
- } else {
- AReal = A0;
- AImag = A1;
- CN->Rotation = ComplexDeinterleavingRotation::Rotation_180;
- }
- }
-
- AReal = UnwrapCast(AReal);
- AImag = UnwrapCast(AImag);
- BReal = UnwrapCast(BReal);
- BImag = UnwrapCast(BImag);
-
- VectorType *VTy = cast<VectorType>(V->getType());
- Type *ExpectedOperandTy = VectorType::getSubdividedVectorType(VTy, 2);
- if (AReal->getType() != ExpectedOperandTy)
- return nullptr;
- if (AImag->getType() != ExpectedOperandTy)
- return nullptr;
- if (BReal->getType() != ExpectedOperandTy)
- return nullptr;
- if (BImag->getType() != ExpectedOperandTy)
- return nullptr;
-
- if (Phi->getType() != VTy && RealUser->getType() != VTy)
- return nullptr;
-
- NodePtr Node = identifyNode(AReal, AImag);
-
- // In the case that a node was identified to figure out the rotation, ensure
- // that trying to identify a node with AReal and AImag post-unwrap results in
- // the same node
- if (ANode && Node != ANode) {
- LLVM_DEBUG(
- dbgs()
- << "Identified node is
diff erent from previously identified node. "
- "Unable to confidently generate a complex operation node\n");
- return nullptr;
- }
-
- CN->addOperand(Node);
- CN->addOperand(identifyNode(BReal, BImag));
- CN->addOperand(identifyNode(Phi, RealUser));
-
- return submitCompositeNode(CN);
-}
-
-ComplexDeinterleavingGraph::NodePtr
-ComplexDeinterleavingGraph::identifyPartialReduction(Value *R, Value *I) {
- // Partial reductions don't support non-vector types, so check these first
- if (!isa<VectorType>(R->getType()) || !isa<VectorType>(I->getType()))
- return nullptr;
-
- auto CommonUser =
- findCommonBetweenCollections<Value *>(R->users(), I->users());
- if (!CommonUser)
- return nullptr;
-
- auto *IInst = dyn_cast<IntrinsicInst>(*CommonUser);
- if (!IInst || IInst->getIntrinsicID() !=
- Intrinsic::experimental_vector_partial_reduce_add)
- return nullptr;
-
- if (NodePtr CN = identifyDotProduct(IInst))
- return CN;
-
- return nullptr;
-}
-
ComplexDeinterleavingGraph::NodePtr
ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) {
+ LLVM_DEBUG(dbgs() << "identifyNode on " << *R << " / " << *I << "\n");
+ assert(R->getType() == I->getType() &&
+ "Real and imaginary parts should not have
diff erent types");
+
auto It = CachedResult.find({R, I});
if (It != CachedResult.end()) {
LLVM_DEBUG(dbgs() << " - Folding to existing node\n");
return It->second;
}
- if (NodePtr CN = identifyPartialReduction(R, I))
- return CN;
-
- bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I);
- if (!IsReduction && R->getType() != I->getType())
- return nullptr;
-
if (NodePtr CN = identifySplat(R, I))
return CN;
@@ -1590,20 +1427,12 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
if (It != RootToNode.end()) {
auto RootNode = It->second;
assert(RootNode->Operation ==
- ComplexDeinterleavingOperation::ReductionOperation ||
- RootNode->Operation ==
- ComplexDeinterleavingOperation::ReductionSingle);
+ ComplexDeinterleavingOperation::ReductionOperation);
// Find out which part, Real or Imag, comes later, and only if we come to
// the latest part, add it to OrderedRoots.
auto *R = cast<Instruction>(RootNode->Real);
- auto *I = RootNode->Imag ? cast<Instruction>(RootNode->Imag) : nullptr;
-
- Instruction *ReplacementAnchor;
- if (I)
- ReplacementAnchor = R->comesBefore(I) ? I : R;
- else
- ReplacementAnchor = R;
-
+ auto *I = cast<Instruction>(RootNode->Imag);
+ auto *ReplacementAnchor = R->comesBefore(I) ? I : R;
if (ReplacementAnchor != RootI)
return false;
OrderedRoots.push_back(RootI);
@@ -1694,6 +1523,7 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
for (size_t j = i + 1; j < OperationInstruction.size(); ++j) {
if (Processed[j])
continue;
+
auto *Real = OperationInstruction[i];
auto *Imag = OperationInstruction[j];
if (Real->getType() != Imag->getType())
@@ -1726,28 +1556,6 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
break;
}
}
-
- auto *Real = OperationInstruction[i];
- // We want to check that we have 2 operands, but the function attributes
- // being counted as operands bloats this value.
- if (Real->getNumOperands() < 2)
- continue;
-
- RealPHI = ReductionInfo[Real].first;
- ImagPHI = nullptr;
- PHIsFound = false;
- auto Node = identifyNode(Real->getOperand(0), Real->getOperand(1));
- if (Node && PHIsFound) {
- LLVM_DEBUG(
- dbgs() << "Identified single reduction starting from instruction: "
- << *Real << "/" << *ReductionInfo[Real].second << "\n");
- Processed[i] = true;
- auto RootNode = prepareCompositeNode(
- ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr);
- RootNode->addOperand(Node);
- RootToNode[Real] = RootNode;
- submitCompositeNode(RootNode);
- }
}
RealPHI = nullptr;
@@ -1755,12 +1563,6 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
}
bool ComplexDeinterleavingGraph::checkNodes() {
-
- for (NodePtr N : CompositeNodes) {
- if (!N->areOperandsValid())
- return false;
- }
-
// Collect all instructions from roots to leaves
SmallPtrSet<Instruction *, 16> AllInstructions;
SmallVector<Instruction *, 8> Worklist;
@@ -2029,7 +1831,7 @@ ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) {
ComplexDeinterleavingGraph::NodePtr
ComplexDeinterleavingGraph::identifyPHINode(Instruction *Real,
Instruction *Imag) {
- if (Real != RealPHI || (ImagPHI && Imag != ImagPHI))
+ if (Real != RealPHI || Imag != ImagPHI)
return nullptr;
PHIsFound = true;
@@ -2124,16 +1926,6 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
Value *ReplacementNode;
switch (Node->Operation) {
- case ComplexDeinterleavingOperation::CDot: {
- Value *Input0 = ReplaceOperandIfExist(Node, 0);
- Value *Input1 = ReplaceOperandIfExist(Node, 1);
- Value *Accumulator = ReplaceOperandIfExist(Node, 2);
- assert(!Input1 || (Input0->getType() == Input1->getType() &&
- "Node inputs need to be of the same type"));
- ReplacementNode = TL->createComplexDeinterleavingIR(
- Builder, Node->Operation, Node->Rotation, Input0, Input1, Accumulator);
- break;
- }
case ComplexDeinterleavingOperation::CAdd:
case ComplexDeinterleavingOperation::CMulPartial:
case ComplexDeinterleavingOperation::Symmetric: {
@@ -2177,18 +1969,13 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
case ComplexDeinterleavingOperation::ReductionPHI: {
// If Operation is ReductionPHI, a new empty PHINode is created.
// It is filled later when the ReductionOperation is processed.
- auto *OldPHI = cast<PHINode>(Node->Real);
auto *VTy = cast<VectorType>(Node->Real->getType());
auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHIIt());
- OldToNewPHI[OldPHI] = NewPHI;
+ OldToNewPHI[dyn_cast<PHINode>(Node->Real)] = NewPHI;
ReplacementNode = NewPHI;
break;
}
- case ComplexDeinterleavingOperation::ReductionSingle:
- ReplacementNode = replaceNode(Builder, Node->Operands[0]);
- processReductionSingle(ReplacementNode, Node);
- break;
case ComplexDeinterleavingOperation::ReductionOperation:
ReplacementNode = replaceNode(Builder, Node->Operands[0]);
processReductionOperation(ReplacementNode, Node);
@@ -2213,38 +2000,6 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
return ReplacementNode;
}
-void ComplexDeinterleavingGraph::processReductionSingle(
- Value *OperationReplacement, RawNodePtr Node) {
- auto *Real = cast<Instruction>(Node->Real);
- auto *OldPHI = ReductionInfo[Real].first;
- auto *NewPHI = OldToNewPHI[OldPHI];
- auto *VTy = cast<VectorType>(Real->getType());
- auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
-
- Value *Init = OldPHI->getIncomingValueForBlock(Incoming);
-
- IRBuilder<> Builder(Incoming->getTerminator());
-
- Value *NewInit = nullptr;
- if (auto *C = dyn_cast<Constant>(Init)) {
- if (C->isZeroValue())
- NewInit = Constant::getNullValue(NewVTy);
- }
-
- if (!NewInit)
- NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
- {Init, Constant::getNullValue(VTy)});
-
- NewPHI->addIncoming(NewInit, Incoming);
- NewPHI->addIncoming(OperationReplacement, BackEdge);
-
- auto *FinalReduction = ReductionInfo[Real].second;
- Builder.SetInsertPoint(&*FinalReduction->getParent()->getFirstInsertionPt());
-
- auto *AddReduce = Builder.CreateAddReduce(OperationReplacement);
- FinalReduction->replaceAllUsesWith(AddReduce);
-}
-
void ComplexDeinterleavingGraph::processReductionOperation(
Value *OperationReplacement, RawNodePtr Node) {
auto *Real = cast<Instruction>(Node->Real);
@@ -2304,13 +2059,8 @@ void ComplexDeinterleavingGraph::replaceNodes() {
auto *RootImag = cast<Instruction>(RootNode->Imag);
ReductionInfo[RootReal].first->removeIncomingValue(BackEdge);
ReductionInfo[RootImag].first->removeIncomingValue(BackEdge);
- DeadInstrRoots.push_back(RootReal);
- DeadInstrRoots.push_back(RootImag);
- } else if (RootNode->Operation ==
- ComplexDeinterleavingOperation::ReductionSingle) {
- auto *RootInst = cast<Instruction>(RootNode->Real);
- ReductionInfo[RootInst].first->removeIncomingValue(BackEdge);
- DeadInstrRoots.push_back(ReductionInfo[RootInst].second);
+ DeadInstrRoots.push_back(cast<Instruction>(RootReal));
+ DeadInstrRoots.push_back(cast<Instruction>(RootImag));
} else {
assert(R && "Unable to find replacement for RootInstruction");
DeadInstrRoots.push_back(RootInstruction);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d45c3cddd64de4..cb6ba06bd4425c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29542,16 +29542,9 @@ bool AArch64TargetLowering::isComplexDeinterleavingOperationSupported(
if (ScalarTy->isIntegerTy() && Subtarget->hasSVE2() && VTy->isScalableTy()) {
unsigned ScalarWidth = ScalarTy->getScalarSizeInBits();
-
- if (Operation == ComplexDeinterleavingOperation::CDot)
- return ScalarWidth == 32 || ScalarWidth == 64;
return 8 <= ScalarWidth && ScalarWidth <= 64;
}
- // CDot is not supported outside of scalable/sve scopes
- if (Operation == ComplexDeinterleavingOperation::CDot)
- return false;
-
return (ScalarTy->isHalfTy() && Subtarget->hasFullFP16()) ||
ScalarTy->isFloatTy() || ScalarTy->isDoubleTy();
}
@@ -29561,8 +29554,6 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
Value *Accumulator) const {
VectorType *Ty = cast<VectorType>(InputA->getType());
- if (Accumulator == nullptr)
- Accumulator = Constant::getNullValue(Ty);
bool IsScalable = Ty->isScalableTy();
bool IsInt = Ty->getElementType()->isIntegerTy();
@@ -29574,10 +29565,6 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
if (TyWidth > 128) {
int Stride = Ty->getElementCount().getKnownMinValue() / 2;
- int AccStride = cast<VectorType>(Accumulator->getType())
- ->getElementCount()
- .getKnownMinValue() /
- 2;
auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
@@ -29587,26 +29574,25 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
Value *LowerSplitAcc = nullptr;
Value *UpperSplitAcc = nullptr;
- Type *FullTy = Ty;
- FullTy = Accumulator->getType();
- auto *HalfAccTy = VectorType::getHalfElementsVectorType(
- cast<VectorType>(Accumulator->getType()));
- LowerSplitAcc =
- B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0));
- UpperSplitAcc =
- B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride));
+ if (Accumulator) {
+ LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
+ UpperSplitAcc =
+ B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
+ }
auto *LowerSplitInt = createComplexDeinterleavingIR(
B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
auto *UpperSplitInt = createComplexDeinterleavingIR(
B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
- auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
- LowerSplitInt, B.getInt64(0));
- return B.CreateInsertVector(FullTy, Result, UpperSplitInt,
- B.getInt64(AccStride));
+ auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
+ B.getInt64(0));
+ return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
}
if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
+ if (Accumulator == nullptr)
+ Accumulator = Constant::getNullValue(Ty);
+
if (IsScalable) {
if (IsInt)
return B.CreateIntrinsic(
@@ -29658,13 +29644,6 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
return B.CreateIntrinsic(IntId, Ty, {InputA, InputB});
}
- if (OperationType == ComplexDeinterleavingOperation::CDot && IsInt &&
- IsScalable) {
- return B.CreateIntrinsic(
- Intrinsic::aarch64_sve_cdot, Accumulator->getType(),
- {Accumulator, InputA, InputB, B.getInt32((int)Rotation * 90)});
- }
-
return nullptr;
}
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
deleted file mode 100644
index 11cf4c31936d8f..00000000000000
--- a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
+++ /dev/null
@@ -1,1136 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s --check-prefix=CHECK-SVE2
-; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve -o - | FileCheck %s --check-prefix=CHECK-SVE
-; RUN: opt -S --passes=complex-deinterleaving %s -o - | FileCheck %s --check-prefix=CHECK-NOSVE
-
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-none-unknown-elf"
-
-define i32 @cdotp_i8_rot0(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
-; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot0(
-; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE2: [[VECTOR_BODY]]:
-; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
-; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
-; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
-; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 0)
-; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 0)
-; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
-; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE2: [[MIDDLE_BLOCK]]:
-; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
-; CHECK-SVE2-NEXT: ret i32 [[TMP0]]
-;
-; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot0(
-; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-SVE-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE: [[VECTOR_BODY]]:
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
-; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
-; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
-; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
-; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
-; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE: [[MIDDLE_BLOCK]]:
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
-; CHECK-SVE-NEXT: ret i32 [[TMP11]]
-;
-; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot0(
-; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
-; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
-; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-NOSVE: [[VECTOR_BODY]]:
-; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
-; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
-; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
-; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
-; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
-; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
-; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
-; CHECK-NOSVE-NEXT: ret i32 [[TMP0]]
-;
-entry:
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
- %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
- %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
- %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
- %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
- %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
- %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
- %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
- %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
- %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
- %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
- %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
- %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul)
- %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
- %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
- %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
- br i1 true, label %middle.block, label %vector.body
-
-middle.block: ; preds = %vector.body
- %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
- ret i32 %0
-}
-
-define i32 @cdotp_i8_rot90(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
-; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot90(
-; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE2: [[VECTOR_BODY]]:
-; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
-; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
-; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
-; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 90)
-; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 90)
-; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
-; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE2: [[MIDDLE_BLOCK]]:
-; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
-; CHECK-SVE2-NEXT: ret i32 [[TMP0]]
-;
-; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot90(
-; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE: [[VECTOR_BODY]]:
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
-; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
-; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
-; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
-; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
-; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
-; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE: [[MIDDLE_BLOCK]]:
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
-; CHECK-SVE-NEXT: ret i32 [[TMP11]]
-;
-; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot90(
-; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
-; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
-; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-NOSVE: [[VECTOR_BODY]]:
-; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
-; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
-; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
-; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
-; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
-; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
-; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
-; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
-; CHECK-NOSVE-NEXT: ret i32 [[TMP0]]
-;
-entry:
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
- %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
- %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
- %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
- %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
- %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
- %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
- %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
- %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
- %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
- %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
- %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.imag.ext
- %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul)
- %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.real.ext
- %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul)
- br i1 true, label %middle.block, label %vector.body
-
-middle.block: ; preds = %vector.body
- %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
- ret i32 %0
-}
-
-define i32 @cdotp_i8_rot180(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
-; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot180(
-; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE2: [[VECTOR_BODY]]:
-; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
-; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
-; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
-; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 180)
-; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 180)
-; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
-; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE2: [[MIDDLE_BLOCK]]:
-; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
-; CHECK-SVE2-NEXT: ret i32 [[TMP0]]
-;
-; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot180(
-; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE: [[VECTOR_BODY]]:
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
-; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
-; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
-; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
-; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE: [[MIDDLE_BLOCK]]:
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
-; CHECK-SVE-NEXT: ret i32 [[TMP11]]
-;
-; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot180(
-; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
-; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
-; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-NOSVE: [[VECTOR_BODY]]:
-; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
-; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
-; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
-; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
-; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
-; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
-; CHECK-NOSVE-NEXT: ret i32 [[TMP0]]
-;
-entry:
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
- %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
- %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
- %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
- %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
- %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
- %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
- %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
- %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
- %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
- %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
- %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
- %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul)
- %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
- %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul)
- br i1 true, label %middle.block, label %vector.body
-
-middle.block: ; preds = %vector.body
- %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
- ret i32 %0
-}
-
-define i32 @cdotp_i8_rot270(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
-; CHECK-SVE2-LABEL: define i32 @cdotp_i8_rot270(
-; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE2: [[VECTOR_BODY]]:
-; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A]], i64 16)
-; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B]], i64 16)
-; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
-; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP5]], <vscale x 16 x i8> [[TMP1]], <vscale x 16 x i8> [[TMP2]], i32 270)
-; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP6]], <vscale x 16 x i8> [[TMP3]], <vscale x 16 x i8> [[TMP4]], i32 270)
-; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP7]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP9]], <vscale x 4 x i32> [[TMP8]], i64 4)
-; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE2: [[MIDDLE_BLOCK]]:
-; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP10]])
-; CHECK-SVE2-NEXT: ret i32 [[TMP0]]
-;
-; CHECK-SVE-LABEL: define i32 @cdotp_i8_rot270(
-; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE: [[VECTOR_BODY]]:
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
-; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
-; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
-; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
-; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
-; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
-; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
-; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE: [[MIDDLE_BLOCK]]:
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
-; CHECK-SVE-NEXT: ret i32 [[TMP11]]
-;
-; CHECK-NOSVE-LABEL: define i32 @cdotp_i8_rot270(
-; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
-; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
-; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-NOSVE: [[VECTOR_BODY]]:
-; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
-; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
-; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_IMAG_EXT]]
-; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
-; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
-; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_REAL_EXT]]
-; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL]])
-; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
-; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
-; CHECK-NOSVE-NEXT: ret i32 [[TMP0]]
-;
-entry:
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
- %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
- %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
- %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
- %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
- %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
- %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
- %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
- %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
- %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
- %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
- %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.imag.ext
- %real.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %real.mul
- %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul.neg)
- %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.real.ext
- %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul)
- br i1 true, label %middle.block, label %vector.body
-
-middle.block: ; preds = %vector.body
- %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
- ret i32 %0
-}
-
-define i64 @cdotp_i16_rot0(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
-; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot0(
-; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE2: [[VECTOR_BODY]]:
-; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
-; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
-; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
-; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 0)
-; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 0)
-; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
-; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE2: [[MIDDLE_BLOCK]]:
-; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
-; CHECK-SVE2-NEXT: ret i64 [[TMP0]]
-;
-; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot0(
-; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE: [[VECTOR_BODY]]:
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
-; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
-; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
-; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[IMAG_MUL]]
-; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL_NEG]])
-; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE: [[MIDDLE_BLOCK]]:
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
-; CHECK-SVE-NEXT: ret i64 [[TMP11]]
-;
-; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot0(
-; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
-; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
-; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-NOSVE: [[VECTOR_BODY]]:
-; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
-; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
-; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
-; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[IMAG_MUL]]
-; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL_NEG]])
-; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
-; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
-; CHECK-NOSVE-NEXT: ret i64 [[TMP0]]
-;
-entry:
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
- %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
- %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
- %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
- %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
- %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
- %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
- %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
- %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
- %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
- %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
- %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.real.ext
- %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul)
- %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.imag.ext
- %imag.mul.neg = sub <vscale x 8 x i64> zeroinitializer, %imag.mul
- %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul.neg)
- br i1 true, label %middle.block, label %vector.body
-
-middle.block: ; preds = %vector.body
- %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
- ret i64 %0
-}
-
-define i64 @cdotp_i16_rot90(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
-; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot90(
-; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE2: [[VECTOR_BODY]]:
-; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
-; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
-; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
-; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 90)
-; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 90)
-; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
-; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE2: [[MIDDLE_BLOCK]]:
-; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
-; CHECK-SVE2-NEXT: ret i64 [[TMP0]]
-;
-; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot90(
-; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE: [[VECTOR_BODY]]:
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
-; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
-; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
-; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
-; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
-; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
-; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE: [[MIDDLE_BLOCK]]:
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
-; CHECK-SVE-NEXT: ret i64 [[TMP11]]
-;
-; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot90(
-; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
-; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
-; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-NOSVE: [[VECTOR_BODY]]:
-; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
-; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
-; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
-; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
-; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
-; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
-; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
-; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
-; CHECK-NOSVE-NEXT: ret i64 [[TMP0]]
-;
-entry:
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
- %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
- %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
- %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
- %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
- %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
- %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
- %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
- %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
- %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
- %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
- %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.imag.ext
- %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul)
- %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.real.ext
- %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul)
- br i1 true, label %middle.block, label %vector.body
-
-middle.block: ; preds = %vector.body
- %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
- ret i64 %0
-}
-
-define i64 @cdotp_i16_rot180(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
-; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot180(
-; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE2: [[VECTOR_BODY]]:
-; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
-; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
-; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
-; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 180)
-; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 180)
-; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
-; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE2: [[MIDDLE_BLOCK]]:
-; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
-; CHECK-SVE2-NEXT: ret i64 [[TMP0]]
-;
-; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot180(
-; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE: [[VECTOR_BODY]]:
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
-; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
-; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
-; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
-; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE: [[MIDDLE_BLOCK]]:
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
-; CHECK-SVE-NEXT: ret i64 [[TMP11]]
-;
-; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot180(
-; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
-; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
-; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-NOSVE: [[VECTOR_BODY]]:
-; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
-; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
-; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL]])
-; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
-; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
-; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
-; CHECK-NOSVE-NEXT: ret i64 [[TMP0]]
-;
-entry:
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
- %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
- %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
- %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
- %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
- %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
- %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
- %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
- %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
- %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
- %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
- %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.real.ext
- %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul)
- %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.imag.ext
- %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul)
- br i1 true, label %middle.block, label %vector.body
-
-middle.block: ; preds = %vector.body
- %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
- ret i64 %0
-}
-
-define i64 @cdotp_i16_rot270(<vscale x 16 x i16> %a, <vscale x 16 x i16> %b) {
-; CHECK-SVE2-LABEL: define i64 @cdotp_i16_rot270(
-; CHECK-SVE2-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE2: [[VECTOR_BODY]]:
-; CHECK-SVE2-NEXT: [[TMP11:%.*]] = phi <vscale x 4 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP10:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE2-NEXT: [[TMP1:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP2:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP3:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[A]], i64 8)
-; CHECK-SVE2-NEXT: [[TMP4:%.*]] = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> [[B]], i64 8)
-; CHECK-SVE2-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.vector.extract.nxv2i64.nxv4i64(<vscale x 4 x i64> [[TMP11]], i64 2)
-; CHECK-SVE2-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP5]], <vscale x 8 x i16> [[TMP1]], <vscale x 8 x i16> [[TMP2]], i32 270)
-; CHECK-SVE2-NEXT: [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.cdot.nxv2i64(<vscale x 2 x i64> [[TMP6]], <vscale x 8 x i16> [[TMP3]], <vscale x 8 x i16> [[TMP4]], i32 270)
-; CHECK-SVE2-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> poison, <vscale x 2 x i64> [[TMP7]], i64 0)
-; CHECK-SVE2-NEXT: [[TMP10]] = call <vscale x 4 x i64> @llvm.vector.insert.nxv4i64.nxv2i64(<vscale x 4 x i64> [[TMP9]], <vscale x 2 x i64> [[TMP8]], i64 2)
-; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE2: [[MIDDLE_BLOCK]]:
-; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> [[TMP10]])
-; CHECK-SVE2-NEXT: ret i64 [[TMP0]]
-;
-; CHECK-SVE-LABEL: define i64 @cdotp_i16_rot270(
-; CHECK-SVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE: [[VECTOR_BODY]]:
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
-; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
-; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
-; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
-; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[REAL_MUL]]
-; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL_NEG]])
-; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
-; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
-; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE: [[MIDDLE_BLOCK]]:
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
-; CHECK-SVE-NEXT: ret i64 [[TMP11]]
-;
-; CHECK-NOSVE-LABEL: define i64 @cdotp_i16_rot270(
-; CHECK-NOSVE-SAME: <vscale x 16 x i16> [[A:%.*]], <vscale x 16 x i16> [[B:%.*]]) {
-; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
-; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-NOSVE: [[VECTOR_BODY]]:
-; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[A]])
-; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> [[B]])
-; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[A_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[B_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[A_REAL]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[A_IMAG]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 8 x i16> [[B_REAL]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 8 x i16> [[B_IMAG]] to <vscale x 8 x i64>
-; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 8 x i64> [[B_REAL_EXT]], [[A_IMAG_EXT]]
-; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 8 x i64> zeroinitializer, [[REAL_MUL]]
-; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[REAL_MUL_NEG]])
-; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 8 x i64> [[B_IMAG_EXT]], [[A_REAL_EXT]]
-; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[REAL_MUL_REDUCED]], <vscale x 8 x i64> [[IMAG_MUL]])
-; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
-; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE_SUB]])
-; CHECK-NOSVE-NEXT: ret i64 [[TMP0]]
-;
-entry:
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %vec.phi = phi <vscale x 2 x i64> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
- %a.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %a)
- %b.deinterleaved = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.vector.deinterleave2.v16i16(<vscale x 16 x i16> %b)
- %a.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 0
- %a.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %a.deinterleaved, 1
- %b.real = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 0
- %b.imag = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %b.deinterleaved, 1
- %a.real.ext = sext <vscale x 8 x i16> %a.real to <vscale x 8 x i64>
- %a.imag.ext = sext <vscale x 8 x i16> %a.imag to <vscale x 8 x i64>
- %b.real.ext = sext <vscale x 8 x i16> %b.real to <vscale x 8 x i64>
- %b.imag.ext = sext <vscale x 8 x i16> %b.imag to <vscale x 8 x i64>
- %real.mul = mul <vscale x 8 x i64> %b.real.ext, %a.imag.ext
- %real.mul.neg = sub <vscale x 8 x i64> zeroinitializer, %real.mul
- %real.mul.reduced = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %vec.phi, <vscale x 8 x i64> %real.mul.neg)
- %imag.mul = mul <vscale x 8 x i64> %b.imag.ext, %a.real.ext
- %partial.reduce.sub = call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %real.mul.reduced, <vscale x 8 x i64> %imag.mul)
- br i1 true, label %middle.block, label %vector.body
-
-middle.block: ; preds = %vector.body
- %0 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> %partial.reduce.sub)
- ret i64 %0
-}
-
-
-define i32 @not_cdotp(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
-; CHECK-SVE2-LABEL: define i32 @not_cdotp(
-; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE2: [[VECTOR_BODY]]:
-; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
-; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
-; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-SVE2-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
-; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
-; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
-; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
-; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE2: [[MIDDLE_BLOCK]]:
-; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
-; CHECK-SVE2-NEXT: ret i32 [[TMP0]]
-;
-; CHECK-SVE-LABEL: define i32 @not_cdotp(
-; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE: [[VECTOR_BODY]]:
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
-; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
-; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-SVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
-; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
-; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
-; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
-; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE: [[MIDDLE_BLOCK]]:
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
-; CHECK-SVE-NEXT: ret i32 [[TMP0]]
-;
-; CHECK-NOSVE-LABEL: define i32 @not_cdotp(
-; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
-; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
-; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-NOSVE: [[VECTOR_BODY]]:
-; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
-; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
-; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-NOSVE-NEXT: [[REAL_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[REAL_MUL]]
-; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL_NEG]])
-; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
-; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
-; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
-; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE_SUB]])
-; CHECK-NOSVE-NEXT: ret i32 [[TMP0]]
-;
-entry:
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
- %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
- %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
- %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
- %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
- %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
- %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
- %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
- %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
- %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
- %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
- %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
- %real.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %real.mul
- %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul.neg)
- %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
- %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
- %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
- br i1 true, label %middle.block, label %vector.body
-
-middle.block: ; preds = %vector.body
- %0 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
- ret i32 %0
-}
-
-define i16 @invalid_type(<vscale x 32 x i8> %a, <vscale x 32 x i8> %b) {
-; CHECK-SVE2-LABEL: define i16 @invalid_type(
-; CHECK-SVE2-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE2: [[VECTOR_BODY]]:
-; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
-; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
-; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
-; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
-; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
-; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE2: [[MIDDLE_BLOCK]]:
-; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
-; CHECK-SVE2-NEXT: ret i16 [[TMP0]]
-;
-; CHECK-SVE-LABEL: define i16 @invalid_type(
-; CHECK-SVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE: [[VECTOR_BODY]]:
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
-; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
-; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
-; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
-; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
-; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
-; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE: [[MIDDLE_BLOCK]]:
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
-; CHECK-SVE-NEXT: ret i16 [[TMP0]]
-;
-; CHECK-NOSVE-LABEL: define i16 @invalid_type(
-; CHECK-NOSVE-SAME: <vscale x 32 x i8> [[A:%.*]], <vscale x 32 x i8> [[B:%.*]]) {
-; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
-; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-NOSVE: [[VECTOR_BODY]]:
-; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i16> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[A]])
-; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[B]])
-; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[A_REAL]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[A_IMAG]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <vscale x 16 x i8> [[B_REAL]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <vscale x 16 x i8> [[B_IMAG]] to <vscale x 16 x i32>
-; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <vscale x 16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[VEC_PHI]], <vscale x 16 x i32> [[REAL_MUL]])
-; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <vscale x 16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[IMAG_MUL]]
-; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> [[REAL_MUL_REDUCED]], <vscale x 16 x i32> [[IMAG_MUL_NEG]])
-; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
-; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> [[PARTIAL_REDUCE_SUB]])
-; CHECK-NOSVE-NEXT: ret i16 [[TMP0]]
-;
-entry:
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %vec.phi = phi <vscale x 8 x i16> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
- %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %a)
- %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.v32i8(<vscale x 32 x i8> %b)
- %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
- %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
- %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
- %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
- %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
- %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
- %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
- %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
- %real.mul = mul <vscale x 16 x i32> %b.real.ext, %a.real.ext
- %real.mul.reduced = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> %vec.phi, <vscale x 16 x i32> %real.mul)
- %imag.mul = mul <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
- %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
- %partial.reduce.sub = call <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
- br i1 true, label %middle.block, label %vector.body
-
-middle.block: ; preds = %vector.body
- %0 = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> %partial.reduce.sub)
- ret i16 %0
-}
-
-define i32 @not_cdotp_i8_rot0_fixed_length(<32 x i8> %a, <32 x i8> %b) {
-; CHECK-SVE2-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
-; CHECK-SVE2-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE2-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE2-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE2: [[VECTOR_BODY]]:
-; CHECK-SVE2-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE2-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
-; CHECK-SVE2-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
-; CHECK-SVE2-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-SVE2-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-SVE2-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-SVE2-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-SVE2-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
-; CHECK-SVE2-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
-; CHECK-SVE2-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
-; CHECK-SVE2-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
-; CHECK-SVE2-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-SVE2-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
-; CHECK-SVE2-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-SVE2-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
-; CHECK-SVE2-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
-; CHECK-SVE2-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE2: [[MIDDLE_BLOCK]]:
-; CHECK-SVE2-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
-; CHECK-SVE2-NEXT: ret i32 [[TMP0]]
-;
-; CHECK-SVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
-; CHECK-SVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) #[[ATTR0]] {
-; CHECK-SVE-NEXT: [[ENTRY:.*]]:
-; CHECK-SVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-SVE: [[VECTOR_BODY]]:
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
-; CHECK-SVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
-; CHECK-SVE-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-SVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-SVE-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
-; CHECK-SVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
-; CHECK-SVE-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
-; CHECK-SVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
-; CHECK-SVE-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-SVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
-; CHECK-SVE-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-SVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
-; CHECK-SVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
-; CHECK-SVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-SVE: [[MIDDLE_BLOCK]]:
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
-; CHECK-SVE-NEXT: ret i32 [[TMP0]]
-;
-; CHECK-NOSVE-LABEL: define i32 @not_cdotp_i8_rot0_fixed_length(
-; CHECK-NOSVE-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]]) {
-; CHECK-NOSVE-NEXT: [[ENTRY:.*]]:
-; CHECK-NOSVE-NEXT: br label %[[VECTOR_BODY:.*]]
-; CHECK-NOSVE: [[VECTOR_BODY]]:
-; CHECK-NOSVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, %[[ENTRY]] ], [ [[PARTIAL_REDUCE_SUB:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NOSVE-NEXT: [[A_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[A]])
-; CHECK-NOSVE-NEXT: [[B_DEINTERLEAVED:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> [[B]])
-; CHECK-NOSVE-NEXT: [[A_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[A_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[A_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[B_REAL:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 0
-; CHECK-NOSVE-NEXT: [[B_IMAG:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[B_DEINTERLEAVED]], 1
-; CHECK-NOSVE-NEXT: [[A_REAL_EXT:%.*]] = sext <16 x i8> [[A_REAL]] to <16 x i32>
-; CHECK-NOSVE-NEXT: [[A_IMAG_EXT:%.*]] = sext <16 x i8> [[A_IMAG]] to <16 x i32>
-; CHECK-NOSVE-NEXT: [[B_REAL_EXT:%.*]] = sext <16 x i8> [[B_REAL]] to <16 x i32>
-; CHECK-NOSVE-NEXT: [[B_IMAG_EXT:%.*]] = sext <16 x i8> [[B_IMAG]] to <16 x i32>
-; CHECK-NOSVE-NEXT: [[REAL_MUL:%.*]] = mul <16 x i32> [[B_REAL_EXT]], [[A_REAL_EXT]]
-; CHECK-NOSVE-NEXT: [[REAL_MUL_REDUCED:%.*]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[REAL_MUL]])
-; CHECK-NOSVE-NEXT: [[IMAG_MUL:%.*]] = mul <16 x i32> [[B_IMAG_EXT]], [[A_IMAG_EXT]]
-; CHECK-NOSVE-NEXT: [[IMAG_MUL_NEG:%.*]] = sub <16 x i32> zeroinitializer, [[IMAG_MUL]]
-; CHECK-NOSVE-NEXT: [[PARTIAL_REDUCE_SUB]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[REAL_MUL_REDUCED]], <16 x i32> [[IMAG_MUL_NEG]])
-; CHECK-NOSVE-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
-; CHECK-NOSVE: [[MIDDLE_BLOCK]]:
-; CHECK-NOSVE-NEXT: [[TMP0:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE_SUB]])
-; CHECK-NOSVE-NEXT: ret i32 [[TMP0]]
-;
-entry:
- br label %vector.body
-
-vector.body: ; preds = %vector.body, %entry
- %vec.phi = phi <4 x i32> [ zeroinitializer, %entry ], [ %partial.reduce.sub, %vector.body ]
- %a.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %a)
- %b.deinterleaved = call { <16 x i8>, <16 x i8> } @llvm.vector.deinterleave2.v32i8(<32 x i8> %b)
- %a.real = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 0
- %a.imag = extractvalue { <16 x i8>, <16 x i8> } %a.deinterleaved, 1
- %b.real = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 0
- %b.imag = extractvalue { <16 x i8>, <16 x i8> } %b.deinterleaved, 1
- %a.real.ext = sext <16 x i8> %a.real to <16 x i32>
- %a.imag.ext = sext <16 x i8> %a.imag to <16 x i32>
- %b.real.ext = sext <16 x i8> %b.real to <16 x i32>
- %b.imag.ext = sext <16 x i8> %b.imag to <16 x i32>
- %real.mul = mul <16 x i32> %b.real.ext, %a.real.ext
- %real.mul.reduced = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %vec.phi, <16 x i32> %real.mul)
- %imag.mul = mul <16 x i32> %b.imag.ext, %a.imag.ext
- %imag.mul.neg = sub <16 x i32> zeroinitializer, %imag.mul
- %partial.reduce.sub = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %real.mul.reduced, <16 x i32> %imag.mul.neg)
- br i1 true, label %middle.block, label %vector.body
-
-middle.block: ; preds = %vector.body
- %0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %partial.reduce.sub)
- ret i32 %0
-}
-
-declare <vscale x 8 x i16> @llvm.experimental.vector.partial.reduce.add.nxv8i16.nxv16i32(<vscale x 8 x i16>, <vscale x 16 x i32>)
-declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32>, <vscale x 16 x i32>)
-declare <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i32(<vscale x 2 x i64>, <vscale x 16 x i32>)
-
-declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32>, <16 x i32>)
-declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
-
-declare i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32>)
-declare i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64>)
More information about the llvm-commits
mailing list