[llvm] Add support for single reductions in ComplexDeinterleavingPass (PR #112875)
Nicholas Guy via llvm-commits
llvm-commits at lists.llvm.org
Tue Oct 22 06:38:19 PDT 2024
https://github.com/NickGuy-Arm updated https://github.com/llvm/llvm-project/pull/112875
>From d77a3e402d05290903f75ce2dc7478f14a3943ef Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Tue, 8 Oct 2024 14:53:36 +0100
Subject: [PATCH 1/3] Add support for single reductions in
ComplexDeinterleavingPass
---
.../llvm/CodeGen/ComplexDeinterleavingPass.h | 1 +
.../lib/CodeGen/ComplexDeinterleavingPass.cpp | 121 +++++++++++--
.../Target/AArch64/AArch64ISelLowering.cpp | 19 +-
.../AArch64/complex-deinterleaving-cdot.ll | 170 ++++++++++++++++++
4 files changed, 288 insertions(+), 23 deletions(-)
create mode 100644 llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
diff --git a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
index 84a2673fecb5bf..a3fa2197727701 100644
--- a/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
+++ b/llvm/include/llvm/CodeGen/ComplexDeinterleavingPass.h
@@ -43,6 +43,7 @@ enum class ComplexDeinterleavingOperation {
ReductionPHI,
ReductionOperation,
ReductionSelect,
+ ReductionSingle
};
enum class ComplexDeinterleavingRotation {
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 8573b016d1e5bb..08287a4d5ed022 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -145,6 +145,7 @@ struct ComplexDeinterleavingCompositeNode {
friend class ComplexDeinterleavingGraph;
using NodePtr = std::shared_ptr<ComplexDeinterleavingCompositeNode>;
using RawNodePtr = ComplexDeinterleavingCompositeNode *;
+ bool OperandsValid = true;
public:
ComplexDeinterleavingOperation Operation;
@@ -161,7 +162,11 @@ struct ComplexDeinterleavingCompositeNode {
SmallVector<RawNodePtr> Operands;
Value *ReplacementNode = nullptr;
- void addOperand(NodePtr Node) { Operands.push_back(Node.get()); }
+ void addOperand(NodePtr Node) {
+ if (!Node || !Node.get())
+ OperandsValid = false;
+ Operands.push_back(Node.get());
+ }
void dump() { dump(dbgs()); }
void dump(raw_ostream &OS) {
@@ -195,6 +200,10 @@ struct ComplexDeinterleavingCompositeNode {
PrintNodeRef(Op);
}
}
+
+ bool AreOperandsValid() {
+ return OperandsValid;
+ }
};
class ComplexDeinterleavingGraph {
@@ -294,7 +303,7 @@ class ComplexDeinterleavingGraph {
NodePtr submitCompositeNode(NodePtr Node) {
CompositeNodes.push_back(Node);
- if (Node->Real && Node->Imag)
+ if (Node->Real)
CachedResult[{Node->Real, Node->Imag}] = Node;
return Node;
}
@@ -328,8 +337,10 @@ class ComplexDeinterleavingGraph {
/// i: ai - br
NodePtr identifyAdd(Instruction *Real, Instruction *Imag);
NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag);
+ NodePtr identifyPartialReduction(Value *R, Value *I);
NodePtr identifyNode(Value *R, Value *I);
+ NodePtr identifyNode(Value *R, Value *I, bool &FromCache);
/// Determine if a sum of complex numbers can be formed from \p RealAddends
/// and \p ImagAddens. If \p Accumulator is not null, add the result to it.
@@ -397,6 +408,7 @@ class ComplexDeinterleavingGraph {
/// * Deinterleave the final value outside of the loop and repurpose original
/// reduction users
void processReductionOperation(Value *OperationReplacement, RawNodePtr Node);
+ void processReductionSingle(Value *OperationReplacement, RawNodePtr Node);
public:
void dump() { dump(dbgs()); }
@@ -893,16 +905,26 @@ ComplexDeinterleavingGraph::identifySymmetricOperation(Instruction *Real,
ComplexDeinterleavingGraph::NodePtr
ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I) {
- LLVM_DEBUG(dbgs() << "identifyNode on " << *R << " / " << *I << "\n");
- assert(R->getType() == I->getType() &&
- "Real and imaginary parts should not have different types");
+ bool _;
+ return identifyNode(R, I, _);
+}
+ComplexDeinterleavingGraph::NodePtr
+ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) {
auto It = CachedResult.find({R, I});
if (It != CachedResult.end()) {
LLVM_DEBUG(dbgs() << " - Folding to existing node\n");
+ FromCache = true;
return It->second;
}
+ if(NodePtr CN = identifyPartialReduction(R, I))
+ return CN;
+
+ bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I);
+ if(!IsReduction && R->getType() != I->getType())
+ return nullptr;
+
if (NodePtr CN = identifySplat(R, I))
return CN;
@@ -1428,12 +1450,18 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
if (It != RootToNode.end()) {
auto RootNode = It->second;
assert(RootNode->Operation ==
- ComplexDeinterleavingOperation::ReductionOperation);
+ ComplexDeinterleavingOperation::ReductionOperation || RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle);
// Find out which part, Real or Imag, comes later, and only if we come to
// the latest part, add it to OrderedRoots.
auto *R = cast<Instruction>(RootNode->Real);
- auto *I = cast<Instruction>(RootNode->Imag);
- auto *ReplacementAnchor = R->comesBefore(I) ? I : R;
+ auto *I = RootNode->Imag ? cast<Instruction>(RootNode->Imag) : nullptr;
+
+ Instruction *ReplacementAnchor;
+ if(I)
+ ReplacementAnchor = R->comesBefore(I) ? I : R;
+ else
+ ReplacementAnchor = R;
+
if (ReplacementAnchor != RootI)
return false;
OrderedRoots.push_back(RootI);
@@ -1521,11 +1549,11 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
for (size_t i = 0; i < OperationInstruction.size(); ++i) {
if (Processed[i])
continue;
+ auto *Real = OperationInstruction[i];
for (size_t j = i + 1; j < OperationInstruction.size(); ++j) {
if (Processed[j])
continue;
-
- auto *Real = OperationInstruction[i];
+
auto *Imag = OperationInstruction[j];
if (Real->getType() != Imag->getType())
continue;
@@ -1557,6 +1585,25 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
break;
}
}
+
+ // We want to check that we have 2 operands, but the function attributes
+ // being counted as operands bloats this value.
+ if(Real->getNumOperands() < 2)
+ continue;
+
+ RealPHI = ReductionInfo[Real].first;
+ ImagPHI = nullptr;
+ PHIsFound = false;
+ auto Node = identifyNode(Real->getOperand(0), Real->getOperand(1));
+ if(Node && PHIsFound) {
+ LLVM_DEBUG(dbgs() << "Identified single reduction starting from instruction: "
+ << *Real << "/" << *ReductionInfo[Real].second << "\n");
+ Processed[i] = true;
+ auto RootNode = prepareCompositeNode(ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr);
+ RootNode->addOperand(Node);
+ RootToNode[Real] = RootNode;
+ submitCompositeNode(RootNode);
+ }
}
RealPHI = nullptr;
@@ -1564,6 +1611,12 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
}
bool ComplexDeinterleavingGraph::checkNodes() {
+
+ for (NodePtr N : CompositeNodes) {
+ if (!N->AreOperandsValid())
+ return false;
+ }
+
// Collect all instructions from roots to leaves
SmallPtrSet<Instruction *, 16> AllInstructions;
SmallVector<Instruction *, 8> Worklist;
@@ -1832,7 +1885,7 @@ ComplexDeinterleavingGraph::identifySplat(Value *R, Value *I) {
ComplexDeinterleavingGraph::NodePtr
ComplexDeinterleavingGraph::identifyPHINode(Instruction *Real,
Instruction *Imag) {
- if (Real != RealPHI || Imag != ImagPHI)
+ if (Real != RealPHI || (ImagPHI && Imag != ImagPHI))
return nullptr;
PHIsFound = true;
@@ -1970,13 +2023,18 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
case ComplexDeinterleavingOperation::ReductionPHI: {
// If Operation is ReductionPHI, a new empty PHINode is created.
// It is filled later when the ReductionOperation is processed.
+ auto *OldPHI = cast<PHINode>(Node->Real);
auto *VTy = cast<VectorType>(Node->Real->getType());
auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
auto *NewPHI = PHINode::Create(NewVTy, 0, "", BackEdge->getFirstNonPHIIt());
- OldToNewPHI[dyn_cast<PHINode>(Node->Real)] = NewPHI;
+ OldToNewPHI[OldPHI] = NewPHI;
ReplacementNode = NewPHI;
break;
}
+ case ComplexDeinterleavingOperation::ReductionSingle:
+ ReplacementNode = replaceNode(Builder, Node->Operands[0]);
+ processReductionSingle(ReplacementNode, Node);
+ break;
case ComplexDeinterleavingOperation::ReductionOperation:
ReplacementNode = replaceNode(Builder, Node->Operands[0]);
processReductionOperation(ReplacementNode, Node);
@@ -2001,6 +2059,37 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
return ReplacementNode;
}
+void ComplexDeinterleavingGraph::processReductionSingle(Value *OperationReplacement, RawNodePtr Node) {
+ auto *Real = cast<Instruction>(Node->Real);
+ auto *OldPHI = ReductionInfo[Real].first;
+ auto *NewPHI = OldToNewPHI[OldPHI];
+ auto *VTy = cast<VectorType>(Real->getType());
+ auto *NewVTy = VectorType::getDoubleElementsVectorType(VTy);
+
+ Value *Init = OldPHI->getIncomingValueForBlock(Incoming);
+
+ IRBuilder<> Builder(Incoming->getTerminator());
+
+ Value *NewInit = nullptr;
+ if(auto *C = dyn_cast<Constant>(Init)) {
+ if(C->isZeroValue())
+ NewInit = Constant::getNullValue(NewVTy);
+ }
+
+ if (!NewInit)
+ NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
+ {Init, Constant::getNullValue(VTy)});
+
+ NewPHI->addIncoming(NewInit, Incoming);
+ NewPHI->addIncoming(OperationReplacement, BackEdge);
+
+ auto *FinalReduction = ReductionInfo[Real].second;
+ Builder.SetInsertPoint(&*FinalReduction->getParent()->getFirstInsertionPt());
+ // TODO Ensure that the `AddReduce` here matches the original, found in `FinalReduction`
+ auto *AddReduce = Builder.CreateAddReduce(OperationReplacement);
+ FinalReduction->replaceAllUsesWith(AddReduce);
+}
+
void ComplexDeinterleavingGraph::processReductionOperation(
Value *OperationReplacement, RawNodePtr Node) {
auto *Real = cast<Instruction>(Node->Real);
@@ -2060,8 +2149,12 @@ void ComplexDeinterleavingGraph::replaceNodes() {
auto *RootImag = cast<Instruction>(RootNode->Imag);
ReductionInfo[RootReal].first->removeIncomingValue(BackEdge);
ReductionInfo[RootImag].first->removeIncomingValue(BackEdge);
- DeadInstrRoots.push_back(cast<Instruction>(RootReal));
- DeadInstrRoots.push_back(cast<Instruction>(RootImag));
+ DeadInstrRoots.push_back(RootReal);
+ DeadInstrRoots.push_back(RootImag);
+ } else if(RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle) {
+ auto *RootInst = cast<Instruction>(RootNode->Real);
+ ReductionInfo[RootInst].first->removeIncomingValue(BackEdge);
+ DeadInstrRoots.push_back(ReductionInfo[RootInst].second);
} else {
assert(R && "Unable to find replacement for RootInstruction");
DeadInstrRoots.push_back(RootInstruction);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5e5afdb7fa0a6c..8068bb67408814 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29171,6 +29171,8 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
Value *Accumulator) const {
VectorType *Ty = cast<VectorType>(InputA->getType());
+ if (Accumulator == nullptr)
+ Accumulator = Constant::getNullValue(Ty);
bool IsScalable = Ty->isScalableTy();
bool IsInt = Ty->getElementType()->isIntegerTy();
@@ -29182,6 +29184,7 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
if (TyWidth > 128) {
int Stride = Ty->getElementCount().getKnownMinValue() / 2;
+ int AccStride = cast<VectorType>(Accumulator->getType())->getElementCount().getKnownMinValue() / 2;
auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
@@ -29191,25 +29194,23 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
B.CreateExtractVector(HalfTy, InputB, B.getInt64(Stride));
Value *LowerSplitAcc = nullptr;
Value *UpperSplitAcc = nullptr;
- if (Accumulator) {
- LowerSplitAcc = B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(0));
+ Type *FullTy = Ty;
+ FullTy = Accumulator->getType();
+ auto *HalfAccTy = VectorType::getHalfElementsVectorType(cast<VectorType>(Accumulator->getType()));
+ LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0));
UpperSplitAcc =
- B.CreateExtractVector(HalfTy, Accumulator, B.getInt64(Stride));
- }
+ B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride));
auto *LowerSplitInt = createComplexDeinterleavingIR(
B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
auto *UpperSplitInt = createComplexDeinterleavingIR(
B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
- auto *Result = B.CreateInsertVector(Ty, PoisonValue::get(Ty), LowerSplitInt,
+ auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy), LowerSplitInt,
B.getInt64(0));
- return B.CreateInsertVector(Ty, Result, UpperSplitInt, B.getInt64(Stride));
+ return B.CreateInsertVector(FullTy, Result, UpperSplitInt, B.getInt64(AccStride));
}
if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
- if (Accumulator == nullptr)
- Accumulator = Constant::getNullValue(Ty);
-
if (IsScalable) {
if (IsInt)
return B.CreateIntrinsic(
diff --git a/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
new file mode 100644
index 00000000000000..6277f9a3842bbe
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/complex-deinterleaving-cdot.ll
@@ -0,0 +1,170 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S --passes=complex-deinterleaving %s --mattr=+sve2 -o - | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define i32 @cdotp(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef %N) {
+; CHECK-LABEL: define i32 @cdotp(
+; CHECK-SAME: ptr nocapture noundef readonly [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]], i32 noundef [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2
+; CHECK-NEXT: br i1 [[CMP28_NOT]], label %[[FOR_COND_CLEANUP:.*]], label %[[FOR_BODY_PREHEADER:.*]]
+; CHECK: [[FOR_BODY_PREHEADER]]:
+; CHECK-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1
+; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK: [[VECTOR_PH]]:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK: [[VECTOR_BODY]]:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP20:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX_I:%.*]] = shl nuw nsw i64 [[INDEX]], 1
+; CHECK-NEXT: [[A_PTR:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX_I]]
+; CHECK-NEXT: [[A_LOAD:%.*]] = load <vscale x 32 x i8>, ptr [[A_PTR]], align 32
+; CHECK-NEXT: [[B_PTR:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX_I]]
+; CHECK-NEXT: [[B_LOAD:%.*]] = load <vscale x 32 x i8>, ptr [[B_PTR]], align 32
+; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A_LOAD]], i64 0)
+; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B_LOAD]], i64 0)
+; CHECK-NEXT: [[TMP8:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[A_LOAD]], i64 16)
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 16 x i8> @llvm.vector.extract.nxv16i8.nxv32i8(<vscale x 32 x i8> [[B_LOAD]], i64 16)
+; CHECK-NEXT: [[VEC_PHI:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 0)
+; CHECK-NEXT: [[TMP13:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP11]], i64 4)
+; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i8> [[TMP6]], <vscale x 16 x i8> [[TMP7]], i32 0)
+; CHECK-NEXT: [[TMP21:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.cdot.nxv4i32(<vscale x 4 x i32> [[TMP13]], <vscale x 16 x i8> [[TMP8]], <vscale x 16 x i8> [[TMP9]], i32 0)
+; CHECK-NEXT: [[TMP22:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> poison, <vscale x 4 x i32> [[TMP10]], i64 0)
+; CHECK-NEXT: [[TMP20]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP21]], i64 4)
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]]
+; CHECK: [[MIDDLE_BLOCK]]:
+; CHECK-NEXT: [[TMP23:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP20]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label %[[FOR_COND_CLEANUP_LOOPEXIT:.*]], label %[[SCALAR_PH]]
+; CHECK: [[SCALAR_PH]]:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP23]], %[[MIDDLE_BLOCK]] ], [ 0, %[[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_COND_CLEANUP_LOOPEXIT]]:
+; CHECK-NEXT: [[SUB_LCSSA:%.*]] = phi i32 [ [[SUB:%.*]], %[[FOR_BODY]] ], [ [[TMP23]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label %[[FOR_COND_CLEANUP]]
+; CHECK: [[FOR_COND_CLEANUP]]:
+; CHECK-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[SUB_LCSSA]], %[[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT: ret i32 [[RES_0_LCSSA]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[RES_030:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[SUB]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = sext i8 [[TMP15]] to i32
+; CHECK-NEXT: [[TMP16:%.*]] = or disjoint i64 [[TMP14]], 1
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1
+; CHECK-NEXT: [[CONV5:%.*]] = sext i8 [[TMP17]] to i32
+; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP14]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX9]], align 1
+; CHECK-NEXT: [[CONV10:%.*]] = sext i8 [[TMP18]] to i32
+; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP16]]
+; CHECK-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX14]], align 1
+; CHECK-NEXT: [[CONV15:%.*]] = sext i8 [[TMP19]] to i32
+; CHECK-NEXT: [[MUL16:%.*]] = mul nsw i32 [[CONV10]], [[CONV]]
+; CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[MUL16]], [[RES_030]]
+; CHECK-NEXT: [[MUL18:%.*]] = mul nsw i32 [[CONV15]], [[CONV5]]
+; CHECK-NEXT: [[SUB]] = sub i32 [[ADD17]], [[MUL18]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label %[[FOR_COND_CLEANUP_LOOPEXIT]], label %[[FOR_BODY]]
+;
+entry:
+ %cmp28.not = icmp ult i32 %N, 2
+ br i1 %cmp28.not, label %for.cond.cleanup, label %for.body.preheader
+for.body.preheader: ; preds = %entry
+ %div27 = lshr i32 %N, 1
+ %wide.trip.count = zext nneg i32 %div27 to i64
+ %0 = call i64 @llvm.vscale.i64()
+ %1 = mul i64 %0, 16
+ %min.iters.check = icmp ult i64 %wide.trip.count, %1
+ br i1 %min.iters.check, label %scalar.ph, label %vector.ph
+vector.ph: ; preds = %for.body.preheader
+ %2 = call i64 @llvm.vscale.i64()
+ %3 = mul i64 %2, 16
+ %n.mod.vf = urem i64 %wide.trip.count, %3
+ %n.vec = sub i64 %wide.trip.count, %n.mod.vf
+ %4 = call i64 @llvm.vscale.i64()
+ %5 = mul i64 %4, 16
+ br label %vector.body
+vector.body:
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.phi = phi <vscale x 4 x i32> [ zeroinitializer, %vector.ph ], [ %partial.reduce.sub, %vector.body ]
+ %index.i = shl nuw nsw i64 %index, 1
+ %a.ptr = getelementptr inbounds i8, ptr %a, i64 %index.i
+ %a.load = load <vscale x 32 x i8>, ptr %a.ptr
+ %a.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %a.load)
+ %a.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 0
+ %a.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %a.deinterleaved, 1
+ %b.ptr = getelementptr inbounds i8, ptr %b, i64 %index.i
+ %b.load = load <vscale x 32 x i8>, ptr %b.ptr
+ %b.deinterleaved = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %b.load)
+ %b.real = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 0
+ %b.imag = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %b.deinterleaved, 1
+ %a.real.ext = sext <vscale x 16 x i8> %a.real to <vscale x 16 x i32>
+ %a.imag.ext = sext <vscale x 16 x i8> %a.imag to <vscale x 16 x i32>
+ %b.real.ext = sext <vscale x 16 x i8> %b.real to <vscale x 16 x i32>
+ %b.imag.ext = sext <vscale x 16 x i8> %b.imag to <vscale x 16 x i32>
+ %real.mul = mul nsw <vscale x 16 x i32> %b.real.ext, %a.real.ext
+ %real.mul.reduced = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %vec.phi, <vscale x 16 x i32> %real.mul)
+ %imag.mul = mul nsw <vscale x 16 x i32> %b.imag.ext, %a.imag.ext
+ %imag.mul.neg = sub <vscale x 16 x i32> zeroinitializer, %imag.mul
+ %partial.reduce.sub = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %real.mul.reduced, <vscale x 16 x i32> %imag.mul.neg)
+ %index.next = add nuw i64 %index, %5
+ %22 = icmp eq i64 %index.next, %n.vec
+ br i1 %22, label %middle.block, label %vector.body
+middle.block: ; preds = %vector.body
+ %25 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %partial.reduce.sub)
+ %cmp.n = icmp eq i64 %wide.trip.count, %n.vec
+ br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %scalar.ph
+scalar.ph: ; preds = %middle.block, %for.body.preheader
+ %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %for.body.preheader ]
+ %bc.merge.rdx = phi i32 [ %25, %middle.block ], [ 0, %for.body.preheader ]
+ br label %for.body
+for.cond.cleanup.loopexit: ; preds = %middle.block, %for.body
+ %sub.lcssa = phi i32 [ %sub, %for.body ], [ %25, %middle.block ]
+ br label %for.cond.cleanup
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+ %res.0.lcssa = phi i32 [ 0, %entry ], [ %sub.lcssa, %for.cond.cleanup.loopexit ]
+ ret i32 %res.0.lcssa
+for.body: ; preds = %scalar.ph, %for.body
+ %indvars.iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %indvars.iv.next, %for.body ]
+ %res.030 = phi i32 [ %bc.merge.rdx, %scalar.ph ], [ %sub, %for.body ]
+ %26 = shl nuw nsw i64 %indvars.iv, 1
+ %arrayidx = getelementptr inbounds i8, ptr %a, i64 %26
+ %27 = load i8, ptr %arrayidx, align 1
+ %conv = sext i8 %27 to i32
+ %28 = or disjoint i64 %26, 1
+ %arrayidx4 = getelementptr inbounds i8, ptr %a, i64 %28
+ %29 = load i8, ptr %arrayidx4, align 1
+ %conv5 = sext i8 %29 to i32
+ %arrayidx9 = getelementptr inbounds i8, ptr %b, i64 %26
+ %30 = load i8, ptr %arrayidx9, align 1
+ %conv10 = sext i8 %30 to i32
+ %arrayidx14 = getelementptr inbounds i8, ptr %b, i64 %28
+ %31 = load i8, ptr %arrayidx14, align 1
+ %conv15 = sext i8 %31 to i32
+ %mul16 = mul nsw i32 %conv10, %conv
+ %add17 = add nsw i32 %mul16, %res.030
+ %mul18 = mul nsw i32 %conv15, %conv5
+ %sub = sub i32 %add17, %mul18
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
>From aa22727eec2e53a3c3ba8ef23b62695d28471431 Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Fri, 18 Oct 2024 11:14:49 +0100
Subject: [PATCH 2/3] Apply clang-format
---
.../lib/CodeGen/ComplexDeinterleavingPass.cpp | 46 ++++++++++---------
.../Target/AArch64/AArch64ISelLowering.cpp | 24 ++++++----
2 files changed, 40 insertions(+), 30 deletions(-)
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 08287a4d5ed022..3a5436714715b5 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -201,9 +201,7 @@ struct ComplexDeinterleavingCompositeNode {
}
}
- bool AreOperandsValid() {
- return OperandsValid;
- }
+ bool AreOperandsValid() { return OperandsValid; }
};
class ComplexDeinterleavingGraph {
@@ -918,11 +916,11 @@ ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) {
return It->second;
}
- if(NodePtr CN = identifyPartialReduction(R, I))
+ if (NodePtr CN = identifyPartialReduction(R, I))
return CN;
bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I);
- if(!IsReduction && R->getType() != I->getType())
+ if (!IsReduction && R->getType() != I->getType())
return nullptr;
if (NodePtr CN = identifySplat(R, I))
@@ -1450,18 +1448,20 @@ bool ComplexDeinterleavingGraph::identifyNodes(Instruction *RootI) {
if (It != RootToNode.end()) {
auto RootNode = It->second;
assert(RootNode->Operation ==
- ComplexDeinterleavingOperation::ReductionOperation || RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle);
+ ComplexDeinterleavingOperation::ReductionOperation ||
+ RootNode->Operation ==
+ ComplexDeinterleavingOperation::ReductionSingle);
// Find out which part, Real or Imag, comes later, and only if we come to
// the latest part, add it to OrderedRoots.
auto *R = cast<Instruction>(RootNode->Real);
auto *I = RootNode->Imag ? cast<Instruction>(RootNode->Imag) : nullptr;
Instruction *ReplacementAnchor;
- if(I)
+ if (I)
ReplacementAnchor = R->comesBefore(I) ? I : R;
- else
+ else
ReplacementAnchor = R;
-
+
if (ReplacementAnchor != RootI)
return false;
OrderedRoots.push_back(RootI);
@@ -1553,7 +1553,7 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
for (size_t j = i + 1; j < OperationInstruction.size(); ++j) {
if (Processed[j])
continue;
-
+
auto *Imag = OperationInstruction[j];
if (Real->getType() != Imag->getType())
continue;
@@ -1588,18 +1588,20 @@ void ComplexDeinterleavingGraph::identifyReductionNodes() {
// We want to check that we have 2 operands, but the function attributes
// being counted as operands bloats this value.
- if(Real->getNumOperands() < 2)
+ if (Real->getNumOperands() < 2)
continue;
RealPHI = ReductionInfo[Real].first;
ImagPHI = nullptr;
PHIsFound = false;
auto Node = identifyNode(Real->getOperand(0), Real->getOperand(1));
- if(Node && PHIsFound) {
- LLVM_DEBUG(dbgs() << "Identified single reduction starting from instruction: "
- << *Real << "/" << *ReductionInfo[Real].second << "\n");
+ if (Node && PHIsFound) {
+ LLVM_DEBUG(
+ dbgs() << "Identified single reduction starting from instruction: "
+ << *Real << "/" << *ReductionInfo[Real].second << "\n");
Processed[i] = true;
- auto RootNode = prepareCompositeNode(ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr);
+ auto RootNode = prepareCompositeNode(
+ ComplexDeinterleavingOperation::ReductionSingle, Real, nullptr);
RootNode->addOperand(Node);
RootToNode[Real] = RootNode;
submitCompositeNode(RootNode);
@@ -2059,7 +2061,8 @@ Value *ComplexDeinterleavingGraph::replaceNode(IRBuilderBase &Builder,
return ReplacementNode;
}
-void ComplexDeinterleavingGraph::processReductionSingle(Value *OperationReplacement, RawNodePtr Node) {
+void ComplexDeinterleavingGraph::processReductionSingle(
+ Value *OperationReplacement, RawNodePtr Node) {
auto *Real = cast<Instruction>(Node->Real);
auto *OldPHI = ReductionInfo[Real].first;
auto *NewPHI = OldToNewPHI[OldPHI];
@@ -2071,21 +2074,21 @@ void ComplexDeinterleavingGraph::processReductionSingle(Value *OperationReplacem
IRBuilder<> Builder(Incoming->getTerminator());
Value *NewInit = nullptr;
- if(auto *C = dyn_cast<Constant>(Init)) {
- if(C->isZeroValue())
+ if (auto *C = dyn_cast<Constant>(Init)) {
+ if (C->isZeroValue())
NewInit = Constant::getNullValue(NewVTy);
}
if (!NewInit)
NewInit = Builder.CreateIntrinsic(Intrinsic::vector_interleave2, NewVTy,
- {Init, Constant::getNullValue(VTy)});
+ {Init, Constant::getNullValue(VTy)});
NewPHI->addIncoming(NewInit, Incoming);
NewPHI->addIncoming(OperationReplacement, BackEdge);
auto *FinalReduction = ReductionInfo[Real].second;
Builder.SetInsertPoint(&*FinalReduction->getParent()->getFirstInsertionPt());
- // TODO Ensure that the `AddReduce` here matches the original, found in `FinalReduction`
+
auto *AddReduce = Builder.CreateAddReduce(OperationReplacement);
FinalReduction->replaceAllUsesWith(AddReduce);
}
@@ -2151,7 +2154,8 @@ void ComplexDeinterleavingGraph::replaceNodes() {
ReductionInfo[RootImag].first->removeIncomingValue(BackEdge);
DeadInstrRoots.push_back(RootReal);
DeadInstrRoots.push_back(RootImag);
- } else if(RootNode->Operation == ComplexDeinterleavingOperation::ReductionSingle) {
+ } else if (RootNode->Operation ==
+ ComplexDeinterleavingOperation::ReductionSingle) {
auto *RootInst = cast<Instruction>(RootNode->Real);
ReductionInfo[RootInst].first->removeIncomingValue(BackEdge);
DeadInstrRoots.push_back(ReductionInfo[RootInst].second);
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8068bb67408814..6f58ed4f6cc311 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29184,7 +29184,10 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
if (TyWidth > 128) {
int Stride = Ty->getElementCount().getKnownMinValue() / 2;
- int AccStride = cast<VectorType>(Accumulator->getType())->getElementCount().getKnownMinValue() / 2;
+ int AccStride = cast<VectorType>(Accumulator->getType())
+ ->getElementCount()
+ .getKnownMinValue() /
+ 2;
auto *HalfTy = VectorType::getHalfElementsVectorType(Ty);
auto *LowerSplitA = B.CreateExtractVector(HalfTy, InputA, B.getInt64(0));
auto *LowerSplitB = B.CreateExtractVector(HalfTy, InputB, B.getInt64(0));
@@ -29195,19 +29198,22 @@ Value *AArch64TargetLowering::createComplexDeinterleavingIR(
Value *LowerSplitAcc = nullptr;
Value *UpperSplitAcc = nullptr;
Type *FullTy = Ty;
- FullTy = Accumulator->getType();
- auto *HalfAccTy = VectorType::getHalfElementsVectorType(cast<VectorType>(Accumulator->getType()));
- LowerSplitAcc = B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0));
- UpperSplitAcc =
- B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride));
+ FullTy = Accumulator->getType();
+ auto *HalfAccTy = VectorType::getHalfElementsVectorType(
+ cast<VectorType>(Accumulator->getType()));
+ LowerSplitAcc =
+ B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(0));
+ UpperSplitAcc =
+ B.CreateExtractVector(HalfAccTy, Accumulator, B.getInt64(AccStride));
auto *LowerSplitInt = createComplexDeinterleavingIR(
B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
auto *UpperSplitInt = createComplexDeinterleavingIR(
B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
- auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy), LowerSplitInt,
- B.getInt64(0));
- return B.CreateInsertVector(FullTy, Result, UpperSplitInt, B.getInt64(AccStride));
+ auto *Result = B.CreateInsertVector(FullTy, PoisonValue::get(FullTy),
+ LowerSplitInt, B.getInt64(0));
+ return B.CreateInsertVector(FullTy, Result, UpperSplitInt,
+ B.getInt64(AccStride));
}
if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
>From effea38f68aa683c524ca628c64bb9618b3cb4db Mon Sep 17 00:00:00 2001
From: Nick Guy <nicholas.guy at arm.com>
Date: Fri, 18 Oct 2024 13:14:33 +0100
Subject: [PATCH 3/3] Remove erroneously added function and call
---
llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp | 3 ---
1 file changed, 3 deletions(-)
diff --git a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
index 3a5436714715b5..18ad74aa9bae12 100644
--- a/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
+++ b/llvm/lib/CodeGen/ComplexDeinterleavingPass.cpp
@@ -335,7 +335,6 @@ class ComplexDeinterleavingGraph {
/// i: ai - br
NodePtr identifyAdd(Instruction *Real, Instruction *Imag);
NodePtr identifySymmetricOperation(Instruction *Real, Instruction *Imag);
- NodePtr identifyPartialReduction(Value *R, Value *I);
NodePtr identifyNode(Value *R, Value *I);
NodePtr identifyNode(Value *R, Value *I, bool &FromCache);
@@ -916,8 +915,6 @@ ComplexDeinterleavingGraph::identifyNode(Value *R, Value *I, bool &FromCache) {
return It->second;
}
- if (NodePtr CN = identifyPartialReduction(R, I))
- return CN;
bool IsReduction = RealPHI == R && (!ImagPHI || ImagPHI == I);
if (!IsReduction && R->getType() != I->getType())
More information about the llvm-commits
mailing list