[llvm] [LoopVectorizer] Add support for partial reductions (PR #92418)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jun 5 06:34:50 PDT 2024
https://github.com/NickGuy-Arm updated https://github.com/llvm/llvm-project/pull/92418
>From 7f960746ec81bde496acbaf844d0e3f097fa4b0a Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy at arm.com>
Date: Fri, 17 May 2024 11:15:11 +0100
Subject: [PATCH 01/10] [NFC] Test pre-commit
---
.../CodeGen/AArch64/partial-reduce-sdot.ll | 99 +++++++++++++++++++
1 file changed, 99 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll b/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
new file mode 100644
index 0000000000000..fc6e3239a1b43
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes=loop-vectorize -force-vector-interleave=1 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define void @dotp(ptr %a, ptr %b) #0 {
+; CHECK-LABEL: define void @dotp(
+; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 16
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 16
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP17]], align 1
+; CHECK-NEXT: [[TMP19:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP25:%.*]] = getelementptr i8, ptr [[TMP21]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP25]], align 1
+; CHECK-NEXT: [[TMP27:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP29:%.*]] = mul <vscale x 16 x i32> [[TMP27]], [[TMP19]]
+; CHECK-NEXT: [[TMP14]] = add <vscale x 16 x i32> [[TMP29]], [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP14]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[TMP20:%.*]] = lshr i32 [[ADD_LCSSA]], 0
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ACC_010:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP18:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP18]] to i32
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP22:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP22]] to i32
+; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[CONV3]], [[CONV]]
+; CHECK-NEXT: [[ADD]] = add i32 [[MUL]], [[ACC_010]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %0 = lshr i32 %add, 0
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %acc.010 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr i8, ptr %a, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %1 to i32
+ %arrayidx2 = getelementptr i8, ptr %b, i64 %indvars.iv
+ %2 = load i8, ptr %arrayidx2, align 1
+ %conv3 = zext i8 %2 to i32
+ %mul = mul i32 %conv3, %conv
+ %add = add i32 %mul, %acc.010
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, 0
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+
+; uselistorder directives
+ uselistorder i32 %add, { 1, 0 }
+}
+
+attributes #0 = { "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+v8a" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
>From 9de3c24c2f08ef014f55ecc9542cd1baa01759e0 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy at arm.com>
Date: Fri, 17 May 2024 11:17:26 +0100
Subject: [PATCH 02/10] [LoopVectorizer] Add support for partial reductions
---
llvm/include/llvm/IR/DerivedTypes.h | 10 ++
llvm/include/llvm/IR/Intrinsics.h | 5 +-
llvm/include/llvm/IR/Intrinsics.td | 10 ++
llvm/lib/IR/Function.cpp | 16 +++
.../Transforms/Vectorize/LoopVectorize.cpp | 122 ++++++++++++++++++
.../Transforms/Vectorize/VPRecipeBuilder.h | 2 +
llvm/lib/Transforms/Vectorize/VPlan.h | 43 +++++-
.../Transforms/Vectorize/VPlanAnalysis.cpp | 6 +-
llvm/lib/Transforms/Vectorize/VPlanAnalysis.h | 2 +
.../lib/Transforms/Vectorize/VPlanRecipes.cpp | 80 +++++++++++-
llvm/lib/Transforms/Vectorize/VPlanValue.h | 1 +
.../CodeGen/AArch64/partial-reduce-sdot.ll | 7 +-
12 files changed, 291 insertions(+), 13 deletions(-)
diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index 443fb7de3b821..866a01c9afebd 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -512,6 +512,16 @@ class VectorType : public Type {
EltCnt.divideCoefficientBy(2));
}
+ /// This static method returns a VectorType with quarter as many elements as the
+ /// input type and the same element type.
+ static VectorType *getQuarterElementsVectorType(VectorType *VTy) {
+ auto EltCnt = VTy->getElementCount();
+ assert(EltCnt.isKnownEven() &&
+ "Cannot halve vector with odd number of elements.");
+ return VectorType::get(VTy->getElementType(),
+ EltCnt.divideCoefficientBy(4));
+ }
+
/// This static method returns a VectorType with twice as many elements as the
/// input type and the same element type.
static VectorType *getDoubleElementsVectorType(VectorType *VTy) {
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index 340c1c326d066..e03e7e0bf50de 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -131,6 +131,7 @@ namespace Intrinsic {
ExtendArgument,
TruncArgument,
HalfVecArgument,
+ QuarterVecArgument,
SameVecWidthArgument,
VecOfAnyPtrsToElt,
VecElementArgument,
@@ -160,7 +161,7 @@ namespace Intrinsic {
unsigned getArgumentNumber() const {
assert(Kind == Argument || Kind == ExtendArgument ||
- Kind == TruncArgument || Kind == HalfVecArgument ||
+ Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
Kind == SameVecWidthArgument || Kind == VecElementArgument ||
Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
Kind == VecOfBitcastsToInt);
@@ -168,7 +169,7 @@ namespace Intrinsic {
}
ArgKind getArgumentKind() const {
assert(Kind == Argument || Kind == ExtendArgument ||
- Kind == TruncArgument || Kind == HalfVecArgument ||
+ Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
Kind == SameVecWidthArgument ||
Kind == VecElementArgument || Kind == Subdivide2Argument ||
Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index 1d20f7e1b1985..dad177e595341 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -321,6 +321,7 @@ def IIT_I4 : IIT_Int<4, 58>;
def IIT_AARCH64_SVCOUNT : IIT_VT<aarch64svcount, 59>;
def IIT_V6 : IIT_Vec<6, 60>;
def IIT_V10 : IIT_Vec<10, 61>;
+def IIT_QUARTER_VEC_ARG : IIT_Base<62>;
}
defvar IIT_all_FixedTypes = !filter(iit, IIT_all,
@@ -457,6 +458,9 @@ class LLVMVectorElementType<int num> : LLVMMatchType<num, IIT_VEC_ELEMENT>;
class LLVMHalfElementsVectorType<int num>
: LLVMMatchType<num, IIT_HALF_VEC_ARG>;
+class LLVMQuarterElementsVectorType<int num>
+ : LLVMMatchType<num, IIT_QUARTER_VEC_ARG>;
+
// Match the type of another intrinsic parameter that is expected to be a
// vector type (i.e. <N x iM>) but with each element subdivided to
// form a vector with more elements that are smaller than the original.
@@ -2605,6 +2609,12 @@ def int_experimental_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfEleme
[llvm_anyvector_ty],
[IntrNoMem]>;
+//===-------------- Intrinsics to perform partial reduction ---------------===//
+
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMQuarterElementsVectorType<0>],
+ [llvm_anyvector_ty],
+ [IntrNoMem]>;
+
//===----------------- Pointer Authentication Intrinsics ------------------===//
//
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index e66fe73425e86..e9eebd5e35300 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1240,6 +1240,12 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
ArgInfo));
return;
}
+ case IIT_QUARTER_VEC_ARG: {
+ unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::QuarterVecArgument,
+ ArgInfo));
+ return;
+ }
case IIT_SAME_VEC_WIDTH_ARG: {
unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
OutputTable.push_back(IITDescriptor::get(IITDescriptor::SameVecWidthArgument,
@@ -1404,6 +1410,9 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
case IITDescriptor::HalfVecArgument:
return VectorType::getHalfElementsVectorType(cast<VectorType>(
Tys[D.getArgumentNumber()]));
+ case IITDescriptor::QuarterVecArgument: {
+ return VectorType::getQuarterElementsVectorType(cast<VectorType>(Tys[D.getArgumentNumber()]));
+ }
case IITDescriptor::SameVecWidthArgument: {
Type *EltTy = DecodeFixedType(Infos, Tys, Context);
Type *Ty = Tys[D.getArgumentNumber()];
@@ -1619,6 +1628,13 @@ static bool matchIntrinsicType(
return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
VectorType::getHalfElementsVectorType(
cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+ case IITDescriptor::QuarterVecArgument: {
+ if (D.getArgumentNumber() >= ArgTys.size())
+ return IsDeferredCheck || DeferCheck(Ty);
+ return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
+ VectorType::getQuarterElementsVectorType(
+ cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
+ }
case IITDescriptor::SameVecWidthArgument: {
if (D.getArgumentNumber() >= ArgTys.size()) {
// Defer check and subsequent check for the vector element type.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 33c4decd58a6c..1f37df061bbf7 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2203,6 +2203,92 @@ static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
}
+static void getPartialReductionInstrChain(Instruction *Instr, SmallVector<Value*, 4> &Chain) {
+ Instruction *Mul = cast<Instruction>(Instr->getOperand(0));
+ Instruction *Ext0 = cast<ZExtInst>(Mul->getOperand(0));
+ Instruction *Ext1 = cast<ZExtInst>(Mul->getOperand(1));
+
+ Chain.push_back(Mul);
+ Chain.push_back(Ext0);
+ Chain.push_back(Ext1);
+ Chain.push_back(Instr->getOperand(1));
+}
+
+
+/// @param Instr The root instruction to scan
+static bool isInstrPartialReduction(Instruction *Instr) {
+ Value *ExpectedPhi;
+ Value *A, *B;
+ Value *InductionA, *InductionB;
+
+ using namespace llvm::PatternMatch;
+ auto Pattern = m_Add(
+ m_OneUse(m_Mul(
+ m_OneUse(m_ZExt(
+ m_OneUse(m_Load(
+ m_GEP(
+ m_Value(A),
+ m_Value(InductionA)))))),
+ m_OneUse(m_ZExt(
+ m_OneUse(m_Load(
+ m_GEP(
+ m_Value(B),
+ m_Value(InductionB))))))
+ )), m_Value(ExpectedPhi));
+
+ bool Matches = match(Instr, Pattern);
+
+ if(!Matches)
+ return false;
+
+ // Check that the two induction variable uses are to the same induction variable
+ if(InductionA != InductionB) {
+ LLVM_DEBUG(dbgs() << "Loop uses different induction variables for each input variable, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ Instruction *Mul = cast<Instruction>(Instr->getOperand(0));
+ Instruction *Ext0 = cast<ZExtInst>(Mul->getOperand(0));
+ Instruction *Ext1 = cast<ZExtInst>(Mul->getOperand(1));
+
+ // Check that the extends extend to i32
+ if(!Ext0->getType()->isIntegerTy(32) || !Ext1->getType()->isIntegerTy(32)) {
+ LLVM_DEBUG(dbgs() << "Extends don't extend to the correct width, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ // Check that the loads are loading i8
+ LoadInst *Load0 = cast<LoadInst>(Ext0->getOperand(0));
+ LoadInst *Load1 = cast<LoadInst>(Ext1->getOperand(0));
+ if(!Load0->getType()->isIntegerTy(8) || !Load1->getType()->isIntegerTy(8)) {
+ LLVM_DEBUG(dbgs() << "Loads don't load the correct width, cannot create a partial reduction\n");
+ return false;
+ }
+
+ // Check that the add feeds into ExpectedPhi
+ PHINode *PhiNode = dyn_cast<PHINode>(ExpectedPhi);
+ if(!PhiNode) {
+ LLVM_DEBUG(dbgs() << "Expected Phi node was not a phi, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ // Check that the first phi value is a zero initializer
+ ConstantInt *ZeroInit = dyn_cast<ConstantInt>(PhiNode->getIncomingValue(0));
+ if(!ZeroInit || !ZeroInit->isZero()) {
+ LLVM_DEBUG(dbgs() << "First PHI value is not a constant zero, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ // Check that the second phi value is the instruction we're looking at
+ Instruction *MaybeAdd = dyn_cast<Instruction>(PhiNode->getIncomingValue(1));
+ if(!MaybeAdd || MaybeAdd != Instr) {
+ LLVM_DEBUG(dbgs() << "Second PHI value is not the root add, cannot create a partial reduction.\n");
+ return false;
+ }
+
+ return true;
+}
+
// Return true if \p OuterLp is an outer loop annotated with hints for explicit
// vectorization. The loop needs to be annotated with #pragma omp simd
// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
@@ -5084,6 +5170,13 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
return false;
}
+ // Prevent epilogue vectorization if a partial reduction is involved
+ // TODO Is there a cleaner way to check this?
+ if(any_of(Legal->getReductionVars(), [&](const std::pair<PHINode *, RecurrenceDescriptor> &Reduction) {
+ return isInstrPartialReduction(Reduction.second.getLoopExitInstr());
+ }))
+ return false;
+
// Epilogue vectorization code has not been auditted to ensure it handles
// non-latch exits properly. It may be fine, but it needs auditted and
// tested.
@@ -7182,6 +7275,17 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
+
+ // Ignore any values that we know will be flattened
+ for(auto Reduction : this->Legal->getReductionVars()) {
+ auto &Recurrence = Reduction.second;
+ if(isInstrPartialReduction(Recurrence.getLoopExitInstr())) {
+ SmallVector<Value*, 4> PartialReductionValues;
+ getPartialReductionInstrChain(Recurrence.getLoopExitInstr(), PartialReductionValues);
+ ValuesToIgnore.insert(PartialReductionValues.begin(), PartialReductionValues.end());
+ VecValuesToIgnore.insert(PartialReductionValues.begin(), PartialReductionValues.end());
+ }
+ }
}
void LoopVectorizationCostModel::collectInLoopReductions() {
@@ -8536,9 +8640,24 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
*CI);
}
+ if(auto *PartialReduce = tryToCreatePartialReduction(Range, Instr, Operands))
+ return PartialReduce;
+
return tryToWiden(Instr, Operands, VPBB);
}
+VPRecipeBase *VPRecipeBuilder::tryToCreatePartialReduction(
+ VFRange &Range, Instruction *Instr, ArrayRef<VPValue *> Operands) {
+
+ if(isInstrPartialReduction(Instr)) {
+ auto EC = ElementCount::getScalable(16);
+ if(std::find(Range.begin(), Range.end(), EC) == Range.end())
+ return nullptr;
+ return new VPPartialReductionRecipe(*Instr, make_range(Operands.begin(), Operands.end()));
+ }
+ return nullptr;
+}
+
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -8746,6 +8865,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
VPBB->appendRecipe(Recipe);
}
+ for(auto &Recipe : *VPBB)
+ Recipe.postInsertionOp();
+
VPBlockUtils::insertBlockAfter(new VPBasicBlock(), VPBB);
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
}
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index b4c7ab02f928f..c439f221709e1 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -116,6 +116,8 @@ class VPRecipeBuilder {
ArrayRef<VPValue *> Operands,
VFRange &Range, VPBasicBlock *VPBB);
+ VPRecipeBase* tryToCreatePartialReduction(VFRange &Range, Instruction* Instr, ArrayRef<VPValue*> Operands);
+
/// Set the recipe created for given ingredient.
void setRecipe(Instruction *I, VPRecipeBase *R) {
assert(!Ingredient2Recipe.contains(I) &&
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c74329a0bcc4a..5a572ecb798d6 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -767,6 +767,8 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
/// \returns an iterator pointing to the element after the erased one
iplist<VPRecipeBase>::iterator eraseFromParent();
+ virtual void postInsertionOp() {}
+
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
// All VPDefs are also VPRecipeBases.
@@ -1881,14 +1883,19 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
/// The phi is part of an ordered reduction. Requires IsInLoop to be true.
bool IsOrdered;
+ /// The amount that the VF should be divided by during ::execute
+ unsigned VFScaleFactor = 1;
+
public:
+
/// Create a new VPReductionPHIRecipe for the reduction \p Phi described by \p
/// RdxDesc.
VPReductionPHIRecipe(PHINode *Phi, const RecurrenceDescriptor &RdxDesc,
VPValue &Start, bool IsInLoop = false,
- bool IsOrdered = false)
+ bool IsOrdered = false, unsigned VFScaleFactor = 1)
: VPHeaderPHIRecipe(VPDef::VPReductionPHISC, Phi, &Start),
- RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered) {
+ RdxDesc(RdxDesc), IsInLoop(IsInLoop), IsOrdered(IsOrdered),
+ VFScaleFactor(VFScaleFactor) {
assert((!IsOrdered || IsInLoop) && "IsOrdered requires IsInLoop");
}
@@ -1897,7 +1904,7 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
VPReductionPHIRecipe *clone() override {
auto *R =
new VPReductionPHIRecipe(cast<PHINode>(getUnderlyingInstr()), RdxDesc,
- *getOperand(0), IsInLoop, IsOrdered);
+ *getOperand(0), IsInLoop, IsOrdered, VFScaleFactor);
R->addOperand(getBackedgeValue());
return R;
}
@@ -1908,6 +1915,10 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
return R->getVPDefID() == VPDef::VPReductionPHISC;
}
+ void SetVFScaleFactor(unsigned ScaleFactor) {
+ VFScaleFactor = ScaleFactor;
+ }
+
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
@@ -1928,6 +1939,32 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
bool isInLoop() const { return IsInLoop; }
};
+class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
+ unsigned Opcode;
+public:
+ template <typename IterT>
+ VPPartialReductionRecipe(Instruction &I,
+ iterator_range<IterT> Operands) : VPRecipeWithIRFlags(
+ VPDef::VPPartialReductionSC, Operands, I), Opcode(I.getOpcode())
+ {}
+ ~VPPartialReductionRecipe() override = default;
+ VPPartialReductionRecipe *clone() override {
+ auto *R = new VPPartialReductionRecipe(*getUnderlyingInstr(), operands());
+ R->transferFlags(*this);
+ return R;
+ }
+ VP_CLASSOF_IMPL(VPDef::VPPartialReductionSC)
+ /// Generate the reduction in the loop
+ void execute(VPTransformState &State) override;
+ void postInsertionOp() override;
+ unsigned getOpcode() { return Opcode; }
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+#endif
+};
+
/// A recipe for vectorizing a phi-node as a sequence of mask-based select
/// instructions.
class VPBlendRecipe : public VPSingleDefRecipe {
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 5f93339083f0c..8a75668886599 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -208,6 +208,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPReplicateRecipe *R) {
llvm_unreachable("Unhandled opcode");
}
+Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPPartialReductionRecipe *R) {
+ return R->getUnderlyingInstr()->getType();
+}
+
Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
if (Type *CachedTy = CachedTypes.lookup(V))
return CachedTy;
@@ -238,7 +242,7 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
return inferScalarType(R->getOperand(0));
})
.Case<VPBlendRecipe, VPInstruction, VPWidenRecipe, VPReplicateRecipe,
- VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe>(
+ VPWidenCallRecipe, VPWidenMemoryRecipe, VPWidenSelectRecipe, VPPartialReductionRecipe>(
[this](const auto *R) { return inferScalarTypeForRecipe(R); })
.Case<VPInterleaveRecipe>([V](const VPInterleaveRecipe *R) {
// TODO: Use info from interleave group.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
index 7d310b1b31b6f..3bd8d24542199 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.h
@@ -23,6 +23,7 @@ class VPWidenIntOrFpInductionRecipe;
class VPWidenMemoryRecipe;
struct VPWidenSelectRecipe;
class VPReplicateRecipe;
+class VPPartialReductionRecipe;
class Type;
/// An analysis for type-inference for VPValues.
@@ -49,6 +50,7 @@ class VPTypeAnalysis {
Type *inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R);
Type *inferScalarTypeForRecipe(const VPWidenSelectRecipe *R);
Type *inferScalarTypeForRecipe(const VPReplicateRecipe *R);
+ Type *inferScalarTypeForRecipe(const VPPartialReductionRecipe *R);
public:
VPTypeAnalysis(Type *CanonicalIVTy, LLVMContext &Ctx)
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 9ec422ec002c8..905fa21d38252 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -245,6 +245,76 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
insertBefore(BB, I);
}
+void VPPartialReductionRecipe::execute(VPTransformState &State) {
+ State.setDebugLocFrom(getDebugLoc());
+ auto &Builder = State.Builder;
+
+ switch(Opcode) {
+ case Instruction::Add: {
+
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ Value* Mul = nullptr;
+ Value* Phi = nullptr;
+ SmallVector<Value*, 2> Ops;
+ for (VPValue *VPOp : operands()) {
+ auto *Op = State.get(VPOp, Part);
+ Ops.push_back(Op);
+ if(isa<PHINode>(Op))
+ Phi = Op;
+ else
+ Mul = Op;
+ }
+
+ assert(Phi && Mul && "Phi and Mul must be set");
+ assert(isa<ScalableVectorType>(Ops[0]->getType()) && "Type must be a scalable vector");
+
+ ScalableVectorType *FullTy = cast<ScalableVectorType>(Ops[0]->getType());
+ Type *RetTy = ScalableVectorType::get(FullTy->getScalarType(), 4);
+
+ Intrinsic::ID PartialIntrinsic = Intrinsic::not_intrinsic;
+ switch(Opcode) {
+ case Instruction::Add:
+ PartialIntrinsic =
+ Intrinsic::experimental_vector_partial_reduce_add;
+ break;
+ default:
+ llvm_unreachable("Opcode not handled");
+ }
+
+ assert(PartialIntrinsic != Intrinsic::not_intrinsic);
+
+ Value *V = Builder.CreateIntrinsic(RetTy, PartialIntrinsic, Mul, nullptr, Twine("partial.reduce"));
+ V = Builder.CreateNAryOp(Opcode, {V, Phi});
+ if (auto *VecOp = dyn_cast<Instruction>(V))
+ setFlags(VecOp);
+
+ // Use this vector value for all users of the original instruction.
+ State.set(this, V, Part);
+ State.addMetadata(V, dyn_cast_or_null<Instruction>(getUnderlyingValue()));
+ }
+ break;
+ }
+ default:
+ LLVM_DEBUG(dbgs() << "LV: Found an unhandled opcode : " << Instruction::getOpcodeName(Opcode));
+ llvm_unreachable("Unhandled instruction!");
+ }
+}
+
+void VPPartialReductionRecipe::postInsertionOp() {
+ cast<VPReductionPHIRecipe>(this->getOperand(1))->SetVFScaleFactor(4);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPPartialReductionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << Indent << "PARTIAL-REDUCE ";
+ printAsOperand(O, SlotTracker);
+ O << " = " << Instruction::getOpcodeName(Opcode);
+ printFlags(O);
+ printOperands(O, SlotTracker);
+}
+#endif
+
FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const {
assert(OpType == OperationType::FPMathOp &&
"recipe doesn't have fast math flags");
@@ -1953,6 +2023,8 @@ void VPFirstOrderRecurrencePHIRecipe::print(raw_ostream &O, const Twine &Indent,
void VPReductionPHIRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
+ auto VF = State.VF.divideCoefficientBy(VFScaleFactor);
+
// Reductions do not have to start at zero. They can start with
// any loop invariant values.
VPValue *StartVPV = getStartValue();
@@ -1962,9 +2034,9 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
// Phi nodes have cycles, so we need to vectorize them in two stages. This is
// stage #1: We create a new vector PHI node with no incoming edges. We'll use
// this value when we vectorize all of the instructions that use the PHI.
- bool ScalarPHI = State.VF.isScalar() || IsInLoop;
+ bool ScalarPHI = VF.isScalar() || IsInLoop;
Type *VecTy = ScalarPHI ? StartV->getType()
- : VectorType::get(StartV->getType(), State.VF);
+ : VectorType::get(StartV->getType(), VF);
BasicBlock *HeaderBB = State.CFG.PrevBB;
assert(State.CurrentVectorLoop->getHeader() == HeaderBB &&
@@ -1989,14 +2061,14 @@ void VPReductionPHIRecipe::execute(VPTransformState &State) {
IRBuilderBase::InsertPointGuard IPBuilder(Builder);
Builder.SetInsertPoint(VectorPH->getTerminator());
StartV = Iden =
- Builder.CreateVectorSplat(State.VF, StartV, "minmax.ident");
+ Builder.CreateVectorSplat(VF, StartV, "minmax.ident");
}
} else {
Iden = RdxDesc.getRecurrenceIdentity(RK, VecTy->getScalarType(),
RdxDesc.getFastMathFlags());
if (!ScalarPHI) {
- Iden = Builder.CreateVectorSplat(State.VF, Iden);
+ Iden = Builder.CreateVectorSplat(VF, Iden);
IRBuilderBase::InsertPointGuard IPBuilder(Builder);
Builder.SetInsertPoint(VectorPH->getTerminator());
Constant *Zero = Builder.getInt32(0);
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 96d04271850f7..c971c9a48364e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -348,6 +348,7 @@ class VPDef {
VPInstructionSC,
VPInterleaveSC,
VPReductionSC,
+ VPPartialReductionSC,
VPReplicateSC,
VPScalarCastSC,
VPScalarIVStepsSC,
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll b/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
index fc6e3239a1b43..1eafd505b199e 100644
--- a/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
@@ -22,7 +22,7 @@ define void @dotp(ptr %a, ptr %b) #0 {
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP11]]
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP13]], i32 0
@@ -33,12 +33,13 @@ define void @dotp(ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP25]], align 1
; CHECK-NEXT: [[TMP27:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
; CHECK-NEXT: [[TMP29:%.*]] = mul <vscale x 16 x i32> [[TMP27]], [[TMP19]]
-; CHECK-NEXT: [[TMP14]] = add <vscale x 16 x i32> [[TMP29]], [[VEC_PHI]]
+; CHECK-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP29]])
+; CHECK-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
-; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP14]])
+; CHECK-NEXT: [[TMP33:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP14]])
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
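For context, the pattern recognised by isInstrPartialReduction in the change above corresponds to a source-level dot-product loop of roughly the following shape. This is a hypothetical C++ sketch of the udot/sdot idiom being matched, not code from this patch: i8 loads indexed by a single induction variable, widened to i32, multiplied, and accumulated into a zero-initialised reduction variable.

  #include <cstdint>

  // Hypothetical example of the matched idiom: both loads use the same
  // induction variable, each i8 value is widened to i32, and the products
  // are accumulated into an accumulator that starts at zero.
  int32_t dot_product(const uint8_t *a, const uint8_t *b, int64_t n) {
    int32_t acc = 0;
    for (int64_t i = 0; i < n; ++i)
      acc += int32_t(a[i]) * int32_t(b[i]); // zext i8 -> i32, mul, add
    return acc;
  }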
>From d85a2a44239ddd3973008d1d0a607cdc283bdcec Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy at arm.com>
Date: Thu, 30 May 2024 15:03:24 +0100
Subject: [PATCH 03/10] [LoopVectorizer] Add langref entry for partial
reduction intrinsic
---
llvm/docs/LangRef.rst | 28 ++++++++++++++++++++++++++++
1 file changed, 28 insertions(+)
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 9592929d79feb..e92098ddf7d98 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -19030,6 +19030,34 @@ will be on any later loop iteration.
This intrinsic will only return 0 if the input count is also 0. A non-zero input
count will produce a non-zero result.
+'``llvm.experimental.vector.partial.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+ declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v8i32(<8 x i32> %in)
+ declare <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<16 x i32> %in)
+ declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 8 x i32> %in)
+ declare <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32> %in)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.partial.reduce.add.*``' intrinsics perform an
+integer ``ADD`` reduction of subvectors within a vector, returning each scalar
+result as a lane within a vector. The return type is a vector type with the
+same element type as the input vector and a width that is a fraction of the
+input vector's width (typically either half or a quarter).
+
+Arguments:
+""""""""""
+
+The argument to this intrinsic must be a vector of integer values.
+
Matrix Intrinsics
-----------------
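To make the LangRef wording above concrete, here is a scalar model of the intrinsic for fixed-width vectors. It is only a sketch: the contiguous grouping of input lanes into output lanes is an assumption made for illustration, and for the loop-vectorizer use the exact lane mapping does not change the final result, since the partial sums are later folded by a full vector.reduce.add.

  #include <cassert>
  #include <cstdint>
  #include <vector>

  // Sketch of the intended semantics: each output lane accumulates
  // in.size() / out_len input lanes. Contiguous grouping is assumed here
  // purely for illustration.
  std::vector<int32_t> partial_reduce_add(const std::vector<int32_t> &in,
                                          size_t out_len) {
    assert(in.size() % out_len == 0 && "input width must divide evenly");
    size_t group = in.size() / out_len;
    std::vector<int32_t> out(out_len, 0);
    for (size_t lane = 0; lane < out_len; ++lane)
      for (size_t j = 0; j < group; ++j)
        out[lane] += in[lane * group + j];
    return out;
  }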
>From 48966166a629ebed8191d0b0c3256c96e31a358f Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy at arm.com>
Date: Thu, 30 May 2024 15:04:55 +0100
Subject: [PATCH 04/10] [LoopVectorizer] Removed 4x restriction from partial
reduction intrinsic
---
llvm/include/llvm/IR/DerivedTypes.h | 10 ----------
llvm/include/llvm/IR/Intrinsics.h | 5 ++---
llvm/include/llvm/IR/Intrinsics.td | 6 +-----
llvm/lib/IR/Function.cpp | 16 ----------------
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
llvm/lib/Transforms/Vectorize/VPlan.h | 7 ++++---
llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 5 +++--
llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll | 2 +-
8 files changed, 12 insertions(+), 41 deletions(-)
diff --git a/llvm/include/llvm/IR/DerivedTypes.h b/llvm/include/llvm/IR/DerivedTypes.h
index 866a01c9afebd..443fb7de3b821 100644
--- a/llvm/include/llvm/IR/DerivedTypes.h
+++ b/llvm/include/llvm/IR/DerivedTypes.h
@@ -512,16 +512,6 @@ class VectorType : public Type {
EltCnt.divideCoefficientBy(2));
}
- /// This static method returns a VectorType with quarter as many elements as the
- /// input type and the same element type.
- static VectorType *getQuarterElementsVectorType(VectorType *VTy) {
- auto EltCnt = VTy->getElementCount();
- assert(EltCnt.isKnownEven() &&
- "Cannot halve vector with odd number of elements.");
- return VectorType::get(VTy->getElementType(),
- EltCnt.divideCoefficientBy(4));
- }
-
/// This static method returns a VectorType with twice as many elements as the
/// input type and the same element type.
static VectorType *getDoubleElementsVectorType(VectorType *VTy) {
diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h
index e03e7e0bf50de..340c1c326d066 100644
--- a/llvm/include/llvm/IR/Intrinsics.h
+++ b/llvm/include/llvm/IR/Intrinsics.h
@@ -131,7 +131,6 @@ namespace Intrinsic {
ExtendArgument,
TruncArgument,
HalfVecArgument,
- QuarterVecArgument,
SameVecWidthArgument,
VecOfAnyPtrsToElt,
VecElementArgument,
@@ -161,7 +160,7 @@ namespace Intrinsic {
unsigned getArgumentNumber() const {
assert(Kind == Argument || Kind == ExtendArgument ||
- Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
+ Kind == TruncArgument || Kind == HalfVecArgument ||
Kind == SameVecWidthArgument || Kind == VecElementArgument ||
Kind == Subdivide2Argument || Kind == Subdivide4Argument ||
Kind == VecOfBitcastsToInt);
@@ -169,7 +168,7 @@ namespace Intrinsic {
}
ArgKind getArgumentKind() const {
assert(Kind == Argument || Kind == ExtendArgument ||
- Kind == TruncArgument || Kind == HalfVecArgument || Kind == QuarterVecArgument ||
+ Kind == TruncArgument || Kind == HalfVecArgument ||
Kind == SameVecWidthArgument ||
Kind == VecElementArgument || Kind == Subdivide2Argument ||
Kind == Subdivide4Argument || Kind == VecOfBitcastsToInt);
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index dad177e595341..16524ff974626 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -321,7 +321,6 @@ def IIT_I4 : IIT_Int<4, 58>;
def IIT_AARCH64_SVCOUNT : IIT_VT<aarch64svcount, 59>;
def IIT_V6 : IIT_Vec<6, 60>;
def IIT_V10 : IIT_Vec<10, 61>;
-def IIT_QUARTER_VEC_ARG : IIT_Base<62>;
}
defvar IIT_all_FixedTypes = !filter(iit, IIT_all,
@@ -458,9 +457,6 @@ class LLVMVectorElementType<int num> : LLVMMatchType<num, IIT_VEC_ELEMENT>;
class LLVMHalfElementsVectorType<int num>
: LLVMMatchType<num, IIT_HALF_VEC_ARG>;
-class LLVMQuarterElementsVectorType<int num>
- : LLVMMatchType<num, IIT_QUARTER_VEC_ARG>;
-
// Match the type of another intrinsic parameter that is expected to be a
// vector type (i.e. <N x iM>) but with each element subdivided to
// form a vector with more elements that are smaller than the original.
@@ -2611,7 +2607,7 @@ def int_experimental_vector_deinterleave2 : DefaultAttrsIntrinsic<[LLVMHalfEleme
//===-------------- Intrinsics to perform partial reduction ---------------===//
-def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[LLVMQuarterElementsVectorType<0>],
+def int_experimental_vector_partial_reduce_add : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
[llvm_anyvector_ty],
[IntrNoMem]>;
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index e9eebd5e35300..e66fe73425e86 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -1240,12 +1240,6 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
ArgInfo));
return;
}
- case IIT_QUARTER_VEC_ARG: {
- unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
- OutputTable.push_back(IITDescriptor::get(IITDescriptor::QuarterVecArgument,
- ArgInfo));
- return;
- }
case IIT_SAME_VEC_WIDTH_ARG: {
unsigned ArgInfo = (NextElt == Infos.size() ? 0 : Infos[NextElt++]);
OutputTable.push_back(IITDescriptor::get(IITDescriptor::SameVecWidthArgument,
@@ -1410,9 +1404,6 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
case IITDescriptor::HalfVecArgument:
return VectorType::getHalfElementsVectorType(cast<VectorType>(
Tys[D.getArgumentNumber()]));
- case IITDescriptor::QuarterVecArgument: {
- return VectorType::getQuarterElementsVectorType(cast<VectorType>(Tys[D.getArgumentNumber()]));
- }
case IITDescriptor::SameVecWidthArgument: {
Type *EltTy = DecodeFixedType(Infos, Tys, Context);
Type *Ty = Tys[D.getArgumentNumber()];
@@ -1628,13 +1619,6 @@ static bool matchIntrinsicType(
return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
VectorType::getHalfElementsVectorType(
cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
- case IITDescriptor::QuarterVecArgument: {
- if (D.getArgumentNumber() >= ArgTys.size())
- return IsDeferredCheck || DeferCheck(Ty);
- return !isa<VectorType>(ArgTys[D.getArgumentNumber()]) ||
- VectorType::getQuarterElementsVectorType(
- cast<VectorType>(ArgTys[D.getArgumentNumber()])) != Ty;
- }
case IITDescriptor::SameVecWidthArgument: {
if (D.getArgumentNumber() >= ArgTys.size()) {
// Defer check and subsequent check for the vector element type.
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1f37df061bbf7..f697122390b3b 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8653,7 +8653,7 @@ VPRecipeBase *VPRecipeBuilder::tryToCreatePartialReduction(
auto EC = ElementCount::getScalable(16);
if(std::find(Range.begin(), Range.end(), EC) == Range.end())
return nullptr;
- return new VPPartialReductionRecipe(*Instr, make_range(Operands.begin(), Operands.end()));
+ return new VPPartialReductionRecipe(*Instr, make_range(Operands.begin(), Operands.end()), 4);
}
return nullptr;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index 5a572ecb798d6..06fb7a7e3d399 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1941,15 +1941,16 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe {
class VPPartialReductionRecipe : public VPRecipeWithIRFlags {
unsigned Opcode;
+ unsigned Scale;
public:
template <typename IterT>
VPPartialReductionRecipe(Instruction &I,
- iterator_range<IterT> Operands) : VPRecipeWithIRFlags(
- VPDef::VPPartialReductionSC, Operands, I), Opcode(I.getOpcode())
+ iterator_range<IterT> Operands, unsigned Scale) : VPRecipeWithIRFlags(
+ VPDef::VPPartialReductionSC, Operands, I), Opcode(I.getOpcode()), Scale(Scale)
{}
~VPPartialReductionRecipe() override = default;
VPPartialReductionRecipe *clone() override {
- auto *R = new VPPartialReductionRecipe(*getUnderlyingInstr(), operands());
+ auto *R = new VPPartialReductionRecipe(*getUnderlyingInstr(), operands(), Scale);
R->transferFlags(*this);
return R;
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 905fa21d38252..8c4ce8789ecc7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -269,7 +269,8 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
assert(isa<ScalableVectorType>(Ops[0]->getType()) && "Type must be a scalable vector");
ScalableVectorType *FullTy = cast<ScalableVectorType>(Ops[0]->getType());
- Type *RetTy = ScalableVectorType::get(FullTy->getScalarType(), 4);
+ auto EC = FullTy->getElementCount();
+ Type *RetTy = ScalableVectorType::get(FullTy->getScalarType(), EC.divideCoefficientBy(Scale).getKnownMinValue());
Intrinsic::ID PartialIntrinsic = Intrinsic::not_intrinsic;
switch(Opcode) {
@@ -301,7 +302,7 @@ void VPPartialReductionRecipe::execute(VPTransformState &State) {
}
void VPPartialReductionRecipe::postInsertionOp() {
- cast<VPReductionPHIRecipe>(this->getOperand(1))->SetVFScaleFactor(4);
+ cast<VPReductionPHIRecipe>(this->getOperand(1))->SetVFScaleFactor(Scale);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll b/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
index 1eafd505b199e..7883cfc05a13b 100644
--- a/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
@@ -33,7 +33,7 @@ define void @dotp(ptr %a, ptr %b) #0 {
; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP25]], align 1
; CHECK-NEXT: [[TMP27:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
; CHECK-NEXT: [[TMP29:%.*]] = mul <vscale x 16 x i32> [[TMP27]], [[TMP19]]
-; CHECK-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP29]])
+; CHECK-NEXT: [[PARTIAL_REDUCE:%.*]] = call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 16 x i32> [[TMP29]])
; CHECK-NEXT: [[TMP14]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE]], [[VEC_PHI]]
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
>From 1b9858acba66a1db5eddf01e403a1b63049464d3 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy at arm.com>
Date: Thu, 30 May 2024 15:05:36 +0100
Subject: [PATCH 05/10] [LoopVectorizer] Add generic lowering of partial
reduction intrinsic
---
.../SelectionDAG/SelectionDAGBuilder.cpp | 21 +++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 0db484a5e06bc..39cf121802858 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7843,6 +7843,27 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
setValue(&I, Trunc);
return;
}
+ case Intrinsic::experimental_vector_partial_reduce_add: {
+ auto DL = getCurSDLoc();
+ auto ReducedTy = EVT::getEVT(I.getType());
+ auto OpNode = getValue(I.getOperand(0));
+ auto Index = DAG.getVectorIdxConstant(0, DL);
+ auto FullTy = OpNode.getValueType();
+
+ auto ResultVector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, Index});
+ unsigned ScaleFactor = FullTy.getVectorMinNumElements() / ReducedTy.getVectorMinNumElements();
+
+ for(unsigned i = 0; i < ScaleFactor; i++) {
+ auto SourceIndex = DAG.getVectorIdxConstant(i * ScaleFactor, DL);
+ auto TargetIndex = DAG.getVectorIdxConstant(i, DL);
+ auto N = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex});
+ N = DAG.getNode(ISD::VP_REDUCE_ADD, DL, ReducedTy.getScalarType(), N);
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {ResultVector, N, TargetIndex});
+ }
+
+ setValue(&I, ResultVector);
+ return;
+ }
case Intrinsic::experimental_cttz_elts: {
auto DL = getCurSDLoc();
SDValue Op = getValue(I.getOperand(0));
>From 78c0bbfd9316dc144b40f0952a2e1cbd656d8106 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy at arm.com>
Date: Wed, 5 Jun 2024 14:12:04 +0100
Subject: [PATCH 06/10] Commit of test files
---
.../CodeGen/AArch64/partial-reduce-sdot-ir.ll | 99 +++++++++++++++++++
...rtial-reduce-sdot.ll => partial-reduce.ll} | 0
2 files changed, 99 insertions(+)
create mode 100644 llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
rename llvm/test/CodeGen/AArch64/{partial-reduce-sdot.ll => partial-reduce.ll} (100%)
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll b/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
new file mode 100644
index 0000000000000..3519ba58b3df3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/partial-reduce-sdot-ir.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -passes="default<O3>" -force-vector-interleave=1 -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-none-unknown-elf"
+
+define void @dotp(ptr %out, ptr %a, ptr %b, i64 %wide.trip.count) #0 {
+; CHECK-LABEL: define void @dotp(
+; CHECK-SAME: ptr nocapture writeonly [[OUT:%.*]], ptr nocapture readonly [[A:%.*]], ptr nocapture readonly [[B:%.*]], i64 [[WIDE_TRIP_COUNT:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[WIDE_TRIP_COUNT]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 4
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 4
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP5:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 4
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
+; CHECK-NEXT: [[TMP8:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; CHECK-NEXT: [[TMP10:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-NEXT: [[TMP11:%.*]] = mul nuw nsw <vscale x 16 x i32> [[TMP10]], [[TMP8]]
+; CHECK-NEXT: [[TMP12]] = add <vscale x 16 x i32> [[TMP11]], [[VEC_PHI]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[TMP14:%.*]] = tail call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> [[TMP12]])
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY_PREHEADER]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: [[ACC_010_PH:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.cond.cleanup.loopexit:
+; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[TMP14]], [[MIDDLE_BLOCK]] ], [ [[ADD:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[TMP15:%.*]] = trunc i32 [[ADD_LCSSA]] to i8
+; CHECK-NEXT: store i8 [[TMP15]], ptr [[OUT]], align 1
+; CHECK-NEXT: ret void
+; CHECK: for.body:
+; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[ACC_010:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[ACC_010_PH]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP16:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP16]] to i32
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT: [[TMP17:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1
+; CHECK-NEXT: [[CONV3:%.*]] = zext i8 [[TMP17]] to i32
+; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[CONV3]], [[CONV]]
+; CHECK-NEXT: [[ADD]] = add i32 [[MUL]], [[ACC_010]]
+; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], 1
+; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+;
+entry:
+ br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+ %0 = trunc i32 %add to i8
+ store i8 %0, ptr %out, align 1
+ ret void
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+ %acc.010 = phi i32 [ 0, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr i8, ptr %a, i64 %indvars.iv
+ %1 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %1 to i32
+ %arrayidx2 = getelementptr i8, ptr %b, i64 %indvars.iv
+ %2 = load i8, ptr %arrayidx2, align 1
+ %conv3 = zext i8 %2 to i32
+ %mul = mul i32 %conv3, %conv
+ %add = add i32 %mul, %acc.010
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+
+; uselistorder directives
+ uselistorder i32 %add, { 1, 0 }
+}
+
+attributes #0 = { "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+v8a" }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll b/llvm/test/CodeGen/AArch64/partial-reduce.ll
similarity index 100%
rename from llvm/test/CodeGen/AArch64/partial-reduce-sdot.ll
rename to llvm/test/CodeGen/AArch64/partial-reduce.ll
>From a69f8fa0b4ec167d59d49e268442605a8da1aff8 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy at arm.com>
Date: Wed, 5 Jun 2024 14:12:58 +0100
Subject: [PATCH 07/10] Add generic decomposition of partial reduction
intrinsic
---
llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 5 +++++
llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 6 +++---
2 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 261f7e49e5c8c..2f50d54b19f03 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -942,6 +942,11 @@ END_TWO_BYTE_PACK()
inline const APInt &getAsAPIntVal() const;
const SDValue &getOperand(unsigned Num) const {
+ if(Num >= NumOperands) {
+ dbgs() << Num << ">=" << NumOperands << "\n";
+ printr(dbgs());
+ dbgs() << "\n";
+ }
assert(Num < NumOperands && "Invalid child # of SDNode!");
return OperandList[Num];
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 39cf121802858..fe46a49530a95 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7850,15 +7850,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
auto Index = DAG.getVectorIdxConstant(0, DL);
auto FullTy = OpNode.getValueType();
- auto ResultVector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, Index});
+ auto ResultVector = DAG.getSplat(ReducedTy, DL, DAG.getConstant(0, DL, ReducedTy.getScalarType()));
unsigned ScaleFactor = FullTy.getVectorMinNumElements() / ReducedTy.getVectorMinNumElements();
for(unsigned i = 0; i < ScaleFactor; i++) {
auto SourceIndex = DAG.getVectorIdxConstant(i * ScaleFactor, DL);
auto TargetIndex = DAG.getVectorIdxConstant(i, DL);
auto N = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ReducedTy, {OpNode, SourceIndex});
- N = DAG.getNode(ISD::VP_REDUCE_ADD, DL, ReducedTy.getScalarType(), N);
- DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {ResultVector, N, TargetIndex});
+ N = DAG.getNode(ISD::VECREDUCE_ADD, DL, ReducedTy.getScalarType(), N);
+ ResultVector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReducedTy, {ResultVector, N, TargetIndex});
}
setValue(&I, ResultVector);
>From 3a8b1c9bd6234853f45a0eab641f95332ad346d0 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy at arm.com>
Date: Wed, 5 Jun 2024 14:14:15 +0100
Subject: [PATCH 08/10] Add basic cost modeling for partial reductions
---
llvm/include/llvm/Analysis/TargetTransformInfo.h | 13 +++++++++++++
.../llvm/Analysis/TargetTransformInfoImpl.h | 7 +++++++
llvm/include/llvm/CodeGen/BasicTTIImpl.h | 7 +++++++
llvm/lib/Analysis/TargetTransformInfo.cpp | 6 ++++++
llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 14 +++++++++++++-
5 files changed, 46 insertions(+), 1 deletion(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 1c76821fe5e4a..29f4895a7ef53 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1516,6 +1516,10 @@ class TargetTransformInfo {
FastMathFlags FMF,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
+ InstructionCost getPartialReductionCost(
+ unsigned Opcode, bool IsUnsigned, VectorType *ResTy, VectorType *Ty,
+ FastMathFlags FMF, TargetCostKind CostKind = TCK_RecipThroughput) const;
+
/// \returns The cost of Intrinsic instructions. Analyses the real arguments.
/// Three cases are handled: 1. scalar instruction 2. vector instruction
/// 3. scalar instruction which is to be vectorized.
@@ -2078,6 +2082,9 @@ class TargetTransformInfo::Concept {
unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty,
FastMathFlags FMF,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) = 0;
+ virtual InstructionCost getPartialReductionCost(
+ unsigned Opcode, bool IsUnsigned, VectorType *ResTy, VectorType *Ty,
+ FastMathFlags FMF, TargetCostKind CostKind = TCK_RecipThroughput) = 0;
virtual InstructionCost getMulAccReductionCost(
bool IsUnsigned, Type *ResTy, VectorType *Ty,
TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) = 0;
@@ -2752,6 +2759,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
return Impl.getExtendedReductionCost(Opcode, IsUnsigned, ResTy, Ty, FMF,
CostKind);
}
+ InstructionCost getPartialReductionCost(unsigned Opcode, bool IsUnsigned,
+ VectorType *ResTy, VectorType *Ty, FastMathFlags FMF,
+ TargetCostKind CostKind = TCK_RecipThroughput) override {
+ return Impl.getPartialReductionCost(Opcode, IsUnsigned, ResTy, Ty, FMF,
+ CostKind);
+ }
InstructionCost
getMulAccReductionCost(bool IsUnsigned, Type *ResTy, VectorType *Ty,
TTI::TargetCostKind CostKind) override {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 4d5cd963e0926..22f137daa0634 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -791,6 +791,13 @@ class TargetTransformInfoImplBase {
return 1;
}
+ InstructionCost getPartialReductionCost(unsigned Opcode, bool IsUnsigned,
+ VectorType *ResTy, VectorType *Ty,
+ FastMathFlags FMF,
+ TTI::TargetCostKind CostKind) const {
+ return InstructionCost::getMax();
+ }
+
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
VectorType *Ty,
TTI::TargetCostKind CostKind) const {
diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
index 4a3a03dc5ad48..49274f216d58d 100644
--- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2555,6 +2555,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
return RedCost + ExtCost;
}
+ InstructionCost getPartialReductionCost(unsigned Opcode, bool IsUnsigned,
+ VectorType *ResTy, VectorType *Ty,
+ FastMathFlags FMF,
+ TTI::TargetCostKind CostKind) {
+ return InstructionCost::getMax();
+ }
+
InstructionCost getMulAccReductionCost(bool IsUnsigned, Type *ResTy,
VectorType *Ty,
TTI::TargetCostKind CostKind) {
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 33c899fe88999..72c93ca765a8f 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1154,6 +1154,12 @@ InstructionCost TargetTransformInfo::getExtendedReductionCost(
CostKind);
}
+InstructionCost TargetTransformInfo::getPartialReductionCost(
+ unsigned Opcode, bool IsUnsigned, VectorType *ResTy, VectorType *Ty,
+ FastMathFlags FMF, TargetCostKind CostKind) const {
+ return TTIImpl->getPartialReductionCost(Opcode, IsUnsigned, ResTy, Ty, FMF, CostKind);
+}
+
InstructionCost TargetTransformInfo::getMulAccReductionCost(
bool IsUnsigned, Type *ResTy, VectorType *Ty,
TTI::TargetCostKind CostKind) const {
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f697122390b3b..07141d87e0293 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8653,7 +8653,19 @@ VPRecipeBase *VPRecipeBuilder::tryToCreatePartialReduction(
auto EC = ElementCount::getScalable(16);
if(std::find(Range.begin(), Range.end(), EC) == Range.end())
return nullptr;
- return new VPPartialReductionRecipe(*Instr, make_range(Operands.begin(), Operands.end()), 4);
+
+ // Scale factor of 4 for sdot/udot.
+ unsigned Scale = 4;
+ VectorType* ResTy = ScalableVectorType::get(Instr->getType(), Scale);
+ VectorType* ValTy = ScalableVectorType::get(Instr->getType(), EC.getKnownMinValue());
+ using namespace llvm::PatternMatch;
+ bool IsUnsigned = match(Instr, m_Add(m_Mul(m_ZExt(m_Value()), m_ZExt(m_Value())), m_Value()));
+ auto RecipeCost = this->CM.TTI.getPartialReductionCost(Instr->getOpcode(), IsUnsigned, ResTy, ValTy, FastMathFlags::getFast());
+ // TODO replace with more informed cost check
+ if(RecipeCost == InstructionCost::getMax())
+ return nullptr;
+
+ return new VPPartialReductionRecipe(*Instr, make_range(Operands.begin(), Operands.end()), Scale);
}
return nullptr;
}
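Because the default TargetTransformInfo implementation of getPartialReductionCost returns InstructionCost::getMax() and tryToCreatePartialReduction bails out on that value, a target has to provide its own cost hook before any partial-reduction recipe is created. Below is a hypothetical sketch of such an opt-in override; it is not part of this patch series, and the AArch64 subtarget check and the returned cost are placeholders only.

  // Hypothetical target override (not in this series): report a finite cost
  // for i32 add-based partial reductions so the vectorizer accepts the recipe.
  InstructionCost AArch64TTIImpl::getPartialReductionCost(
      unsigned Opcode, bool IsUnsigned, VectorType *ResTy, VectorType *Ty,
      FastMathFlags FMF, TTI::TargetCostKind CostKind) {
    // Placeholder heuristic: only add reductions producing i32 lanes on SVE
    // subtargets are treated as cheap; everything else keeps the
    // "don't create a partial reduction" default.
    if (Opcode == Instruction::Add && ST->hasSVE() &&
        ResTy->getElementType()->isIntegerTy(32))
      return InstructionCost(1);
    return InstructionCost::getMax();
  }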
>From ac9f434a7c00288f274ed5781740229360654f06 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy at arm.com>
Date: Wed, 5 Jun 2024 14:14:55 +0100
Subject: [PATCH 09/10] Add missing support for sign-extends
---
.../lib/Transforms/Vectorize/LoopVectorize.cpp | 18 ++++++++++++++----
1 file changed, 14 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 07141d87e0293..03c5359d3d6d1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2215,6 +2215,9 @@ static void getPartialReductionInstrChain(Instruction *Instr, SmallVector<Value*
}
+/// Checks if the given instruction is the root of a partial reduction chain
+///
+/// Note: This currently only supports udot/sdot chains
/// @param Instr The root instruction to scan
static bool isInstrPartialReduction(Instruction *Instr) {
Value *ExpectedPhi;
@@ -2224,12 +2227,12 @@ static bool isInstrPartialReduction(Instruction *Instr) {
using namespace llvm::PatternMatch;
auto Pattern = m_Add(
m_OneUse(m_Mul(
- m_OneUse(m_ZExt(
+ m_OneUse(m_ZExtOrSExt(
m_OneUse(m_Load(
m_GEP(
m_Value(A),
m_Value(InductionA)))))),
- m_OneUse(m_ZExt(
+ m_OneUse(m_ZExtOrSExt(
m_OneUse(m_Load(
m_GEP(
m_Value(B),
@@ -2248,8 +2251,13 @@ static bool isInstrPartialReduction(Instruction *Instr) {
}
Instruction *Mul = cast<Instruction>(Instr->getOperand(0));
- Instruction *Ext0 = cast<ZExtInst>(Mul->getOperand(0));
- Instruction *Ext1 = cast<ZExtInst>(Mul->getOperand(1));
+ Instruction *Ext0 = cast<CastInst>(Mul->getOperand(0));
+ Instruction *Ext1 = cast<CastInst>(Mul->getOperand(1));
+
+ if(Ext0->getOpcode() != Ext1->getOpcode()) {
+ LLVM_DEBUG(dbgs() << "Extends aren't of the same type, cannot create a partial reduction.\n");
+ return false;
+ }
// Check that the extends extend to i32
if(!Ext0->getType()->isIntegerTy(32) || !Ext1->getType()->isIntegerTy(32)) {
@@ -8650,6 +8658,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreatePartialReduction(
VFRange &Range, Instruction *Instr, ArrayRef<VPValue *> Operands) {
if(isInstrPartialReduction(Instr)) {
+ // Restricting this case to 16x means that, using a scale of 4, we avoid
+ // trying to generate illegal types such as <vscale x 2 x i32>
auto EC = ElementCount::getScalable(16);
if(std::find(Range.begin(), Range.end(), EC) == Range.end())
return nullptr;
>From a18d9b5c1cb9ee25670cf09ad39664e1d91f5275 Mon Sep 17 00:00:00 2001
From: Nicholas Guy <nicholas.guy at arm.com>
Date: Wed, 5 Jun 2024 14:27:39 +0100
Subject: [PATCH 10/10] Remove debug statements
---
llvm/include/llvm/CodeGen/SelectionDAGNodes.h | 5 -----
1 file changed, 5 deletions(-)
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
index 2f50d54b19f03..261f7e49e5c8c 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -942,11 +942,6 @@ END_TWO_BYTE_PACK()
inline const APInt &getAsAPIntVal() const;
const SDValue &getOperand(unsigned Num) const {
- if(Num >= NumOperands) {
- dbgs() << Num << ">=" << NumOperands << "\n";
- printr(dbgs());
- dbgs() << "\n";
- }
assert(Num < NumOperands && "Invalid child # of SDNode!");
return OperandList[Num];
}