[llvm] 0e6f0b7 - [RISCV] Add cost model for fixed broadcast shuffle
ShihPo Hung via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 30 05:01:01 PST 2022
Author: ShihPo Hung
Date: 2022-11-30T04:58:52-08:00
New Revision: 0e6f0b7cc38391f3365a862266a8aef50d093135
URL: https://github.com/llvm/llvm-project/commit/0e6f0b7cc38391f3365a862266a8aef50d093135
DIFF: https://github.com/llvm/llvm-project/commit/0e6f0b7cc38391f3365a862266a8aef50d093135.diff
LOG: [RISCV] Add cost model for fixed broadcast shuffle
This patch adds basic costs for fixed-vector broadcast shuffles in order to enable SLP vectorization.
It also adds `getLMULCost`, which accounts for the reciprocal throughput at different LMUL values.
Reviewed By: reames
Differential Revision: https://reviews.llvm.org/D137276
Added:
llvm/test/Analysis/CostModel/RISCV/fixed-shuffle-broadcast.ll
Modified:
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
Removed:
################################################################################
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 3d39789c4d071..28a8be7631fd9 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -31,6 +31,27 @@ static cl::opt<unsigned> SLPMaxVF(
"SLP vectorizer. Defaults to 1 which disables SLP."),
cl::init(1), cl::Hidden);
+InstructionCost RISCVTTIImpl::getLMULCost(MVT VT) {
+ // TODO: This assumes a reciprocal throughput of 1 for LMUL_1; the actual
+ // value is implementation-defined.
+ if (!VT.isVector())
+ return InstructionCost::getInvalid(); // No LMUL cost for non-vector types.
+ unsigned Cost;
+ if (VT.isScalableVector()) {
+ unsigned LMul;
+ bool Fractional;
+ std::tie(LMul, Fractional) =
+ RISCVVType::decodeVLMUL(RISCVTargetLowering::getLMUL(VT));
+ if (Fractional)
+ Cost = 1; // Fractional LMUL is costed the same as LMUL_1.
+ else
+ Cost = LMul; // Integral LMUL: the cost equals the decoded LMUL value.
+ } else { // Non-scalable vector: size in bits divided by the real min VLEN.
+ Cost = VT.getSizeInBits() / ST->getRealMinVLen();
+ }
+ return std::max<unsigned>(Cost, 1); // Clamp so the result is never below 1.
+}
+
InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy() &&
@@ -255,6 +276,44 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
}
}
+ if (isa<FixedVectorType>(Tp) && Kind == TargetTransformInfo::SK_Broadcast) {
+ std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
+ bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
+ Instruction::InsertElement);
+ if (LT.second.getScalarSizeInBits() == 1) {
+ if (HasScalar) {
+ // Example sequence:
+ // andi a0, a0, 1
+ // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
+ // vmv.v.x v8, a0
+ // vmsne.vi v0, v8, 0
+ return LT.first * getLMULCost(LT.second) * 3;
+ }
+ // Example sequence:
+ // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
+ // vmv.v.i v8, 0
+ // vmerge.vim v8, v8, 1, v0
+ // vmv.x.s a0, v8
+ // andi a0, a0, 1
+ // vmv.v.x v8, a0
+ // vmsne.vi v0, v8, 0
+
+ return LT.first * getLMULCost(LT.second) * 6;
+ }
+
+ if (HasScalar) {
+ // Example sequence:
+ // vmv.v.x v8, a0
+ return LT.first * getLMULCost(LT.second);
+ }
+
+ // Example sequence:
+ // vrgather.vi v9, v8, 0
+ // TODO: vrgather could be slower than vmv.v.x. It is
+ // implementation-dependent.
+ return LT.first * getLMULCost(LT.second);
+ }
+
return BaseT::getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp);
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index bbd903257d86d..36dd86992263d 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -46,6 +46,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
/// the true cost significantly if getVScaleForTuning is wildly off for the
/// actual target hardware.
unsigned getEstimatedVLFor(VectorType *Ty);
+
+ /// Return the cost of LMUL. The larger the LMUL, the higher the cost.
+ InstructionCost getLMULCost(MVT VT);
+
public:
explicit RISCVTTIImpl(const RISCVTargetMachine *TM, const Function &F)
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
diff --git a/llvm/test/Analysis/CostModel/RISCV/fixed-shuffle-broadcast.ll b/llvm/test/Analysis/CostModel/RISCV/fixed-shuffle-broadcast.ll
new file mode 100644
index 0000000000000..916df3a0a44b3
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/fixed-shuffle-broadcast.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+experimental-zvfh | FileCheck %s
+
+define void @broadcast_fixed() #0{
+; CHECK-LABEL: 'broadcast_fixed'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %zero = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %3 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = shufflevector <32 x half> undef, <32 x half> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %5 = shufflevector <64 x half> undef, <64 x half> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %9 = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %10 = shufflevector <32 x float> undef, <32 x float> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %12 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %13 = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %14 = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %18 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %19 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %20 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %21 = shufflevector <128 x i8> undef, <128 x i8> undef, <128 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %23 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %24 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %25 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %26 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %27 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %29 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %31 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %34 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %36 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %37 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %38 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %39 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %40 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %41 = shufflevector <32 x i1> undef, <32 x i1> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %42 = shufflevector <64 x i1> undef, <64 x i1> undef, <64 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %43 = shufflevector <128 x i1> undef, <128 x i1> undef, <128 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %ins1 = insertelement <128 x i1> poison, i1 poison, i32 0
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %44 = shufflevector <128 x i1> %ins1, <128 x i1> poison, <128 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %ins2 = insertelement <2 x i8> poison, i8 3, i32 0
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %45 = shufflevector <2 x i8> %ins2, <2 x i8> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %zero = shufflevector <2 x half> undef, <2 x half> undef, <2 x i32> zeroinitializer
+ %1 = shufflevector <4 x half> undef, <4 x half> undef, <4 x i32> zeroinitializer
+ %2 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
+ %3 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
+ %4 = shufflevector <32 x half> undef, <32 x half> undef, <32 x i32> zeroinitializer
+ %5 = shufflevector <64 x half> undef, <64 x half> undef, <64 x i32> zeroinitializer
+
+ %6 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
+ %7 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
+ %8 = shufflevector <8 x float> undef, <8 x float> undef, <8 x i32> zeroinitializer
+ %9 = shufflevector <16 x float> undef, <16 x float> undef, <16 x i32> zeroinitializer
+ %10 = shufflevector <32 x float> undef, <32 x float> undef, <32 x i32> zeroinitializer
+
+ %11 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
+ %12 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
+ %13 = shufflevector <8 x double> undef, <8 x double> undef, <8 x i32> zeroinitializer
+ %14 = shufflevector <16 x double> undef, <16 x double> undef, <16 x i32> zeroinitializer
+
+ %15 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> zeroinitializer
+ %16 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> zeroinitializer
+ %17 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> zeroinitializer
+ %18 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+ %19 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer
+ %20 = shufflevector <64 x i8> undef, <64 x i8> undef, <64 x i32> zeroinitializer
+ %21 = shufflevector <128 x i8> undef, <128 x i8> undef, <128 x i32> zeroinitializer
+
+ %22 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> zeroinitializer
+ %23 = shufflevector <4 x i16> undef, <4 x i16> undef, <4 x i32> zeroinitializer
+ %24 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+ %25 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
+ %26 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> zeroinitializer
+ %27 = shufflevector <64 x i16> undef, <64 x i16> undef, <64 x i32> zeroinitializer
+
+ %28 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> zeroinitializer
+ %29 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+ %30 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
+ %31 = shufflevector <16 x i32> undef, <16 x i32> undef, <16 x i32> zeroinitializer
+ %32 = shufflevector <32 x i32> undef, <32 x i32> undef, <32 x i32> zeroinitializer
+
+ %33 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
+ %34 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+ %35 = shufflevector <8 x i64> undef, <8 x i64> undef, <8 x i32> zeroinitializer
+ %36 = shufflevector <16 x i64> undef, <16 x i64> undef, <16 x i32> zeroinitializer
+
+ %37 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> zeroinitializer
+ %38 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> zeroinitializer
+ %39 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> zeroinitializer
+ %40 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> zeroinitializer
+ %41 = shufflevector <32 x i1> undef, <32 x i1> undef, <32 x i32> zeroinitializer
+ %42 = shufflevector <64 x i1> undef, <64 x i1> undef, <64 x i32> zeroinitializer
+ %43 = shufflevector <128 x i1> undef, <128 x i1> undef, <128 x i32> zeroinitializer
+
+ %ins1 = insertelement <128 x i1> poison, i1 poison, i32 0
+ %44 = shufflevector <128 x i1> %ins1, <128 x i1> poison, <128 x i32> zeroinitializer
+
+ %ins2 = insertelement <2 x i8> poison, i8 3, i32 0
+ %45 = shufflevector <2 x i8> %ins2, <2 x i8> undef, <2 x i32> zeroinitializer
+ ret void
+}
More information about the llvm-commits
mailing list