[llvm] [RISCV][TTI] Improve SiFive7 reduction cost (PR #90951)
Shih-Po Hung via llvm-commits
llvm-commits at lists.llvm.org
Sun May 5 20:02:00 PDT 2024
https://github.com/arcbbb updated https://github.com/llvm/llvm-project/pull/90951
>From 38228ba04527be100ad7436391bd1dfac0812893 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Fri, 3 May 2024 01:10:50 -0700
Subject: [PATCH 1/2] [RISCV][TTI] Improve SiFive7 reduction cost
The cost calculation for the llvm.vector.reduce.* intrinsics factors in
the costs of the VRED, VFRED, VMV, and VFMV instructions.
This patch implements a target-specific cost function for SiFive7.
If the target-specific cost function provides a cost for an opcode, that
cost takes precedence over the default cost.
Co-authored-by: Elvis Wang <elvis.wang at sifive.com>
---
.../Target/RISCV/RISCVTargetTransformInfo.cpp | 60 ++
.../CostModel/RISCV/sifive-x280-reduce.ll | 675 ++++++++++++++++++
2 files changed, 735 insertions(+)
create mode 100644 llvm/test/Analysis/CostModel/RISCV/sifive-x280-reduce.ll
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index ce26e61880fd05..81fc3e81fd1d0f 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -34,6 +34,54 @@ static cl::opt<unsigned> SLPMaxVF(
"exclusively by SLP vectorizer."),
cl::Hidden);
+static std::optional<InstructionCost>
+getSiFiveX280RVVCost(unsigned Op, MVT VT, TTI::TargetCostKind CostKind) {
+ std::optional<InstructionCost> Cost; // Empty means "no override for Op".
+ unsigned VScale = 8; // Scale for scalable types; presumably X280's vscale -- TODO confirm.
+ switch (Op) {
+ default:
+ Cost = std::nullopt;
+ break;
+ case RISCV::VREDMAX_VS:
+ case RISCV::VREDMIN_VS:
+ case RISCV::VREDMAXU_VS:
+ case RISCV::VREDMINU_VS:
+ case RISCV::VREDSUM_VS:
+ case RISCV::VREDAND_VS:
+ case RISCV::VREDOR_VS:
+ case RISCV::VREDXOR_VS:
+ case RISCV::VFREDMAX_VS:
+ case RISCV::VFREDMIN_VS:
+ case RISCV::VFREDUSUM_VS: {
+ unsigned VL = VT.getVectorMinNumElements();
+ if (!VT.isFixedLengthVector())
+ VL *= VScale;
+ // For small VL (<= 32), look up the cost in a table indexed by ceil(VL / 4)
+ // for accurate estimation; larger VL uses the log2-based formula below.
+ unsigned LookUpSiFive7ReduceLatency[] = {0, 20, 27, 32, 34, 38, 40, 41, 42};
+ if (VL <= 32) {
+ Cost = LookUpSiFive7ReduceLatency[(VL + 3) >> 2];
+ break;
+ }
+ Cost = 6 + 7 * Log2_32_Ceil(VL); // VL > 32: grows with ceil(log2(VL)).
+ break;
+ }
+ case RISCV::VFREDOSUM_VS: {
+ unsigned VL = VT.getVectorMinNumElements();
+ if (!VT.isFixedLengthVector())
+ VL *= VScale;
+ Cost = VL * 6; // Linear in VL, unlike the table/log2 model above.
+ break;
+ }
+ case RISCV::VMV_X_S:
+ case RISCV::VFMV_F_S:
+ case RISCV::VCPOP_M:
+ // Vector-to-scalar communication: flat cost, independent of VT.
+ Cost = 8;
+ }
+ return Cost;
+}
+
InstructionCost
RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
TTI::TargetCostKind CostKind) {
@@ -43,11 +91,23 @@ RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
size_t NumInstr = OpCodes.size();
if (CostKind == TTI::TCK_CodeSize)
return NumInstr;
+
+ std::optional<InstructionCost> (*GetTargetCost)(
+ unsigned, MVT, TTI::TargetCostKind) = nullptr;
+ if (ST->getProcFamily() == RISCVSubtarget::SiFive7)
+ GetTargetCost = getSiFiveX280RVVCost;
InstructionCost LMULCost = TLI->getLMULCost(VT);
if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
return LMULCost * NumInstr;
InstructionCost Cost = 0;
for (auto Op : OpCodes) {
+ std::optional<InstructionCost> OverrideCost =
+ GetTargetCost ? GetTargetCost(Op, VT, CostKind) : std::nullopt;
+ if (OverrideCost) {
+ Cost += *OverrideCost;
+ continue;
+ }
+
switch (Op) {
case RISCV::VRGATHER_VI:
Cost += TLI->getVRGatherVICost(VT);
diff --git a/llvm/test/Analysis/CostModel/RISCV/sifive-x280-reduce.ll b/llvm/test/Analysis/CostModel/RISCV/sifive-x280-reduce.ll
new file mode 100644
index 00000000000000..01632f9ab80643
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/sifive-x280-reduce.ll
@@ -0,0 +1,675 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=riscv64 -mcpu=sifive-x280 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefix=THROUGHPUT
+; RUN: opt < %s -mtriple=riscv64 -mcpu=sifive-x280 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck %s --check-prefix=SIZE
+
+declare i1 @llvm.vector.reduce.add.v16i1(<16 x i1>)
+declare i1 @llvm.vector.reduce.add.v32i1(<32 x i1>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.add.v256i8(<256 x i8>)
+declare i8 @llvm.vector.reduce.add.v512i8(<512 x i8>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>)
+declare i16 @llvm.vector.reduce.add.v256i16(<256 x i16>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
+declare i32 @llvm.vector.reduce.add.v128i32(<128 x i32>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>)
+declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>)
+
+define void @reduce_add() {
+; THROUGHPUT-LABEL: 'reduce_add'
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %1 = call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %2 = call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %4 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %5 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %6 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %7 = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %8 = call i8 @llvm.vector.reduce.add.v512i8(<512 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %9 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %10 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %11 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %12 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %13 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %14 = call i16 @llvm.vector.reduce.add.v256i16(<256 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %17 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %18 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %19 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %20 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %21 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %22 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %23 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %24 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %25 = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %26 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %27 = call i1 @llvm.vector.reduce.add.nxv1i1(<vscale x 1 x i1> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %28 = call i1 @llvm.vector.reduce.add.nxv2i1(<vscale x 2 x i1> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %29 = call i1 @llvm.vector.reduce.add.nxv4i1(<vscale x 4 x i1> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %30 = call i1 @llvm.vector.reduce.add.nxv8i1(<vscale x 8 x i1> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %31 = call i1 @llvm.vector.reduce.add.nxv16i1(<vscale x 16 x i1> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %32 = call i8 @llvm.vector.reduce.add.nxv1i8(<vscale x 1 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %33 = call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %34 = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %35 = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %36 = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %37 = call i8 @llvm.vector.reduce.add.nxv32i8(<vscale x 32 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %38 = call i8 @llvm.vector.reduce.add.nxv64i8(<vscale x 64 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %39 = call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %40 = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %41 = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %42 = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %43 = call i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %44 = call i16 @llvm.vector.reduce.add.nxv32i16(<vscale x 32 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %45 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %46 = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %47 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %48 = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %49 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %50 = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %51 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %52 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %53 = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'reduce_add'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %1 = call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %2 = call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %5 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %8 = call i8 @llvm.vector.reduce.add.v512i8(<512 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %9 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %10 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %11 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %12 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %13 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %14 = call i16 @llvm.vector.reduce.add.v256i16(<256 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %17 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %18 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %19 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %20 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %21 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %22 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %23 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %24 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %25 = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %26 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %27 = call i1 @llvm.vector.reduce.add.nxv1i1(<vscale x 1 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %28 = call i1 @llvm.vector.reduce.add.nxv2i1(<vscale x 2 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call i1 @llvm.vector.reduce.add.nxv4i1(<vscale x 4 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %30 = call i1 @llvm.vector.reduce.add.nxv8i1(<vscale x 8 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %31 = call i1 @llvm.vector.reduce.add.nxv16i1(<vscale x 16 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %32 = call i8 @llvm.vector.reduce.add.nxv1i8(<vscale x 1 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %33 = call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %34 = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %35 = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %36 = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %37 = call i8 @llvm.vector.reduce.add.nxv32i8(<vscale x 32 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %38 = call i8 @llvm.vector.reduce.add.nxv64i8(<vscale x 64 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %39 = call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %40 = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %41 = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %42 = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %43 = call i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %44 = call i16 @llvm.vector.reduce.add.nxv32i16(<vscale x 32 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %45 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %46 = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %47 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %48 = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %49 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %50 = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %51 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %52 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %53 = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> undef)
+ call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
+ call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+ call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+ call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+ call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+ call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> undef)
+ call i8 @llvm.vector.reduce.add.v512i8(<512 x i8> undef)
+ call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+ call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+ call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+ call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+ call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
+ call i16 @llvm.vector.reduce.add.v256i16(<256 x i16> undef)
+ call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+ call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+ call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+ call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+ call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
+ call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
+ call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+ call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+ call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+ call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+ call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
+ call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
+
+ call i1 @llvm.vector.reduce.add.nxv1i1(<vscale x 1 x i1> undef)
+ call i1 @llvm.vector.reduce.add.nxv2i1(<vscale x 2 x i1> undef)
+ call i1 @llvm.vector.reduce.add.nxv4i1(<vscale x 4 x i1> undef)
+ call i1 @llvm.vector.reduce.add.nxv8i1(<vscale x 8 x i1> undef)
+ call i1 @llvm.vector.reduce.add.nxv16i1(<vscale x 16 x i1> undef)
+ call i8 @llvm.vector.reduce.add.nxv1i8(<vscale x 1 x i8> undef)
+ call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> undef)
+ call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> undef)
+ call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> undef)
+ call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> undef)
+ call i8 @llvm.vector.reduce.add.nxv32i8(<vscale x 32 x i8> undef)
+ call i8 @llvm.vector.reduce.add.nxv64i8(<vscale x 64 x i8> undef)
+ call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> undef)
+ call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> undef)
+ call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> undef)
+ call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> undef)
+ call i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16> undef)
+ call i16 @llvm.vector.reduce.add.nxv32i16(<vscale x 32 x i16> undef)
+ call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
+ call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> undef)
+ call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> undef)
+ call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> undef)
+ call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> undef)
+ call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> undef)
+ call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> undef)
+ call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> undef)
+ call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> undef)
+ ret void
+}
+
+declare half @llvm.vector.reduce.fadd.v1f16(half, <1 x half>)
+declare half @llvm.vector.reduce.fadd.v2f16(half, <2 x half>)
+declare half @llvm.vector.reduce.fadd.v4f16(half, <4 x half>)
+declare half @llvm.vector.reduce.fadd.v8f16(half, <8 x half>)
+declare half @llvm.vector.reduce.fadd.v16f16(half, <16 x half>)
+declare half @llvm.vector.reduce.fadd.v32f16(half, <32 x half>)
+declare half @llvm.vector.reduce.fadd.v64f16(half, <64 x half>)
+declare half @llvm.vector.reduce.fadd.v128f16(half, <128 x half>)
+declare half @llvm.vector.reduce.fadd.v256f16(half, <256 x half>)
+declare float @llvm.vector.reduce.fadd.v1f32(float, <1 x float>)
+declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
+declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
+declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>)
+declare float @llvm.vector.reduce.fadd.v32f32(float, <32 x float>)
+declare float @llvm.vector.reduce.fadd.v64f32(float, <64 x float>)
+declare float @llvm.vector.reduce.fadd.v128f32(float, <128 x float>)
+declare double @llvm.vector.reduce.fadd.v1f64(double, <1 x double>)
+declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
+declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>)
+declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>)
+declare double @llvm.vector.reduce.fadd.v32f64(double, <32 x double>)
+declare double @llvm.vector.reduce.fadd.v64f64(double, <64 x double>)
+
+define void @ordered_reduce_fadd() {
+; THROUGHPUT-LABEL: 'ordered_reduce_fadd'
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %1 = call half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %2 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %3 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %4 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 105 for instruction: %5 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 201 for instruction: %6 = call half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 393 for instruction: %7 = call half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 777 for instruction: %8 = call half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1545 for instruction: %9 = call half @llvm.vector.reduce.fadd.v256f16(half 0xH0000, <256 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %10 = call half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, <vscale x 1 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 105 for instruction: %11 = call half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, <vscale x 2 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 201 for instruction: %12 = call half @llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, <vscale x 4 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 393 for instruction: %13 = call half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 777 for instruction: %14 = call half @llvm.vector.reduce.fadd.nxv16f16(half 0xH0000, <vscale x 16 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 1545 for instruction: %15 = call half @llvm.vector.reduce.fadd.nxv32f16(half 0xH0000, <vscale x 32 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %16 = call float @llvm.vector.reduce.fadd.v1f32(float 0.000000e+00, <1 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %17 = call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %18 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %19 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 105 for instruction: %20 = call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 201 for instruction: %21 = call float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 393 for instruction: %22 = call float @llvm.vector.reduce.fadd.v64f32(float 0.000000e+00, <64 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 777 for instruction: %23 = call float @llvm.vector.reduce.fadd.v128f32(float 0.000000e+00, <128 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %24 = call float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 105 for instruction: %25 = call float @llvm.vector.reduce.fadd.nxv2f32(float 0.000000e+00, <vscale x 2 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 201 for instruction: %26 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 393 for instruction: %27 = call float @llvm.vector.reduce.fadd.nxv8f32(float 0.000000e+00, <vscale x 8 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 777 for instruction: %28 = call float @llvm.vector.reduce.fadd.nxv16f32(float 0.000000e+00, <vscale x 16 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %29 = call double @llvm.vector.reduce.fadd.v1f64(double 0.000000e+00, <1 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %30 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 33 for instruction: %31 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %32 = call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 105 for instruction: %33 = call double @llvm.vector.reduce.fadd.v16f64(double 0.000000e+00, <16 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 201 for instruction: %34 = call double @llvm.vector.reduce.fadd.v32f64(double 0.000000e+00, <32 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 393 for instruction: %35 = call double @llvm.vector.reduce.fadd.v64f64(double 0.000000e+00, <64 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 105 for instruction: %36 = call double @llvm.vector.reduce.fadd.v16f64(double 0.000000e+00, <16 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 201 for instruction: %37 = call double @llvm.vector.reduce.fadd.v32f64(double 0.000000e+00, <32 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 393 for instruction: %38 = call double @llvm.vector.reduce.fadd.v64f64(double 0.000000e+00, <64 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 777 for instruction: %39 = call double @llvm.vector.reduce.fadd.v128f64(double 0.000000e+00, <128 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'ordered_reduce_fadd'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %1 = call half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %5 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %8 = call half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %9 = call half @llvm.vector.reduce.fadd.v256f16(half 0xH0000, <256 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %10 = call half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, <vscale x 1 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %11 = call half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, <vscale x 2 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %12 = call half @llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, <vscale x 4 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %13 = call half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %14 = call half @llvm.vector.reduce.fadd.nxv16f16(half 0xH0000, <vscale x 16 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %15 = call half @llvm.vector.reduce.fadd.nxv32f16(half 0xH0000, <vscale x 32 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %16 = call float @llvm.vector.reduce.fadd.v1f32(float 0.000000e+00, <1 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %17 = call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %18 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %19 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %20 = call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %21 = call float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %22 = call float @llvm.vector.reduce.fadd.v64f32(float 0.000000e+00, <64 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %23 = call float @llvm.vector.reduce.fadd.v128f32(float 0.000000e+00, <128 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %24 = call float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %25 = call float @llvm.vector.reduce.fadd.nxv2f32(float 0.000000e+00, <vscale x 2 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %26 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %27 = call float @llvm.vector.reduce.fadd.nxv8f32(float 0.000000e+00, <vscale x 8 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %28 = call float @llvm.vector.reduce.fadd.nxv16f32(float 0.000000e+00, <vscale x 16 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %29 = call double @llvm.vector.reduce.fadd.v1f64(double 0.000000e+00, <1 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %30 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %31 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %32 = call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %33 = call double @llvm.vector.reduce.fadd.v16f64(double 0.000000e+00, <16 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %34 = call double @llvm.vector.reduce.fadd.v32f64(double 0.000000e+00, <32 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %35 = call double @llvm.vector.reduce.fadd.v64f64(double 0.000000e+00, <64 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %36 = call double @llvm.vector.reduce.fadd.v16f64(double 0.000000e+00, <16 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %37 = call double @llvm.vector.reduce.fadd.v32f64(double 0.000000e+00, <32 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %38 = call double @llvm.vector.reduce.fadd.v64f64(double 0.000000e+00, <64 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %39 = call double @llvm.vector.reduce.fadd.v128f64(double 0.000000e+00, <128 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call half @llvm.vector.reduce.fadd.v1f16(half 0.0, <1 x half> undef)
+ call half @llvm.vector.reduce.fadd.v2f16(half 0.0, <2 x half> undef)
+ call half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef)
+ call half @llvm.vector.reduce.fadd.v8f16(half 0.0, <8 x half> undef)
+ call half @llvm.vector.reduce.fadd.v16f16(half 0.0, <16 x half> undef)
+ call half @llvm.vector.reduce.fadd.v32f16(half 0.0, <32 x half> undef)
+ call half @llvm.vector.reduce.fadd.v64f16(half 0.0, <64 x half> undef)
+ call half @llvm.vector.reduce.fadd.v128f16(half 0.0, <128 x half> undef)
+ call half @llvm.vector.reduce.fadd.v256(half 0.0, <256 x half> undef)
+ call half @llvm.vector.reduce.fadd.nxv1f16(half 0.0, <vscale x 1 x half> undef)
+ call half @llvm.vector.reduce.fadd.nxv2f16(half 0.0, <vscale x 2 x half> undef)
+ call half @llvm.vector.reduce.fadd.nxv4f16(half 0.0, <vscale x 4 x half> undef)
+ call half @llvm.vector.reduce.fadd.nxv8f16(half 0.0, <vscale x 8 x half> undef)
+ call half @llvm.vector.reduce.fadd.nxv16f16(half 0.0, <vscale x 16 x half> undef)
+ call half @llvm.vector.reduce.fadd.nxv32f16(half 0.0, <vscale x 32 x half> undef)
+ call float @llvm.vector.reduce.fadd.v1f32(float 0.0, <1 x float> undef)
+ call float @llvm.vector.reduce.fadd.v2f32(float 0.0, <2 x float> undef)
+ call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
+ call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
+ call float @llvm.vector.reduce.fadd.v16f32(float 0.0, <16 x float> undef)
+ call float @llvm.vector.reduce.fadd.v32f32(float 0.0, <32 x float> undef)
+ call float @llvm.vector.reduce.fadd.v64f32(float 0.0, <64 x float> undef)
+ call float @llvm.vector.reduce.fadd.v128f32(float 0.0, <128 x float> undef)
+ call float @llvm.vector.reduce.fadd.nxv1f32(float 0.0, <vscale x 1 x float> undef)
+ call float @llvm.vector.reduce.fadd.nxv2f32(float 0.0, <vscale x 2 x float> undef)
+ call float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> undef)
+ call float @llvm.vector.reduce.fadd.nxv8f32(float 0.0, <vscale x 8 x float> undef)
+ call float @llvm.vector.reduce.fadd.nxv16f32(float 0.0, <vscale x 16 x float> undef)
+ call double @llvm.vector.reduce.fadd.v1f64(double 0.0, <1 x double> undef)
+ call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
+ call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
+ call double @llvm.vector.reduce.fadd.v8f64(double 0.0, <8 x double> undef)
+ call double @llvm.vector.reduce.fadd.v16f64(double 0.0, <16 x double> undef)
+ call double @llvm.vector.reduce.fadd.v32f64(double 0.0, <32 x double> undef)
+ call double @llvm.vector.reduce.fadd.v64f64(double 0.0, <64 x double> undef)
+ call double @llvm.vector.reduce.fadd.v16f64(double 0.0, <16 x double> undef)
+ call double @llvm.vector.reduce.fadd.v32f64(double 0.0, <32 x double> undef)
+ call double @llvm.vector.reduce.fadd.v64f64(double 0.0, <64 x double> undef)
+ call double @llvm.vector.reduce.fadd.v128f64(double 0.0, <128 x double> undef)
+ ret void
+}
+
+define void @fast_reduce_fadd() {
+; THROUGHPUT-LABEL: 'fast_reduce_fadd'
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %1 = call fast half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %2 = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %3 = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %4 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %5 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %6 = call fast half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %7 = call fast half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %8 = call fast half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %9 = call fast half @llvm.vector.reduce.fadd.v256f16(half 0xH0000, <256 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %10 = call fast half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, <vscale x 1 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %11 = call fast half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, <vscale x 2 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %12 = call fast half @llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, <vscale x 4 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %13 = call fast half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %14 = call fast half @llvm.vector.reduce.fadd.nxv16f16(half 0xH0000, <vscale x 16 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %15 = call fast half @llvm.vector.reduce.fadd.nxv32f16(half 0xH0000, <vscale x 32 x half> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %16 = call fast float @llvm.vector.reduce.fadd.v1f32(float 0.000000e+00, <1 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %17 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %18 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %19 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %20 = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %21 = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %22 = call fast float @llvm.vector.reduce.fadd.v64f32(float 0.000000e+00, <64 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %23 = call fast float @llvm.vector.reduce.fadd.v128f32(float 0.000000e+00, <128 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %24 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %25 = call fast float @llvm.vector.reduce.fadd.nxv2f32(float 0.000000e+00, <vscale x 2 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %26 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %27 = call fast float @llvm.vector.reduce.fadd.nxv8f32(float 0.000000e+00, <vscale x 8 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %28 = call fast float @llvm.vector.reduce.fadd.nxv16f32(float 0.000000e+00, <vscale x 16 x float> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %29 = call fast double @llvm.vector.reduce.fadd.v1f64(double 0.000000e+00, <1 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %30 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %31 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %32 = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %33 = call fast double @llvm.vector.reduce.fadd.v16f64(double 0.000000e+00, <16 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %34 = call fast double @llvm.vector.reduce.fadd.v32f64(double 0.000000e+00, <32 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %35 = call fast double @llvm.vector.reduce.fadd.v64f64(double 0.000000e+00, <64 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %36 = call fast double @llvm.vector.reduce.fadd.nxv1f64(double 0.000000e+00, <vscale x 1 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %37 = call fast double @llvm.vector.reduce.fadd.nxv2f64(double 0.000000e+00, <vscale x 2 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %38 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %39 = call fast double @llvm.vector.reduce.fadd.nxv8f64(double 0.000000e+00, <vscale x 8 x double> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'fast_reduce_fadd'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %1 = call fast half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %5 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call fast half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call fast half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %8 = call fast half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %9 = call fast half @llvm.vector.reduce.fadd.v256f16(half 0xH0000, <256 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %10 = call fast half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, <vscale x 1 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %11 = call fast half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, <vscale x 2 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %12 = call fast half @llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, <vscale x 4 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %13 = call fast half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %14 = call fast half @llvm.vector.reduce.fadd.nxv16f16(half 0xH0000, <vscale x 16 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %15 = call fast half @llvm.vector.reduce.fadd.nxv32f16(half 0xH0000, <vscale x 32 x half> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %16 = call fast float @llvm.vector.reduce.fadd.v1f32(float 0.000000e+00, <1 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %17 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %18 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %19 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %20 = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %21 = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %22 = call fast float @llvm.vector.reduce.fadd.v64f32(float 0.000000e+00, <64 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %23 = call fast float @llvm.vector.reduce.fadd.v128f32(float 0.000000e+00, <128 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %24 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %25 = call fast float @llvm.vector.reduce.fadd.nxv2f32(float 0.000000e+00, <vscale x 2 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %26 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %27 = call fast float @llvm.vector.reduce.fadd.nxv8f32(float 0.000000e+00, <vscale x 8 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %28 = call fast float @llvm.vector.reduce.fadd.nxv16f32(float 0.000000e+00, <vscale x 16 x float> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %29 = call fast double @llvm.vector.reduce.fadd.v1f64(double 0.000000e+00, <1 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %30 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %31 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %32 = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %33 = call fast double @llvm.vector.reduce.fadd.v16f64(double 0.000000e+00, <16 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %34 = call fast double @llvm.vector.reduce.fadd.v32f64(double 0.000000e+00, <32 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %35 = call fast double @llvm.vector.reduce.fadd.v64f64(double 0.000000e+00, <64 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %36 = call fast double @llvm.vector.reduce.fadd.nxv1f64(double 0.000000e+00, <vscale x 1 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %37 = call fast double @llvm.vector.reduce.fadd.nxv2f64(double 0.000000e+00, <vscale x 2 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %38 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %39 = call fast double @llvm.vector.reduce.fadd.nxv8f64(double 0.000000e+00, <vscale x 8 x double> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call fast half @llvm.vector.reduce.fadd.v1f16(half 0.0, <1 x half> undef)
+ call fast half @llvm.vector.reduce.fadd.v2f16(half 0.0, <2 x half> undef)
+ call fast half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef)
+ call fast half @llvm.vector.reduce.fadd.v8f16(half 0.0, <8 x half> undef)
+ call fast half @llvm.vector.reduce.fadd.v16f16(half 0.0, <16 x half> undef)
+ call fast half @llvm.vector.reduce.fadd.v32f16(half 0.0, <32 x half> undef)
+ call fast half @llvm.vector.reduce.fadd.v64f16(half 0.0, <64 x half> undef)
+ call fast half @llvm.vector.reduce.fadd.v128f16(half 0.0, <128 x half> undef)
+ call fast half @llvm.vector.reduce.fadd.v256f16(half 0.0, <256 x half> undef)
+ call fast half @llvm.vector.reduce.fadd.nxv1f16(half 0.0, <vscale x 1 x half> undef)
+ call fast half @llvm.vector.reduce.fadd.nxv2f16(half 0.0, <vscale x 2 x half> undef)
+ call fast half @llvm.vector.reduce.fadd.nxv4f16(half 0.0, <vscale x 4 x half> undef)
+ call fast half @llvm.vector.reduce.fadd.nxv8f16(half 0.0, <vscale x 8 x half> undef)
+ call fast half @llvm.vector.reduce.fadd.nxv16f16(half 0.0, <vscale x 16 x half> undef)
+ call fast half @llvm.vector.reduce.fadd.nxv32f16(half 0.0, <vscale x 32 x half> undef)
+ call fast float @llvm.vector.reduce.fadd.v1f32(float 0.0, <1 x float> undef)
+ call fast float @llvm.vector.reduce.fadd.v2f32(float 0.0, <2 x float> undef)
+ call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
+ call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
+ call fast float @llvm.vector.reduce.fadd.v16f32(float 0.0, <16 x float> undef)
+ call fast float @llvm.vector.reduce.fadd.v32f32(float 0.0, <32 x float> undef)
+ call fast float @llvm.vector.reduce.fadd.v64f32(float 0.0, <64 x float> undef)
+ call fast float @llvm.vector.reduce.fadd.v128f32(float 0.0, <128 x float> undef)
+ call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.0, <vscale x 1 x float> undef)
+ call fast float @llvm.vector.reduce.fadd.nxv2f32(float 0.0, <vscale x 2 x float> undef)
+ call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> undef)
+ call fast float @llvm.vector.reduce.fadd.nxv8f32(float 0.0, <vscale x 8 x float> undef)
+ call fast float @llvm.vector.reduce.fadd.nxv16f32(float 0.0, <vscale x 16 x float> undef)
+ call fast double @llvm.vector.reduce.fadd.v1f64(double 0.0, <1 x double> undef)
+ call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
+ call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
+ call fast double @llvm.vector.reduce.fadd.v8f64(double 0.0, <8 x double> undef)
+ call fast double @llvm.vector.reduce.fadd.v16f64(double 0.0, <16 x double> undef)
+ call fast double @llvm.vector.reduce.fadd.v32f64(double 0.0, <32 x double> undef)
+ call fast double @llvm.vector.reduce.fadd.v64f64(double 0.0, <64 x double> undef)
+ call fast double @llvm.vector.reduce.fadd.nxv1f64(double 0.0, <vscale x 1 x double> undef)
+ call fast double @llvm.vector.reduce.fadd.nxv2f64(double 0.0, <vscale x 2 x double> undef)
+ call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, <vscale x 4 x double> undef)
+ call fast double @llvm.vector.reduce.fadd.nxv8f64(double 0.0, <vscale x 8 x double> undef)
+ ret void
+}
+
+declare i1 @llvm.vector.reduce.smax.v16i1(<16 x i1>)
+declare i1 @llvm.vector.reduce.smax.v32i1(<32 x i1>)
+declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.smax.v256i8(<256 x i8>)
+declare i8 @llvm.vector.reduce.smax.v512i8(<512 x i8>)
+declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.smax.v128i16(<128 x i16>)
+declare i16 @llvm.vector.reduce.smax.v256i16(<256 x i16>)
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.smax.v64i32(<64 x i32>)
+declare i32 @llvm.vector.reduce.smax.v128i32(<128 x i32>)
+declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.smax.v32i64(<32 x i64>)
+declare i64 @llvm.vector.reduce.smax.v64i64(<64 x i64>)
+
+define void @reduce_smax() {
+; THROUGHPUT-LABEL: 'reduce_smax'
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call i1 @llvm.vector.reduce.smax.v16i1(<16 x i1> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %2 = call i1 @llvm.vector.reduce.smax.v32i1(<32 x i1> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %3 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %4 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %5 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %6 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %7 = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %8 = call i8 @llvm.vector.reduce.smax.v512i8(<512 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %9 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %10 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %11 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %12 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %13 = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %14 = call i16 @llvm.vector.reduce.smax.v256i16(<256 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %15 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %16 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %17 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %18 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %19 = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %20 = call i32 @llvm.vector.reduce.smax.v128i32(<128 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %21 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 29 for instruction: %22 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %23 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %24 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %25 = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %26 = call i64 @llvm.vector.reduce.smax.v64i64(<64 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %27 = call i1 @llvm.vector.reduce.smax.nxv1i1(<vscale x 1 x i1> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %28 = call i1 @llvm.vector.reduce.smax.nxv2i1(<vscale x 2 x i1> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %29 = call i1 @llvm.vector.reduce.smax.nxv4i1(<vscale x 4 x i1> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %30 = call i1 @llvm.vector.reduce.smax.nxv8i1(<vscale x 8 x i1> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %31 = call i1 @llvm.vector.reduce.smax.nxv16i1(<vscale x 16 x i1> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %32 = call i8 @llvm.vector.reduce.smax.nxv1i8(<vscale x 1 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %33 = call i8 @llvm.vector.reduce.smax.nxv2i8(<vscale x 2 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %34 = call i8 @llvm.vector.reduce.smax.nxv4i8(<vscale x 4 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %35 = call i8 @llvm.vector.reduce.smax.nxv8i8(<vscale x 8 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %36 = call i8 @llvm.vector.reduce.smax.nxv16i8(<vscale x 16 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %37 = call i8 @llvm.vector.reduce.smax.nxv32i8(<vscale x 32 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 78 for instruction: %38 = call i8 @llvm.vector.reduce.smax.nxv64i8(<vscale x 64 x i8> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %39 = call i16 @llvm.vector.reduce.smax.nxv1i16(<vscale x 1 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %40 = call i16 @llvm.vector.reduce.smax.nxv2i16(<vscale x 2 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %41 = call i16 @llvm.vector.reduce.smax.nxv4i16(<vscale x 4 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %42 = call i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %43 = call i16 @llvm.vector.reduce.smax.nxv16i16(<vscale x 16 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 71 for instruction: %44 = call i16 @llvm.vector.reduce.smax.nxv32i16(<vscale x 32 x i16> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %45 = call i32 @llvm.vector.reduce.smax.nxv1i32(<vscale x 1 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %46 = call i32 @llvm.vector.reduce.smax.nxv2i32(<vscale x 2 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %47 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %48 = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %49 = call i32 @llvm.vector.reduce.smax.nxv16i32(<vscale x 16 x i32> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 36 for instruction: %50 = call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 43 for instruction: %51 = call i64 @llvm.vector.reduce.smax.nxv2i64(<vscale x 2 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 51 for instruction: %52 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 57 for instruction: %53 = call i64 @llvm.vector.reduce.smax.nxv8i64(<vscale x 8 x i64> undef)
+; THROUGHPUT-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'reduce_smax'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %1 = call i1 @llvm.vector.reduce.smax.v16i1(<16 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call i1 @llvm.vector.reduce.smax.v32i1(<32 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %5 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %6 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %8 = call i8 @llvm.vector.reduce.smax.v512i8(<512 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %9 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %10 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %11 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %12 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %13 = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %14 = call i16 @llvm.vector.reduce.smax.v256i16(<256 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %15 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %16 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %17 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %18 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %19 = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %20 = call i32 @llvm.vector.reduce.smax.v128i32(<128 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %21 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %22 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %23 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %24 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %25 = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %26 = call i64 @llvm.vector.reduce.smax.v64i64(<64 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %27 = call i1 @llvm.vector.reduce.smax.nxv1i1(<vscale x 1 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %28 = call i1 @llvm.vector.reduce.smax.nxv2i1(<vscale x 2 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %29 = call i1 @llvm.vector.reduce.smax.nxv4i1(<vscale x 4 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %30 = call i1 @llvm.vector.reduce.smax.nxv8i1(<vscale x 8 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %31 = call i1 @llvm.vector.reduce.smax.nxv16i1(<vscale x 16 x i1> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %32 = call i8 @llvm.vector.reduce.smax.nxv1i8(<vscale x 1 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %33 = call i8 @llvm.vector.reduce.smax.nxv2i8(<vscale x 2 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %34 = call i8 @llvm.vector.reduce.smax.nxv4i8(<vscale x 4 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %35 = call i8 @llvm.vector.reduce.smax.nxv8i8(<vscale x 8 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %36 = call i8 @llvm.vector.reduce.smax.nxv16i8(<vscale x 16 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %37 = call i8 @llvm.vector.reduce.smax.nxv32i8(<vscale x 32 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %38 = call i8 @llvm.vector.reduce.smax.nxv64i8(<vscale x 64 x i8> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %39 = call i16 @llvm.vector.reduce.smax.nxv1i16(<vscale x 1 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %40 = call i16 @llvm.vector.reduce.smax.nxv2i16(<vscale x 2 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %41 = call i16 @llvm.vector.reduce.smax.nxv4i16(<vscale x 4 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %42 = call i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %43 = call i16 @llvm.vector.reduce.smax.nxv16i16(<vscale x 16 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %44 = call i16 @llvm.vector.reduce.smax.nxv32i16(<vscale x 32 x i16> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %45 = call i32 @llvm.vector.reduce.smax.nxv1i32(<vscale x 1 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %46 = call i32 @llvm.vector.reduce.smax.nxv2i32(<vscale x 2 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %47 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %48 = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %49 = call i32 @llvm.vector.reduce.smax.nxv16i32(<vscale x 16 x i32> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %50 = call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %51 = call i64 @llvm.vector.reduce.smax.nxv2i64(<vscale x 2 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %52 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %53 = call i64 @llvm.vector.reduce.smax.nxv8i64(<vscale x 8 x i64> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ call i1 @llvm.vector.reduce.smax.v16i1(<16 x i1> undef)
+ call i1 @llvm.vector.reduce.smax.v32i1(<32 x i1> undef)
+ call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+ call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+ call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+ call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
+ call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> undef)
+ call i8 @llvm.vector.reduce.smax.v512i8(<512 x i8> undef)
+ call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+ call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+ call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+ call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
+ call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> undef)
+ call i16 @llvm.vector.reduce.smax.v256i16(<256 x i16> undef)
+ call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+ call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+ call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+ call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
+ call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> undef)
+ call i32 @llvm.vector.reduce.smax.v128i32(<128 x i32> undef)
+ call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+ call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+ call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+ call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
+ call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> undef)
+ call i64 @llvm.vector.reduce.smax.v64i64(<64 x i64> undef)
+ call i1 @llvm.vector.reduce.smax.nxv1i1(<vscale x 1 x i1> undef)
+ call i1 @llvm.vector.reduce.smax.nxv2i1(<vscale x 2 x i1> undef)
+ call i1 @llvm.vector.reduce.smax.nxv4i1(<vscale x 4 x i1> undef)
+ call i1 @llvm.vector.reduce.smax.nxv8i1(<vscale x 8 x i1> undef)
+ call i1 @llvm.vector.reduce.smax.nxv16i1(<vscale x 16 x i1> undef)
+ call i8 @llvm.vector.reduce.smax.nxv1i8(<vscale x 1 x i8> undef)
+ call i8 @llvm.vector.reduce.smax.nxv2i8(<vscale x 2 x i8> undef)
+ call i8 @llvm.vector.reduce.smax.nxv4i8(<vscale x 4 x i8> undef)
+ call i8 @llvm.vector.reduce.smax.nxv8i8(<vscale x 8 x i8> undef)
+ call i8 @llvm.vector.reduce.smax.nxv16i8(<vscale x 16 x i8> undef)
+ call i8 @llvm.vector.reduce.smax.nxv32i8(<vscale x 32 x i8> undef)
+ call i8 @llvm.vector.reduce.smax.nxv64i8(<vscale x 64 x i8> undef)
+ call i16 @llvm.vector.reduce.smax.nxv1i16(<vscale x 1 x i16> undef)
+ call i16 @llvm.vector.reduce.smax.nxv2i16(<vscale x 2 x i16> undef)
+ call i16 @llvm.vector.reduce.smax.nxv4i16(<vscale x 4 x i16> undef)
+ call i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16> undef)
+ call i16 @llvm.vector.reduce.smax.nxv16i16(<vscale x 16 x i16> undef)
+ call i16 @llvm.vector.reduce.smax.nxv32i16(<vscale x 32 x i16> undef)
+ call i32 @llvm.vector.reduce.smax.nxv1i32(<vscale x 1 x i32> undef)
+ call i32 @llvm.vector.reduce.smax.nxv2i32(<vscale x 2 x i32> undef)
+ call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> undef)
+ call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> undef)
+ call i32 @llvm.vector.reduce.smax.nxv16i32(<vscale x 16 x i32> undef)
+ call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> undef)
+ call i64 @llvm.vector.reduce.smax.nxv2i64(<vscale x 2 x i64> undef)
+ call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> undef)
+ call i64 @llvm.vector.reduce.smax.nxv8i64(<vscale x 8 x i64> undef)
+ ret void
+}
>From bdab018aac80b8097ba9ce930eb3f9b065f2d737 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Sun, 5 May 2024 19:59:19 -0700
Subject: [PATCH 2/2] move `for (auto Op: OpCodes)` into `getSiFiveX280RVVCost`
and common cost into `getRVVBaseCost`
---
.../Target/RISCV/RISCVTargetTransformInfo.cpp | 143 +++++++++---------
1 file changed, 72 insertions(+), 71 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 81fc3e81fd1d0f..625b405654cc58 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -34,14 +34,21 @@ static cl::opt<unsigned> SLPMaxVF(
"exclusively by SLP vectorizer."),
cl::Hidden);
-static std::optional<InstructionCost>
-getSiFiveX280RVVCost(unsigned Op, MVT VT, TTI::TargetCostKind CostKind) {
- std::optional<InstructionCost> Cost;
- unsigned VScale = 8;
+static InstructionCost getRVVBaseCost(unsigned Op, MVT VT,
+ const RISCVTTIImpl *TTI,
+ const RISCVTargetLowering *TLI) {
+ InstructionCost LMULCost = TLI->getLMULCost(VT);
switch (Op) {
- default:
- Cost = std::nullopt;
- break;
+ case RISCV::VRGATHER_VI:
+ return TLI->getVRGatherVICost(VT);
+ case RISCV::VRGATHER_VV:
+ return TLI->getVRGatherVVCost(VT);
+ case RISCV::VSLIDEUP_VI:
+ case RISCV::VSLIDEDOWN_VI:
+ return TLI->getVSlideVICost(VT);
+ case RISCV::VSLIDEUP_VX:
+ case RISCV::VSLIDEDOWN_VX:
+ return TLI->getVSlideVXCost(VT);
case RISCV::VREDMAX_VS:
case RISCV::VREDMIN_VS:
case RISCV::VREDMAXU_VS:
@@ -55,74 +62,42 @@ getSiFiveX280RVVCost(unsigned Op, MVT VT, TTI::TargetCostKind CostKind) {
case RISCV::VFREDUSUM_VS: {
unsigned VL = VT.getVectorMinNumElements();
if (!VT.isFixedLengthVector())
- VL *= VScale;
- // For the cases with small VL, we use a lookup table for accurate
- // cost estimation.
- unsigned LookUpSiFive7ReduceLatency[] = {0, 20, 27, 32, 34, 38, 40, 41, 42};
- if (VL <= 32) {
- Cost = LookUpSiFive7ReduceLatency[(VL + 3) >> 2];
- break;
- }
- Cost = 6 + 7 * Log2_32_Ceil(VL);
- break;
+ VL *= *(TTI->getVScaleForTuning());
+ return Log2_32_Ceil(VL);
}
case RISCV::VFREDOSUM_VS: {
unsigned VL = VT.getVectorMinNumElements();
if (!VT.isFixedLengthVector())
- VL *= VScale;
- Cost = VL * 6;
- break;
+ VL *= *(TTI->getVScaleForTuning());
+ return VL;
}
case RISCV::VMV_X_S:
+ case RISCV::VMV_S_X:
case RISCV::VFMV_F_S:
+ case RISCV::VFMV_S_F:
+ case RISCV::VMOR_MM:
+ case RISCV::VMXOR_MM:
+ case RISCV::VMAND_MM:
+ case RISCV::VMANDN_MM:
+ case RISCV::VMNAND_MM:
case RISCV::VCPOP_M:
- /* Vector-to-scalar communication */
- Cost = 8;
+ return 1;
+ default:
+ return LMULCost;
}
- return Cost;
}
-InstructionCost
-RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
- TTI::TargetCostKind CostKind) {
- // Check if the type is valid for all CostKind
- if (!VT.isVector())
- return InstructionCost::getInvalid();
- size_t NumInstr = OpCodes.size();
- if (CostKind == TTI::TCK_CodeSize)
- return NumInstr;
-
- std::optional<InstructionCost> (*GetTargetCost)(
- unsigned, MVT, TTI::TargetCostKind) = nullptr;
- if (ST->getProcFamily() == RISCVSubtarget::SiFive7)
- GetTargetCost = getSiFiveX280RVVCost;
+static InstructionCost getSiFiveX280RVVCost(ArrayRef<unsigned> OpCodes, MVT VT,
+ TTI::TargetCostKind CostKind,
+ const RISCVTTIImpl *TTI,
+ const RISCVTargetLowering *TLI) {
InstructionCost LMULCost = TLI->getLMULCost(VT);
if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
return LMULCost * NumInstr;
InstructionCost Cost = 0;
+ unsigned VScale = 8;
for (auto Op : OpCodes) {
- std::optional<InstructionCost> OverrideCost =
- GetTargetCost ? GetTargetCost(Op, VT, CostKind) : std::nullopt;
- if (OverrideCost) {
- Cost += *OverrideCost;
- continue;
- }
-
switch (Op) {
- case RISCV::VRGATHER_VI:
- Cost += TLI->getVRGatherVICost(VT);
- break;
- case RISCV::VRGATHER_VV:
- Cost += TLI->getVRGatherVVCost(VT);
- break;
- case RISCV::VSLIDEUP_VI:
- case RISCV::VSLIDEDOWN_VI:
- Cost += TLI->getVSlideVICost(VT);
- break;
- case RISCV::VSLIDEUP_VX:
- case RISCV::VSLIDEDOWN_VX:
- Cost += TLI->getVSlideVXCost(VT);
- break;
case RISCV::VREDMAX_VS:
case RISCV::VREDMIN_VS:
case RISCV::VREDMAXU_VS:
@@ -136,36 +111,62 @@ RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
case RISCV::VFREDUSUM_VS: {
unsigned VL = VT.getVectorMinNumElements();
if (!VT.isFixedLengthVector())
- VL *= *getVScaleForTuning();
- Cost += Log2_32_Ceil(VL);
+ VL *= VScale;
+ // For the cases with small VL, we use a lookup table for accurate
+ // cost estimation.
+ unsigned LookUpSiFive7ReduceLatency[] = {0, 20, 27, 32, 34,
+ 38, 40, 41, 42};
+ if (VL <= 32) {
+ Cost += LookUpSiFive7ReduceLatency[(VL + 3) >> 2];
+ break;
+ }
+ Cost += 6 + 7 * Log2_32_Ceil(VL);
break;
}
case RISCV::VFREDOSUM_VS: {
unsigned VL = VT.getVectorMinNumElements();
if (!VT.isFixedLengthVector())
- VL *= *getVScaleForTuning();
- Cost += VL;
+ VL *= VScale;
+ Cost += VL * 6;
break;
}
case RISCV::VMV_X_S:
- case RISCV::VMV_S_X:
case RISCV::VFMV_F_S:
- case RISCV::VFMV_S_F:
- case RISCV::VMOR_MM:
- case RISCV::VMXOR_MM:
- case RISCV::VMAND_MM:
- case RISCV::VMANDN_MM:
- case RISCV::VMNAND_MM:
case RISCV::VCPOP_M:
- Cost += 1;
+ /* Vector-to-scalar communication */
+ Cost += 8;
break;
default:
- Cost += LMULCost;
+ Cost += getRVVBaseCost(Op, VT, TTI, TLI);
+ break;
}
}
return Cost;
}
+InstructionCost
+RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
+ TTI::TargetCostKind CostKind) {
+ // Check if the type is valid for all CostKind
+ if (!VT.isVector())
+ return InstructionCost::getInvalid();
+ size_t NumInstr = OpCodes.size();
+ if (CostKind == TTI::TCK_CodeSize)
+ return NumInstr;
+
+ if (ST->getProcFamily() == RISCVSubtarget::SiFive7)
+ return getSiFiveX280RVVCost(OpCodes, VT, CostKind, this, TLI);
+
+ InstructionCost LMULCost = TLI->getLMULCost(VT);
+ if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
+ return LMULCost * NumInstr;
+ InstructionCost Cost = 0;
+ for (auto Op : OpCodes)
+ Cost += getRVVBaseCost(Op, VT, this, TLI);
+
+ return Cost;
+}
+
InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy() &&
More information about the llvm-commits
mailing list