[llvm] [RISCV][TTI] Improve SiFive7 reduction cost (PR #90951)

Sun May 5 20:02:00 PDT 2024

https://github.com/arcbbb updated https://github.com/llvm/llvm-project/pull/90951

>From 38228ba04527be100ad7436391bd1dfac0812893 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Fri, 3 May 2024 01:10:50 -0700
Subject: [PATCH 1/2] [RISCV][TTI] Improve SiFive7 reduction cost

The cost calculation for intrinsic llvm.vector.reduce.* factors in
the costs of instructions VRED, VFRED, VMV, and VFMV.

This patch implements a target-specific cost function.
If an opcode is supported in the target-specific cost function, it
takes precedence over the default cost.

Co-authored-by: Elvis Wang <elvis.wang at sifive.com>
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  60 ++
 .../CostModel/RISCV/sifive-x280-reduce.ll     | 675 ++++++++++++++++++
 2 files changed, 735 insertions(+)
 create mode 100644 llvm/test/Analysis/CostModel/RISCV/sifive-x280-reduce.ll

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index ce26e61880fd05..81fc3e81fd1d0f 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -34,6 +34,54 @@ static cl::opt<unsigned> SLPMaxVF(
         "exclusively by SLP vectorizer."),
     cl::Hidden);
 
+static std::optional<InstructionCost>
+getSiFiveX280RVVCost(unsigned Op, MVT VT, TTI::TargetCostKind CostKind) {
+  std::optional<InstructionCost> Cost;
+  unsigned VScale = 8;
+  switch (Op) {
+  default:
+    Cost = std::nullopt;
+    break;
+  case RISCV::VREDMAX_VS:
+  case RISCV::VREDMIN_VS:
+  case RISCV::VREDMAXU_VS:
+  case RISCV::VREDMINU_VS:
+  case RISCV::VREDSUM_VS:
+  case RISCV::VREDAND_VS:
+  case RISCV::VREDOR_VS:
+  case RISCV::VREDXOR_VS:
+  case RISCV::VFREDMAX_VS:
+  case RISCV::VFREDMIN_VS:
+  case RISCV::VFREDUSUM_VS: {
+    unsigned VL = VT.getVectorMinNumElements();
+    if (!VT.isFixedLengthVector())
+      VL *= VScale;
+    // For the cases with small VL, we use a lookup table for accurate
+    // cost estimation.
+    unsigned LookUpSiFive7ReduceLatency[] = {0, 20, 27, 32, 34, 38, 40, 41, 42};
+    if (VL <= 32) {
+      Cost = LookUpSiFive7ReduceLatency[(VL + 3) >> 2];
+      break;
+    }
+    Cost = 6 + 7 * Log2_32_Ceil(VL);
+    break;
+  }
+  case RISCV::VFREDOSUM_VS: {
+    unsigned VL = VT.getVectorMinNumElements();
+    if (!VT.isFixedLengthVector())
+      VL *= VScale;
+    Cost = VL * 6;
+    break;
+  }
+  case RISCV::VMV_X_S:
+  case RISCV::VFMV_F_S:
+  case RISCV::VCPOP_M:
+    /* Vector-to-scalar communication */
+    Cost = 8;
+  }
+  return Cost;
+}
+
 InstructionCost
 RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
                                       TTI::TargetCostKind CostKind) {
@@ -43,11 +91,23 @@ RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
   size_t NumInstr = OpCodes.size();
   if (CostKind == TTI::TCK_CodeSize)
     return NumInstr;
+
+  std::optional<InstructionCost> (*GetTargetCost)(
+      unsigned, MVT, TTI::TargetCostKind) = nullptr;
+  if (ST->getProcFamily() == RISCVSubtarget::SiFive7)
+    GetTargetCost = getSiFiveX280RVVCost;
   InstructionCost LMULCost = TLI->getLMULCost(VT);
   if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
     return LMULCost * NumInstr;
   InstructionCost Cost = 0;
   for (auto Op : OpCodes) {
+    std::optional<InstructionCost> OverrideCost =
+        GetTargetCost ? GetTargetCost(Op, VT, CostKind) : std::nullopt;
+    if (OverrideCost) {
+      Cost += *OverrideCost;
+      continue;
+    }
+
     switch (Op) {
     case RISCV::VRGATHER_VI:
       Cost += TLI->getVRGatherVICost(VT);
diff --git a/llvm/test/Analysis/CostModel/RISCV/sifive-x280-reduce.ll b/llvm/test/Analysis/CostModel/RISCV/sifive-x280-reduce.ll
new file mode 100644
index 00000000000000..01632f9ab80643
--- /dev/null
+++ b/llvm/test/Analysis/CostModel/RISCV/sifive-x280-reduce.ll
@@ -0,0 +1,675 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
+; RUN: opt < %s -mtriple=riscv64 -mcpu=sifive-x280 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefix=THROUGHPUT
+; RUN: opt < %s -mtriple=riscv64 -mcpu=sifive-x280 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck %s --check-prefix=SIZE
+
+declare i1 @llvm.vector.reduce.add.v16i1(<16 x i1>)
+declare i1 @llvm.vector.reduce.add.v32i1(<32 x i1>)
+declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.add.v256i8(<256 x i8>)
+declare i8 @llvm.vector.reduce.add.v512i8(<512 x i8>)
+declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>)
+declare i16 @llvm.vector.reduce.add.v256i16(<256 x i16>)
+declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
+declare i32 @llvm.vector.reduce.add.v128i32(<128 x i32>)
+declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>)
+declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>)
+
+define void @reduce_add() {
+; THROUGHPUT-LABEL: 'reduce_add'
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %1 = call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %2 = call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %4 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %5 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %6 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %7 = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %8 = call i8 @llvm.vector.reduce.add.v512i8(<512 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %9 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %10 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %11 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %12 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %13 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %14 = call i16 @llvm.vector.reduce.add.v256i16(<256 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %17 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %18 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %19 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %20 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %21 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %22 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %23 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %24 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %25 = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %26 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %27 = call i1 @llvm.vector.reduce.add.nxv1i1(<vscale x 1 x i1> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %28 = call i1 @llvm.vector.reduce.add.nxv2i1(<vscale x 2 x i1> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %29 = call i1 @llvm.vector.reduce.add.nxv4i1(<vscale x 4 x i1> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %30 = call i1 @llvm.vector.reduce.add.nxv8i1(<vscale x 8 x i1> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 9 for instruction: %31 = call i1 @llvm.vector.reduce.add.nxv16i1(<vscale x 16 x i1> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %32 = call i8 @llvm.vector.reduce.add.nxv1i8(<vscale x 1 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %33 = call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %34 = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %35 = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %36 = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %37 = call i8 @llvm.vector.reduce.add.nxv32i8(<vscale x 32 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %38 = call i8 @llvm.vector.reduce.add.nxv64i8(<vscale x 64 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %39 = call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %40 = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %41 = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %42 = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %43 = call i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %44 = call i16 @llvm.vector.reduce.add.nxv32i16(<vscale x 32 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %45 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %46 = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %47 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %48 = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %49 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %50 = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %51 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %52 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %53 = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'reduce_add'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %1 = call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %2 = call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %3 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %4 = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %5 = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %6 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %7 = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %8 = call i8 @llvm.vector.reduce.add.v512i8(<512 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %9 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %10 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %11 = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %12 = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %13 = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %14 = call i16 @llvm.vector.reduce.add.v256i16(<256 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %15 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %16 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %17 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %18 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %19 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %20 = call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %21 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %22 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %23 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %24 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %25 = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %26 = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %27 = call i1 @llvm.vector.reduce.add.nxv1i1(<vscale x 1 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %28 = call i1 @llvm.vector.reduce.add.nxv2i1(<vscale x 2 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %29 = call i1 @llvm.vector.reduce.add.nxv4i1(<vscale x 4 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %30 = call i1 @llvm.vector.reduce.add.nxv8i1(<vscale x 8 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %31 = call i1 @llvm.vector.reduce.add.nxv16i1(<vscale x 16 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %32 = call i8 @llvm.vector.reduce.add.nxv1i8(<vscale x 1 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %33 = call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %34 = call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %35 = call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %36 = call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %37 = call i8 @llvm.vector.reduce.add.nxv32i8(<vscale x 32 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %38 = call i8 @llvm.vector.reduce.add.nxv64i8(<vscale x 64 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %39 = call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %40 = call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %41 = call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %42 = call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %43 = call i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %44 = call i16 @llvm.vector.reduce.add.nxv32i16(<vscale x 32 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %45 = call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %46 = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %47 = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %48 = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %49 = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %50 = call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %51 = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %52 = call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %53 = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  call i1 @llvm.vector.reduce.add.v16i1(<16 x i1> undef)
+  call i1 @llvm.vector.reduce.add.v32i1(<32 x i1> undef)
+  call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef)
+  call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> undef)
+  call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> undef)
+  call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef)
+  call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> undef)
+  call i8 @llvm.vector.reduce.add.v512i8(<512 x i8> undef)
+  call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef)
+  call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef)
+  call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> undef)
+  call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> undef)
+  call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> undef)
+  call i16 @llvm.vector.reduce.add.v256i16(<256 x i16> undef)
+  call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef)
+  call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef)
+  call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef)
+  call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> undef)
+  call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> undef)
+  call i32 @llvm.vector.reduce.add.v128i32(<128 x i32> undef)
+  call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef)
+  call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef)
+  call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef)
+  call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef)
+  call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> undef)
+  call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> undef)
+
+  call i1 @llvm.vector.reduce.add.nxv1i1(<vscale x 1 x i1> undef)
+  call i1 @llvm.vector.reduce.add.nxv2i1(<vscale x 2 x i1> undef)
+  call i1 @llvm.vector.reduce.add.nxv4i1(<vscale x 4 x i1> undef)
+  call i1 @llvm.vector.reduce.add.nxv8i1(<vscale x 8 x i1> undef)
+  call i1 @llvm.vector.reduce.add.nxv16i1(<vscale x 16 x i1> undef)
+  call i8 @llvm.vector.reduce.add.nxv1i8(<vscale x 1 x i8> undef)
+  call i8 @llvm.vector.reduce.add.nxv2i8(<vscale x 2 x i8> undef)
+  call i8 @llvm.vector.reduce.add.nxv4i8(<vscale x 4 x i8> undef)
+  call i8 @llvm.vector.reduce.add.nxv8i8(<vscale x 8 x i8> undef)
+  call i8 @llvm.vector.reduce.add.nxv16i8(<vscale x 16 x i8> undef)
+  call i8 @llvm.vector.reduce.add.nxv32i8(<vscale x 32 x i8> undef)
+  call i8 @llvm.vector.reduce.add.nxv64i8(<vscale x 64 x i8> undef)
+  call i16 @llvm.vector.reduce.add.nxv1i16(<vscale x 1 x i16> undef)
+  call i16 @llvm.vector.reduce.add.nxv2i16(<vscale x 2 x i16> undef)
+  call i16 @llvm.vector.reduce.add.nxv4i16(<vscale x 4 x i16> undef)
+  call i16 @llvm.vector.reduce.add.nxv8i16(<vscale x 8 x i16> undef)
+  call i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16> undef)
+  call i16 @llvm.vector.reduce.add.nxv32i16(<vscale x 32 x i16> undef)
+  call i32 @llvm.vector.reduce.add.nxv1i32(<vscale x 1 x i32> undef)
+  call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> undef)
+  call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> undef)
+  call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> undef)
+  call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> undef)
+  call i64 @llvm.vector.reduce.add.nxv1i64(<vscale x 1 x i64> undef)
+  call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> undef)
+  call i64 @llvm.vector.reduce.add.nxv4i64(<vscale x 4 x i64> undef)
+  call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> undef)
+  ret void
+}
+
+declare half @llvm.vector.reduce.fadd.v1f16(half, <1 x half>)
+declare half @llvm.vector.reduce.fadd.v2f16(half, <2 x half>)
+declare half @llvm.vector.reduce.fadd.v4f16(half, <4 x half>)
+declare half @llvm.vector.reduce.fadd.v8f16(half, <8 x half>)
+declare half @llvm.vector.reduce.fadd.v16f16(half, <16 x half>)
+declare half @llvm.vector.reduce.fadd.v32f16(half, <32 x half>)
+declare half @llvm.vector.reduce.fadd.v64f16(half, <64 x half>)
+declare half @llvm.vector.reduce.fadd.v128f16(half, <128 x half>)
+declare half @llvm.vector.reduce.fadd.v256f16(half, <256 x half>)
+declare float @llvm.vector.reduce.fadd.v1f32(float, <1 x float>)
+declare float @llvm.vector.reduce.fadd.v2f32(float, <2 x float>)
+declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
+declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>)
+declare float @llvm.vector.reduce.fadd.v16f32(float, <16 x float>)
+declare float @llvm.vector.reduce.fadd.v32f32(float, <32 x float>)
+declare float @llvm.vector.reduce.fadd.v64f32(float, <64 x float>)
+declare float @llvm.vector.reduce.fadd.v128f32(float, <128 x float>)
+declare double @llvm.vector.reduce.fadd.v1f64(double, <1 x double>)
+declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>)
+declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>)
+declare double @llvm.vector.reduce.fadd.v8f64(double, <8 x double>)
+declare double @llvm.vector.reduce.fadd.v16f64(double, <16 x double>)
+declare double @llvm.vector.reduce.fadd.v32f64(double, <32 x double>)
+declare double @llvm.vector.reduce.fadd.v64f64(double, <64 x double>)
+
+define void @ordered_reduce_fadd() {
+; THROUGHPUT-LABEL: 'ordered_reduce_fadd'
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %1 = call half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %2 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %3 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %4 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %5 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 201 for instruction: %6 = call half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 393 for instruction: %7 = call half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 777 for instruction: %8 = call half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1545 for instruction: %9 = call half @llvm.vector.reduce.fadd.v256f16(half 0xH0000, <256 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %10 = call half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, <vscale x 1 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %11 = call half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, <vscale x 2 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 201 for instruction: %12 = call half @llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, <vscale x 4 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 393 for instruction: %13 = call half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 777 for instruction: %14 = call half @llvm.vector.reduce.fadd.nxv16f16(half 0xH0000, <vscale x 16 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 1545 for instruction: %15 = call half @llvm.vector.reduce.fadd.nxv32f16(half 0xH0000, <vscale x 32 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %16 = call float @llvm.vector.reduce.fadd.v1f32(float 0.000000e+00, <1 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %17 = call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %18 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %19 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %20 = call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 201 for instruction: %21 = call float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 393 for instruction: %22 = call float @llvm.vector.reduce.fadd.v64f32(float 0.000000e+00, <64 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 777 for instruction: %23 = call float @llvm.vector.reduce.fadd.v128f32(float 0.000000e+00, <128 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %24 = call float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %25 = call float @llvm.vector.reduce.fadd.nxv2f32(float 0.000000e+00, <vscale x 2 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 201 for instruction: %26 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 393 for instruction: %27 = call float @llvm.vector.reduce.fadd.nxv8f32(float 0.000000e+00, <vscale x 8 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 777 for instruction: %28 = call float @llvm.vector.reduce.fadd.nxv16f32(float 0.000000e+00, <vscale x 16 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %29 = call double @llvm.vector.reduce.fadd.v1f64(double 0.000000e+00, <1 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %30 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 33 for instruction: %31 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %32 = call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %33 = call double @llvm.vector.reduce.fadd.v16f64(double 0.000000e+00, <16 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 201 for instruction: %34 = call double @llvm.vector.reduce.fadd.v32f64(double 0.000000e+00, <32 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 393 for instruction: %35 = call double @llvm.vector.reduce.fadd.v64f64(double 0.000000e+00, <64 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 105 for instruction: %36 = call double @llvm.vector.reduce.fadd.v16f64(double 0.000000e+00, <16 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 201 for instruction: %37 = call double @llvm.vector.reduce.fadd.v32f64(double 0.000000e+00, <32 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 393 for instruction: %38 = call double @llvm.vector.reduce.fadd.v64f64(double 0.000000e+00, <64 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 777 for instruction: %39 = call double @llvm.vector.reduce.fadd.v128f64(double 0.000000e+00, <128 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'ordered_reduce_fadd'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %1 = call half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %2 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %3 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %4 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %5 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %6 = call half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %7 = call half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %8 = call half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %9 = call half @llvm.vector.reduce.fadd.v256f16(half 0xH0000, <256 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %10 = call half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, <vscale x 1 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %11 = call half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, <vscale x 2 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %12 = call half @llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, <vscale x 4 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %13 = call half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %14 = call half @llvm.vector.reduce.fadd.nxv16f16(half 0xH0000, <vscale x 16 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %15 = call half @llvm.vector.reduce.fadd.nxv32f16(half 0xH0000, <vscale x 32 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %16 = call float @llvm.vector.reduce.fadd.v1f32(float 0.000000e+00, <1 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %17 = call float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %18 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %19 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %20 = call float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %21 = call float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %22 = call float @llvm.vector.reduce.fadd.v64f32(float 0.000000e+00, <64 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %23 = call float @llvm.vector.reduce.fadd.v128f32(float 0.000000e+00, <128 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %24 = call float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %25 = call float @llvm.vector.reduce.fadd.nxv2f32(float 0.000000e+00, <vscale x 2 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %26 = call float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %27 = call float @llvm.vector.reduce.fadd.nxv8f32(float 0.000000e+00, <vscale x 8 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %28 = call float @llvm.vector.reduce.fadd.nxv16f32(float 0.000000e+00, <vscale x 16 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %29 = call double @llvm.vector.reduce.fadd.v1f64(double 0.000000e+00, <1 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %30 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %31 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %32 = call double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %33 = call double @llvm.vector.reduce.fadd.v16f64(double 0.000000e+00, <16 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %34 = call double @llvm.vector.reduce.fadd.v32f64(double 0.000000e+00, <32 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %35 = call double @llvm.vector.reduce.fadd.v64f64(double 0.000000e+00, <64 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %36 = call double @llvm.vector.reduce.fadd.v16f64(double 0.000000e+00, <16 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %37 = call double @llvm.vector.reduce.fadd.v32f64(double 0.000000e+00, <32 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %38 = call double @llvm.vector.reduce.fadd.v64f64(double 0.000000e+00, <64 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %39 = call double @llvm.vector.reduce.fadd.v128f64(double 0.000000e+00, <128 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  call half @llvm.vector.reduce.fadd.v1f16(half 0.0, <1 x half> undef)
+  call half @llvm.vector.reduce.fadd.v2f16(half 0.0, <2 x half> undef)
+  call half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef)
+  call half @llvm.vector.reduce.fadd.v8f16(half 0.0, <8 x half> undef)
+  call half @llvm.vector.reduce.fadd.v16f16(half 0.0, <16 x half> undef)
+  call half @llvm.vector.reduce.fadd.v32f16(half 0.0, <32 x half> undef)
+  call half @llvm.vector.reduce.fadd.v64f16(half 0.0, <64 x half> undef)
+  call half @llvm.vector.reduce.fadd.v128f16(half 0.0, <128 x half> undef)
+  call half @llvm.vector.reduce.fadd.v256(half 0.0, <256 x half> undef)
+  call half @llvm.vector.reduce.fadd.nxv1f16(half 0.0, <vscale x 1 x half> undef)
+  call half @llvm.vector.reduce.fadd.nxv2f16(half 0.0, <vscale x 2 x half> undef)
+  call half @llvm.vector.reduce.fadd.nxv4f16(half 0.0, <vscale x 4 x half> undef)
+  call half @llvm.vector.reduce.fadd.nxv8f16(half 0.0, <vscale x 8 x half> undef)
+  call half @llvm.vector.reduce.fadd.nxv16f16(half 0.0, <vscale x 16 x half> undef)
+  call half @llvm.vector.reduce.fadd.nxv32f16(half 0.0, <vscale x 32 x half> undef)
+  call float @llvm.vector.reduce.fadd.v1f32(float 0.0, <1 x float> undef)
+  call float @llvm.vector.reduce.fadd.v2f32(float 0.0, <2 x float> undef)
+  call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
+  call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
+  call float @llvm.vector.reduce.fadd.v16f32(float 0.0, <16 x float> undef)
+  call float @llvm.vector.reduce.fadd.v32f32(float 0.0, <32 x float> undef)
+  call float @llvm.vector.reduce.fadd.v64f32(float 0.0, <64 x float> undef)
+  call float @llvm.vector.reduce.fadd.v128f32(float 0.0, <128 x float> undef)
+  call float @llvm.vector.reduce.fadd.nxv1f32(float 0.0, <vscale x 1 x float> undef)
+  call float @llvm.vector.reduce.fadd.nxv2f32(float 0.0, <vscale x 2 x float> undef)
+  call float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> undef)
+  call float @llvm.vector.reduce.fadd.nxv8f32(float 0.0, <vscale x 8 x float> undef)
+  call float @llvm.vector.reduce.fadd.nxv16f32(float 0.0, <vscale x 16 x float> undef)
+  call double @llvm.vector.reduce.fadd.v1f64(double 0.0, <1 x double> undef)
+  call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
+  call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
+  call double @llvm.vector.reduce.fadd.v8f64(double 0.0, <8 x double> undef)
+  call double @llvm.vector.reduce.fadd.v16f64(double 0.0, <16 x double> undef)
+  call double @llvm.vector.reduce.fadd.v32f64(double 0.0, <32 x double> undef)
+  call double @llvm.vector.reduce.fadd.v64f64(double 0.0, <64 x double> undef)
+  call double @llvm.vector.reduce.fadd.v16f64(double 0.0, <16 x double> undef)
+  call double @llvm.vector.reduce.fadd.v32f64(double 0.0, <32 x double> undef)
+  call double @llvm.vector.reduce.fadd.v64f64(double 0.0, <64 x double> undef)
+  call double @llvm.vector.reduce.fadd.v128f64(double 0.0, <128 x double> undef)
+  ret void
+}
+
+define void @fast_reduce_fadd() {
+; THROUGHPUT-LABEL: 'fast_reduce_fadd'
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %1 = call fast half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %2 = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %3 = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %4 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %5 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %6 = call fast half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %7 = call fast half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %8 = call fast half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %9 = call fast half @llvm.vector.reduce.fadd.v256f16(half 0xH0000, <256 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %10 = call fast half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, <vscale x 1 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %11 = call fast half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, <vscale x 2 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %12 = call fast half @llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, <vscale x 4 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %13 = call fast half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %14 = call fast half @llvm.vector.reduce.fadd.nxv16f16(half 0xH0000, <vscale x 16 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %15 = call fast half @llvm.vector.reduce.fadd.nxv32f16(half 0xH0000, <vscale x 32 x half> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %16 = call fast float @llvm.vector.reduce.fadd.v1f32(float 0.000000e+00, <1 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %17 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %18 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %19 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %20 = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %21 = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %22 = call fast float @llvm.vector.reduce.fadd.v64f32(float 0.000000e+00, <64 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %23 = call fast float @llvm.vector.reduce.fadd.v128f32(float 0.000000e+00, <128 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %24 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %25 = call fast float @llvm.vector.reduce.fadd.nxv2f32(float 0.000000e+00, <vscale x 2 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %26 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %27 = call fast float @llvm.vector.reduce.fadd.nxv8f32(float 0.000000e+00, <vscale x 8 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %28 = call fast float @llvm.vector.reduce.fadd.nxv16f32(float 0.000000e+00, <vscale x 16 x float> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %29 = call fast double @llvm.vector.reduce.fadd.v1f64(double 0.000000e+00, <1 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %30 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %31 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %32 = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %33 = call fast double @llvm.vector.reduce.fadd.v16f64(double 0.000000e+00, <16 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %34 = call fast double @llvm.vector.reduce.fadd.v32f64(double 0.000000e+00, <32 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %35 = call fast double @llvm.vector.reduce.fadd.v64f64(double 0.000000e+00, <64 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %36 = call fast double @llvm.vector.reduce.fadd.nxv1f64(double 0.000000e+00, <vscale x 1 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %37 = call fast double @llvm.vector.reduce.fadd.nxv2f64(double 0.000000e+00, <vscale x 2 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %38 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %39 = call fast double @llvm.vector.reduce.fadd.nxv8f64(double 0.000000e+00, <vscale x 8 x double> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'fast_reduce_fadd'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %1 = call fast half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %2 = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %3 = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %4 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %5 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %6 = call fast half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %7 = call fast half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %8 = call fast half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %9 = call fast half @llvm.vector.reduce.fadd.v256f16(half 0xH0000, <256 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %10 = call fast half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, <vscale x 1 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %11 = call fast half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, <vscale x 2 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %12 = call fast half @llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, <vscale x 4 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %13 = call fast half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %14 = call fast half @llvm.vector.reduce.fadd.nxv16f16(half 0xH0000, <vscale x 16 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %15 = call fast half @llvm.vector.reduce.fadd.nxv32f16(half 0xH0000, <vscale x 32 x half> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %16 = call fast float @llvm.vector.reduce.fadd.v1f32(float 0.000000e+00, <1 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %17 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %18 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %19 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %20 = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %21 = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %22 = call fast float @llvm.vector.reduce.fadd.v64f32(float 0.000000e+00, <64 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %23 = call fast float @llvm.vector.reduce.fadd.v128f32(float 0.000000e+00, <128 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %24 = call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.000000e+00, <vscale x 1 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %25 = call fast float @llvm.vector.reduce.fadd.nxv2f32(float 0.000000e+00, <vscale x 2 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %26 = call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.000000e+00, <vscale x 4 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %27 = call fast float @llvm.vector.reduce.fadd.nxv8f32(float 0.000000e+00, <vscale x 8 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %28 = call fast float @llvm.vector.reduce.fadd.nxv16f32(float 0.000000e+00, <vscale x 16 x float> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %29 = call fast double @llvm.vector.reduce.fadd.v1f64(double 0.000000e+00, <1 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %30 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %31 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %32 = call fast double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %33 = call fast double @llvm.vector.reduce.fadd.v16f64(double 0.000000e+00, <16 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %34 = call fast double @llvm.vector.reduce.fadd.v32f64(double 0.000000e+00, <32 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %35 = call fast double @llvm.vector.reduce.fadd.v64f64(double 0.000000e+00, <64 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %36 = call fast double @llvm.vector.reduce.fadd.nxv1f64(double 0.000000e+00, <vscale x 1 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %37 = call fast double @llvm.vector.reduce.fadd.nxv2f64(double 0.000000e+00, <vscale x 2 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %38 = call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.000000e+00, <vscale x 4 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %39 = call fast double @llvm.vector.reduce.fadd.nxv8f64(double 0.000000e+00, <vscale x 8 x double> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  call fast half @llvm.vector.reduce.fadd.v1f16(half 0.0, <1 x half> undef)
+  call fast half @llvm.vector.reduce.fadd.v2f16(half 0.0, <2 x half> undef)
+  call fast half @llvm.vector.reduce.fadd.v4f16(half 0.0, <4 x half> undef)
+  call fast half @llvm.vector.reduce.fadd.v8f16(half 0.0, <8 x half> undef)
+  call fast half @llvm.vector.reduce.fadd.v16f16(half 0.0, <16 x half> undef)
+  call fast half @llvm.vector.reduce.fadd.v32f16(half 0.0, <32 x half> undef)
+  call fast half @llvm.vector.reduce.fadd.v64f16(half 0.0, <64 x half> undef)
+  call fast half @llvm.vector.reduce.fadd.v128f16(half 0.0, <128 x half> undef)
+  call fast half @llvm.vector.reduce.fadd.v256(half 0.0, <256 x half> undef)
+  call fast half @llvm.vector.reduce.fadd.nxv1f16(half 0.0, <vscale x 1 x half> undef)
+  call fast half @llvm.vector.reduce.fadd.nxv2f16(half 0.0, <vscale x 2 x half> undef)
+  call fast half @llvm.vector.reduce.fadd.nxv4f16(half 0.0, <vscale x 4 x half> undef)
+  call fast half @llvm.vector.reduce.fadd.nxv8f16(half 0.0, <vscale x 8 x half> undef)
+  call fast half @llvm.vector.reduce.fadd.nxv16f16(half 0.0, <vscale x 16 x half> undef)
+  call fast half @llvm.vector.reduce.fadd.nxv32f16(half 0.0, <vscale x 32 x half> undef)
+  call fast float @llvm.vector.reduce.fadd.v1f32(float 0.0, <1 x float> undef)
+  call fast float @llvm.vector.reduce.fadd.v2f32(float 0.0, <2 x float> undef)
+  call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
+  call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef)
+  call fast float @llvm.vector.reduce.fadd.v16f32(float 0.0, <16 x float> undef)
+  call fast float @llvm.vector.reduce.fadd.v32f32(float 0.0, <32 x float> undef)
+  call fast float @llvm.vector.reduce.fadd.v64f32(float 0.0, <64 x float> undef)
+  call fast float @llvm.vector.reduce.fadd.v128f32(float 0.0, <128 x float> undef)
+  call fast float @llvm.vector.reduce.fadd.nxv1f32(float 0.0, <vscale x 1 x float> undef)
+  call fast float @llvm.vector.reduce.fadd.nxv2f32(float 0.0, <vscale x 2 x float> undef)
+  call fast float @llvm.vector.reduce.fadd.nxv4f32(float 0.0, <vscale x 4 x float> undef)
+  call fast float @llvm.vector.reduce.fadd.nxv8f32(float 0.0, <vscale x 8 x float> undef)
+  call fast float @llvm.vector.reduce.fadd.nxv16f32(float 0.0, <vscale x 16 x float> undef)
+  call fast double @llvm.vector.reduce.fadd.v1f64(double 0.0, <1 x double> undef)
+  call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef)
+  call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef)
+  call fast double @llvm.vector.reduce.fadd.v8f64(double 0.0, <8 x double> undef)
+  call fast double @llvm.vector.reduce.fadd.v16f64(double 0.0, <16 x double> undef)
+  call fast double @llvm.vector.reduce.fadd.v32f64(double 0.0, <32 x double> undef)
+  call fast double @llvm.vector.reduce.fadd.v64f64(double 0.0, <64 x double> undef)
+  call fast double @llvm.vector.reduce.fadd.nxv1f64(double 0.0, <vscale x 1 x double> undef)
+  call fast double @llvm.vector.reduce.fadd.nxv2f64(double 0.0, <vscale x 2 x double> undef)
+  call fast double @llvm.vector.reduce.fadd.nxv4f64(double 0.0, <vscale x 4 x double> undef)
+  call fast double @llvm.vector.reduce.fadd.nxv8f64(double 0.0, <vscale x 8 x double> undef)
+  ret void
+}
+
+declare i1 @llvm.vector.reduce.smax.v16i1(<16 x i1>)
+declare i1 @llvm.vector.reduce.smax.v32i1(<32 x i1>)
+declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
+declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
+declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
+declare i8 @llvm.vector.reduce.smax.v256i8(<256 x i8>)
+declare i8 @llvm.vector.reduce.smax.v512i8(<512 x i8>)
+declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
+declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
+declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
+declare i16 @llvm.vector.reduce.smax.v128i16(<128 x i16>)
+declare i16 @llvm.vector.reduce.smax.v256i16(<256 x i16>)
+declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
+declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
+declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
+declare i32 @llvm.vector.reduce.smax.v64i32(<64 x i32>)
+declare i32 @llvm.vector.reduce.smax.v128i32(<128 x i32>)
+declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
+declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
+declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
+declare i64 @llvm.vector.reduce.smax.v32i64(<32 x i64>)
+declare i64 @llvm.vector.reduce.smax.v64i64(<64 x i64>)
+
+define void @reduce_smax() {
+; THROUGHPUT-LABEL: 'reduce_smax'
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call i1 @llvm.vector.reduce.smax.v16i1(<16 x i1> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %2 = call i1 @llvm.vector.reduce.smax.v32i1(<32 x i1> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %3 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %4 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %5 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %6 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %7 = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %8 = call i8 @llvm.vector.reduce.smax.v512i8(<512 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %9 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %10 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %11 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %12 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %13 = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %14 = call i16 @llvm.vector.reduce.smax.v256i16(<256 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %15 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %16 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %17 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %18 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %19 = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %20 = call i32 @llvm.vector.reduce.smax.v128i32(<128 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %21 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 29 for instruction: %22 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %23 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %24 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %25 = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %26 = call i64 @llvm.vector.reduce.smax.v64i64(<64 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %27 = call i1 @llvm.vector.reduce.smax.nxv1i1(<vscale x 1 x i1> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %28 = call i1 @llvm.vector.reduce.smax.nxv2i1(<vscale x 2 x i1> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %29 = call i1 @llvm.vector.reduce.smax.nxv4i1(<vscale x 4 x i1> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %30 = call i1 @llvm.vector.reduce.smax.nxv8i1(<vscale x 8 x i1> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %31 = call i1 @llvm.vector.reduce.smax.nxv16i1(<vscale x 16 x i1> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %32 = call i8 @llvm.vector.reduce.smax.nxv1i8(<vscale x 1 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %33 = call i8 @llvm.vector.reduce.smax.nxv2i8(<vscale x 2 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %34 = call i8 @llvm.vector.reduce.smax.nxv4i8(<vscale x 4 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %35 = call i8 @llvm.vector.reduce.smax.nxv8i8(<vscale x 8 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %36 = call i8 @llvm.vector.reduce.smax.nxv16i8(<vscale x 16 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %37 = call i8 @llvm.vector.reduce.smax.nxv32i8(<vscale x 32 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 78 for instruction: %38 = call i8 @llvm.vector.reduce.smax.nxv64i8(<vscale x 64 x i8> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %39 = call i16 @llvm.vector.reduce.smax.nxv1i16(<vscale x 1 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %40 = call i16 @llvm.vector.reduce.smax.nxv2i16(<vscale x 2 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %41 = call i16 @llvm.vector.reduce.smax.nxv4i16(<vscale x 4 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %42 = call i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %43 = call i16 @llvm.vector.reduce.smax.nxv16i16(<vscale x 16 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 71 for instruction: %44 = call i16 @llvm.vector.reduce.smax.nxv32i16(<vscale x 32 x i16> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %45 = call i32 @llvm.vector.reduce.smax.nxv1i32(<vscale x 1 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %46 = call i32 @llvm.vector.reduce.smax.nxv2i32(<vscale x 2 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %47 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %48 = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 64 for instruction: %49 = call i32 @llvm.vector.reduce.smax.nxv16i32(<vscale x 16 x i32> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 36 for instruction: %50 = call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 43 for instruction: %51 = call i64 @llvm.vector.reduce.smax.nxv2i64(<vscale x 2 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 51 for instruction: %52 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 57 for instruction: %53 = call i64 @llvm.vector.reduce.smax.nxv8i64(<vscale x 8 x i64> undef)
+; THROUGHPUT-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'reduce_smax'
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %1 = call i1 @llvm.vector.reduce.smax.v16i1(<16 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %2 = call i1 @llvm.vector.reduce.smax.v32i1(<32 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %3 = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %4 = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %5 = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %6 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %7 = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %8 = call i8 @llvm.vector.reduce.smax.v512i8(<512 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %9 = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %10 = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %11 = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %12 = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %13 = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %14 = call i16 @llvm.vector.reduce.smax.v256i16(<256 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %15 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %16 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %17 = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %18 = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %19 = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %20 = call i32 @llvm.vector.reduce.smax.v128i32(<128 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %21 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %22 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %23 = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %24 = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %25 = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %26 = call i64 @llvm.vector.reduce.smax.v64i64(<64 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %27 = call i1 @llvm.vector.reduce.smax.nxv1i1(<vscale x 1 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %28 = call i1 @llvm.vector.reduce.smax.nxv2i1(<vscale x 2 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %29 = call i1 @llvm.vector.reduce.smax.nxv4i1(<vscale x 4 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %30 = call i1 @llvm.vector.reduce.smax.nxv8i1(<vscale x 8 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %31 = call i1 @llvm.vector.reduce.smax.nxv16i1(<vscale x 16 x i1> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %32 = call i8 @llvm.vector.reduce.smax.nxv1i8(<vscale x 1 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %33 = call i8 @llvm.vector.reduce.smax.nxv2i8(<vscale x 2 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %34 = call i8 @llvm.vector.reduce.smax.nxv4i8(<vscale x 4 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %35 = call i8 @llvm.vector.reduce.smax.nxv8i8(<vscale x 8 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %36 = call i8 @llvm.vector.reduce.smax.nxv16i8(<vscale x 16 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %37 = call i8 @llvm.vector.reduce.smax.nxv32i8(<vscale x 32 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %38 = call i8 @llvm.vector.reduce.smax.nxv64i8(<vscale x 64 x i8> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %39 = call i16 @llvm.vector.reduce.smax.nxv1i16(<vscale x 1 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %40 = call i16 @llvm.vector.reduce.smax.nxv2i16(<vscale x 2 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %41 = call i16 @llvm.vector.reduce.smax.nxv4i16(<vscale x 4 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %42 = call i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %43 = call i16 @llvm.vector.reduce.smax.nxv16i16(<vscale x 16 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %44 = call i16 @llvm.vector.reduce.smax.nxv32i16(<vscale x 32 x i16> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %45 = call i32 @llvm.vector.reduce.smax.nxv1i32(<vscale x 1 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %46 = call i32 @llvm.vector.reduce.smax.nxv2i32(<vscale x 2 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %47 = call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %48 = call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %49 = call i32 @llvm.vector.reduce.smax.nxv16i32(<vscale x 16 x i32> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %50 = call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %51 = call i64 @llvm.vector.reduce.smax.nxv2i64(<vscale x 2 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %52 = call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %53 = call i64 @llvm.vector.reduce.smax.nxv8i64(<vscale x 8 x i64> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+  call i1 @llvm.vector.reduce.smax.v16i1(<16 x i1> undef)
+  call i1 @llvm.vector.reduce.smax.v32i1(<32 x i1> undef)
+  call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> undef)
+  call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> undef)
+  call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> undef)
+  call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef)
+  call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> undef)
+  call i8 @llvm.vector.reduce.smax.v512i8(<512 x i8> undef)
+  call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> undef)
+  call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> undef)
+  call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> undef)
+  call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> undef)
+  call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> undef)
+  call i16 @llvm.vector.reduce.smax.v256i16(<256 x i16> undef)
+  call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef)
+  call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef)
+  call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> undef)
+  call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> undef)
+  call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> undef)
+  call i32 @llvm.vector.reduce.smax.v128i32(<128 x i32> undef)
+  call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef)
+  call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef)
+  call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> undef)
+  call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> undef)
+  call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> undef)
+  call i64 @llvm.vector.reduce.smax.v64i64(<64 x i64> undef)
+  call i1 @llvm.vector.reduce.smax.nxv1i1(<vscale x 1 x i1> undef)
+  call i1 @llvm.vector.reduce.smax.nxv2i1(<vscale x 2 x i1> undef)
+  call i1 @llvm.vector.reduce.smax.nxv4i1(<vscale x 4 x i1> undef)
+  call i1 @llvm.vector.reduce.smax.nxv8i1(<vscale x 8 x i1> undef)
+  call i1 @llvm.vector.reduce.smax.nxv16i1(<vscale x 16 x i1> undef)
+  call i8 @llvm.vector.reduce.smax.nxv1i8(<vscale x 1 x i8> undef)
+  call i8 @llvm.vector.reduce.smax.nxv2i8(<vscale x 2 x i8> undef)
+  call i8 @llvm.vector.reduce.smax.nxv4i8(<vscale x 4 x i8> undef)
+  call i8 @llvm.vector.reduce.smax.nxv8i8(<vscale x 8 x i8> undef)
+  call i8 @llvm.vector.reduce.smax.nxv16i8(<vscale x 16 x i8> undef)
+  call i8 @llvm.vector.reduce.smax.nxv32i8(<vscale x 32 x i8> undef)
+  call i8 @llvm.vector.reduce.smax.nxv64i8(<vscale x 64 x i8> undef)
+  call i16 @llvm.vector.reduce.smax.nxv1i16(<vscale x 1 x i16> undef)
+  call i16 @llvm.vector.reduce.smax.nxv2i16(<vscale x 2 x i16> undef)
+  call i16 @llvm.vector.reduce.smax.nxv4i16(<vscale x 4 x i16> undef)
+  call i16 @llvm.vector.reduce.smax.nxv8i16(<vscale x 8 x i16> undef)
+  call i16 @llvm.vector.reduce.smax.nxv16i16(<vscale x 16 x i16> undef)
+  call i16 @llvm.vector.reduce.smax.nxv32i16(<vscale x 32 x i16> undef)
+  call i32 @llvm.vector.reduce.smax.nxv1i32(<vscale x 1 x i32> undef)
+  call i32 @llvm.vector.reduce.smax.nxv2i32(<vscale x 2 x i32> undef)
+  call i32 @llvm.vector.reduce.smax.nxv4i32(<vscale x 4 x i32> undef)
+  call i32 @llvm.vector.reduce.smax.nxv8i32(<vscale x 8 x i32> undef)
+  call i32 @llvm.vector.reduce.smax.nxv16i32(<vscale x 16 x i32> undef)
+  call i64 @llvm.vector.reduce.smax.nxv1i64(<vscale x 1 x i64> undef)
+  call i64 @llvm.vector.reduce.smax.nxv2i64(<vscale x 2 x i64> undef)
+  call i64 @llvm.vector.reduce.smax.nxv4i64(<vscale x 4 x i64> undef)
+  call i64 @llvm.vector.reduce.smax.nxv8i64(<vscale x 8 x i64> undef)
+  ret void
+}

>From bdab018aac80b8097ba9ce930eb3f9b065f2d737 Mon Sep 17 00:00:00 2001
From: ShihPo Hung <shihpo.hung at sifive.com>
Date: Sun, 5 May 2024 19:59:19 -0700
Subject: [PATCH 2/2] move `for (auto Op: OCodes)` into `getSiFiveX280RVVCost`
 and common cost into `getRVVBaseCost`

---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp | 143 +++++++++---------
 1 file changed, 72 insertions(+), 71 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 81fc3e81fd1d0f..625b405654cc58 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -34,14 +34,21 @@ static cl::opt<unsigned> SLPMaxVF(
         "exclusively by SLP vectorizer."),
     cl::Hidden);
 
-static std::optional<InstructionCost>
-getSiFiveX280RVVCost(unsigned Op, MVT VT, TTI::TargetCostKind CostKind) {
-  std::optional<InstructionCost> Cost;
-  unsigned VScale = 8;
+static InstructionCost getRVVBaseCost(unsigned Op, MVT VT,
+                                      const RISCVTTIImpl *TTI,
+                                      const RISCVTargetLowering *TLI) {
+  InstructionCost LMULCost = TLI->getLMULCost(VT);
   switch (Op) {
-  default:
-    Cost = std::nullopt;
-    break;
+  case RISCV::VRGATHER_VI:
+    return TLI->getVRGatherVICost(VT);
+  case RISCV::VRGATHER_VV:
+    return TLI->getVRGatherVVCost(VT);
+  case RISCV::VSLIDEUP_VI:
+  case RISCV::VSLIDEDOWN_VI:
+    return TLI->getVSlideVICost(VT);
+  case RISCV::VSLIDEUP_VX:
+  case RISCV::VSLIDEDOWN_VX:
+    return TLI->getVSlideVXCost(VT);
   case RISCV::VREDMAX_VS:
   case RISCV::VREDMIN_VS:
   case RISCV::VREDMAXU_VS:
@@ -55,74 +62,42 @@ getSiFiveX280RVVCost(unsigned Op, MVT VT, TTI::TargetCostKind CostKind) {
   case RISCV::VFREDUSUM_VS: {
     unsigned VL = VT.getVectorMinNumElements();
     if (!VT.isFixedLengthVector())
-      VL *= VScale;
-    // For the cases with small VL, we use a lookup table for accurate
-    // cost estimation.
-    unsigned LookUpSiFive7ReduceLatency[] = {0, 20, 27, 32, 34, 38, 40, 41, 42};
-    if (VL <= 32) {
-      Cost = LookUpSiFive7ReduceLatency[(VL + 3) >> 2];
-      break;
-    }
-    Cost = 6 + 7 * Log2_32_Ceil(VL);
-    break;
+      VL *= *(TTI->getVScaleForTuning());
+    return Log2_32_Ceil(VL);
   }
   case RISCV::VFREDOSUM_VS: {
     unsigned VL = VT.getVectorMinNumElements();
     if (!VT.isFixedLengthVector())
-      VL *= VScale;
-    Cost = VL * 6;
-    break;
+      VL *= *(TTI->getVScaleForTuning());
+    return VL;
   }
   case RISCV::VMV_X_S:
+  case RISCV::VMV_S_X:
   case RISCV::VFMV_F_S:
+  case RISCV::VFMV_S_F:
+  case RISCV::VMOR_MM:
+  case RISCV::VMXOR_MM:
+  case RISCV::VMAND_MM:
+  case RISCV::VMANDN_MM:
+  case RISCV::VMNAND_MM:
   case RISCV::VCPOP_M:
-    /* Vector-to-scalar communication */
-    Cost = 8;
+    return 1;
+  default:
+    return LMULCost;
   }
-  return Cost;
 }
 
-InstructionCost
-RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
-                                      TTI::TargetCostKind CostKind) {
-  // Check if the type is valid for all CostKind
-  if (!VT.isVector())
-    return InstructionCost::getInvalid();
-  size_t NumInstr = OpCodes.size();
-  if (CostKind == TTI::TCK_CodeSize)
-    return NumInstr;
-
-  std::optional<InstructionCost> (*GetTargetCost)(
-      unsigned, MVT, TTI::TargetCostKind) = nullptr;
-  if (ST->getProcFamily() == RISCVSubtarget::SiFive7)
-    GetTargetCost = getSiFiveX280RVVCost;
+static InstructionCost getSiFiveX280RVVCost(ArrayRef<unsigned> OpCodes, MVT VT,
+                                            TTI::TargetCostKind CostKind,
+                                            const RISCVTTIImpl *TTI,
+                                            const RISCVTargetLowering *TLI) {
   InstructionCost LMULCost = TLI->getLMULCost(VT);
   if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
     return LMULCost * NumInstr;
   InstructionCost Cost = 0;
+  unsigned VScale = 8;
   for (auto Op : OpCodes) {
-    std::optional<InstructionCost> OverrideCost =
-        GetTargetCost ? GetTargetCost(Op, VT, CostKind) : std::nullopt;
-    if (OverrideCost) {
-      Cost += *OverrideCost;
-      continue;
-    }
-
     switch (Op) {
-    case RISCV::VRGATHER_VI:
-      Cost += TLI->getVRGatherVICost(VT);
-      break;
-    case RISCV::VRGATHER_VV:
-      Cost += TLI->getVRGatherVVCost(VT);
-      break;
-    case RISCV::VSLIDEUP_VI:
-    case RISCV::VSLIDEDOWN_VI:
-      Cost += TLI->getVSlideVICost(VT);
-      break;
-    case RISCV::VSLIDEUP_VX:
-    case RISCV::VSLIDEDOWN_VX:
-      Cost += TLI->getVSlideVXCost(VT);
-      break;
     case RISCV::VREDMAX_VS:
     case RISCV::VREDMIN_VS:
     case RISCV::VREDMAXU_VS:
@@ -136,36 +111,62 @@ RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
     case RISCV::VFREDUSUM_VS: {
       unsigned VL = VT.getVectorMinNumElements();
       if (!VT.isFixedLengthVector())
-        VL *= *getVScaleForTuning();
-      Cost += Log2_32_Ceil(VL);
+        VL *= VScale;
+      // For the cases with small VL, we use a lookup table for accurate
+      // cost estimation.
+      unsigned LookUpSiFive7ReduceLatency[] = {0,  20, 27, 32, 34,
+                                               38, 40, 41, 42};
+      if (VL <= 32) {
+        Cost += LookUpSiFive7ReduceLatency[(VL + 3) >> 2];
+        break;
+      }
+      Cost += 6 + 7 * Log2_32_Ceil(VL);
       break;
     }
     case RISCV::VFREDOSUM_VS: {
       unsigned VL = VT.getVectorMinNumElements();
       if (!VT.isFixedLengthVector())
-        VL *= *getVScaleForTuning();
-      Cost += VL;
+        VL *= VScale;
+      Cost += VL * 6;
       break;
     }
     case RISCV::VMV_X_S:
-    case RISCV::VMV_S_X:
     case RISCV::VFMV_F_S:
-    case RISCV::VFMV_S_F:
-    case RISCV::VMOR_MM:
-    case RISCV::VMXOR_MM:
-    case RISCV::VMAND_MM:
-    case RISCV::VMANDN_MM:
-    case RISCV::VMNAND_MM:
     case RISCV::VCPOP_M:
-      Cost += 1;
+      /* Vector-to-scalar communication */
+      Cost += 8;
       break;
     default:
-      Cost += LMULCost;
+      Cost += getRVVBaseCost(Op, VT, TTI, TLI);
+      break;
     }
   }
   return Cost;
 }
 
+InstructionCost
+RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
+                                      TTI::TargetCostKind CostKind) {
+  // Check if the type is valid for all CostKind
+  if (!VT.isVector())
+    return InstructionCost::getInvalid();
+  size_t NumInstr = OpCodes.size();
+  if (CostKind == TTI::TCK_CodeSize)
+    return NumInstr;
+
+  if (ST->getProcFamily() == RISCVSubtarget::SiFive7)
+    return getSiFiveX280RVVCost(OpCodes, VT, CostKind, this, TLI);
+
+  InstructionCost LMULCost = TLI->getLMULCost(VT);
+  if ((CostKind != TTI::TCK_RecipThroughput) && (CostKind != TTI::TCK_Latency))
+    return LMULCost * NumInstr;
+  InstructionCost Cost = 0;
+  for (auto Op : OpCodes)
+    Cost += getRVVBaseCost(Op, VT, this, TLI);
+
+  return Cost;
+}
+
 InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                             TTI::TargetCostKind CostKind) {
   assert(Ty->isIntegerTy() &&