[llvm] [RISCV] Cost ordered bf16/f16 w/ zvfhmin reductions as invalid (PR #114250)

Wed Oct 30 08:42:57 PDT 2024

https://github.com/lukel97 created https://github.com/llvm/llvm-project/pull/114250

This moves the check earlier to fix this.

This also uses BasicTTIImpl instead which now assigns a valid but expensive cost for fixed-length vectors, which reflects how codegen will actually scalarize them.


>From 26fd9ac3730c8cac7c75240c824c38f3ca8e9fdf Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Wed, 30 Oct 2024 23:33:50 +0800
Subject: [PATCH] [RISCV] Cost ordered bf16/f16 w/ zvfhmin reductions as
 invalid

This moves the check earlier to fix this.

This also uses BasicTTIImpl instead which now assigns a valid but expensive cost for fixed-length vectors, which reflects how codegen will actually scalarize them.
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  15 +-
 .../Analysis/CostModel/RISCV/reduce-fadd.ll   | 151 ++++++++++--------
 2 files changed, 93 insertions(+), 73 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 395baa5f1aab99..d879a13d877ba1 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -1523,9 +1523,17 @@ RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
       ISD != ISD::FADD)
     return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
 
+  Type *ElementTy = Ty->getElementType();
+  // We can't promote f16/bf16 fadd reductions.
+  if (ISD == ISD::FADD &&
+      ((ElementTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
+       ElementTy->isBFloatTy())) {
+    // We can't lower scalable vectors, but we can expand fixed vectors.
+    return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
+  }
+
   std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
   SmallVector<unsigned, 3> Opcodes;
-  Type *ElementTy = Ty->getElementType();
   if (ElementTy->isIntegerTy(1)) {
     if (ISD == ISD::AND) {
       // Example sequences:
@@ -1578,11 +1586,6 @@ RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
     Opcodes = {RISCV::VMV_S_X, RISCV::VREDAND_VS, RISCV::VMV_X_S};
     break;
   case ISD::FADD:
-    // We can't promote f16/bf16 fadd reductions.
-    if ((LT.second.getVectorElementType() == MVT::f16 &&
-         !ST->hasVInstructionsF16()) ||
-        LT.second.getVectorElementType() == MVT::bf16)
-      return InstructionCost::getInvalid();
     SplitOp = RISCV::VFADD_VV;
     Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
     break;
diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll
index 196e7376677a54..1762f701a9b2d5 100644
--- a/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll
@@ -5,14 +5,14 @@
 
 define void @reduce_fadd_bfloat() {
 ; FP-REDUCE-LABEL: 'reduce_fadd_bfloat'
-; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 211 for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 541 for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 573 for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
 ; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %NXV1 = call fast bfloat @llvm.vector.reduce.fadd.nxv1bf16(bfloat 0xR0000, <vscale x 1 x bfloat> undef)
 ; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %NXV2 = call fast bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat 0xR0000, <vscale x 2 x bfloat> undef)
 ; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %NXV4 = call fast bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat 0xR0000, <vscale x 4 x bfloat> undef)
@@ -22,14 +22,14 @@ define void @reduce_fadd_bfloat() {
 ; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SIZE-LABEL: 'reduce_fadd_bfloat'
-; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 21 for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 26 for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
 ; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %NXV1 = call fast bfloat @llvm.vector.reduce.fadd.nxv1bf16(bfloat 0xR0000, <vscale x 1 x bfloat> undef)
 ; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %NXV2 = call fast bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat 0xR0000, <vscale x 2 x bfloat> undef)
 ; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %NXV4 = call fast bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat 0xR0000, <vscale x 4 x bfloat> undef)
@@ -74,14 +74,14 @@ define void @reduce_fadd_half() {
 ; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; FP-REDUCE-ZVFHMIN-LABEL: 'reduce_fadd_half'
-; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %V1 = call fast half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
-; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %V2 = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
-; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %V4 = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %V8 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %V16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
-; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %v32 = call fast half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
-; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %V64 = call fast half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
-; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %V128 = call fast half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V1 = call fast half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V2 = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 28 for instruction: %V8 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 73 for instruction: %V16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 211 for instruction: %v32 = call fast half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 541 for instruction: %V64 = call fast half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 573 for instruction: %V128 = call fast half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
 ; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %NXV1 = call fast half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, <vscale x 1 x half> undef)
 ; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %NXV2 = call fast half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, <vscale x 2 x half> undef)
 ; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %NXV4 = call fast half @llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, <vscale x 4 x half> undef)
@@ -222,36 +222,36 @@ define void @reduce_fadd_double() {
 define void @reduce_ordered_fadd_bfloat() {
 ; FP-REDUCE-LABEL: 'reduce_ordered_fadd_bfloat'
 ; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v32 = call bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64 = call bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V128 = call bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV1 = call bfloat @llvm.vector.reduce.fadd.nxv1bf16(bfloat 0xR0000, <vscale x 1 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %NXV2 = call bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat 0xR0000, <vscale x 2 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %NXV4 = call bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat 0xR0000, <vscale x 4 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %NXV8 = call bfloat @llvm.vector.reduce.fadd.nxv8bf16(bfloat 0xR0000, <vscale x 8 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %NXV16 = call bfloat @llvm.vector.reduce.fadd.nxv16bf16(bfloat 0xR0000, <vscale x 16 x bfloat> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %NXV32 = call bfloat @llvm.vector.reduce.fadd.nxv32bf16(bfloat 0xR0000, <vscale x 32 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8 = call bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V16 = call bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v32 = call bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %V64 = call bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 510 for instruction: %V128 = call bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %NXV1 = call bfloat @llvm.vector.reduce.fadd.nxv1bf16(bfloat 0xR0000, <vscale x 1 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %NXV2 = call bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat 0xR0000, <vscale x 2 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %NXV4 = call bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat 0xR0000, <vscale x 4 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %NXV8 = call bfloat @llvm.vector.reduce.fadd.nxv8bf16(bfloat 0xR0000, <vscale x 8 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %NXV16 = call bfloat @llvm.vector.reduce.fadd.nxv16bf16(bfloat 0xR0000, <vscale x 16 x bfloat> undef)
+; FP-REDUCE-NEXT:  Cost Model: Invalid cost for instruction: %NXV32 = call bfloat @llvm.vector.reduce.fadd.nxv32bf16(bfloat 0xR0000, <vscale x 32 x bfloat> undef)
 ; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SIZE-LABEL: 'reduce_ordered_fadd_bfloat'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V2 = call bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V4 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8 = call bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16 = call bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %v32 = call bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V64 = call bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V128 = call bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %NXV1 = call bfloat @llvm.vector.reduce.fadd.nxv1bf16(bfloat 0xR0000, <vscale x 1 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %NXV2 = call bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat 0xR0000, <vscale x 2 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %NXV4 = call bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat 0xR0000, <vscale x 4 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %NXV8 = call bfloat @llvm.vector.reduce.fadd.nxv8bf16(bfloat 0xR0000, <vscale x 8 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %NXV16 = call bfloat @llvm.vector.reduce.fadd.nxv16bf16(bfloat 0xR0000, <vscale x 16 x bfloat> undef)
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %NXV32 = call bfloat @llvm.vector.reduce.fadd.nxv32bf16(bfloat 0xR0000, <vscale x 32 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V1 = call bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2 = call bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 11 for instruction: %V4 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 23 for instruction: %V8 = call bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %V16 = call bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %v32 = call bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %V64 = call bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 382 for instruction: %V128 = call bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %NXV1 = call bfloat @llvm.vector.reduce.fadd.nxv1bf16(bfloat 0xR0000, <vscale x 1 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %NXV2 = call bfloat @llvm.vector.reduce.fadd.nxv2bf16(bfloat 0xR0000, <vscale x 2 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %NXV4 = call bfloat @llvm.vector.reduce.fadd.nxv4bf16(bfloat 0xR0000, <vscale x 4 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %NXV8 = call bfloat @llvm.vector.reduce.fadd.nxv8bf16(bfloat 0xR0000, <vscale x 8 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %NXV16 = call bfloat @llvm.vector.reduce.fadd.nxv16bf16(bfloat 0xR0000, <vscale x 16 x bfloat> undef)
+; SIZE-NEXT:  Cost Model: Invalid cost for instruction: %NXV32 = call bfloat @llvm.vector.reduce.fadd.nxv32bf16(bfloat 0xR0000, <vscale x 32 x bfloat> undef)
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %V1 = call bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0.0, <1 x bfloat> undef)
@@ -272,22 +272,39 @@ define void @reduce_ordered_fadd_bfloat() {
 }
 
 define void @reduce_ordered_fadd_half() {
-; FP-REDUCE-LABEL: 'reduce_ordered_fadd_half'
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v32 = call half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64 = call half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V128 = call half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV1 = call half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, <vscale x 1 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %NXV2 = call half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, <vscale x 2 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %NXV4 = call half @llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, <vscale x 4 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %NXV8 = call half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %NXV16 = call half @llvm.vector.reduce.fadd.nxv16f16(half 0xH0000, <vscale x 16 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %NXV32 = call half @llvm.vector.reduce.fadd.nxv32f16(half 0xH0000, <vscale x 32 x half> undef)
-; FP-REDUCE-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+; FP-REDUCE-ZVFH-LABEL: 'reduce_ordered_fadd_half'
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V4 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %V16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %v32 = call half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %V64 = call half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 130 for instruction: %V128 = call half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %NXV1 = call half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, <vscale x 1 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %NXV2 = call half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, <vscale x 2 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %NXV4 = call half @llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, <vscale x 4 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 18 for instruction: %NXV8 = call half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 34 for instruction: %NXV16 = call half @llvm.vector.reduce.fadd.nxv16f16(half 0xH0000, <vscale x 16 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 66 for instruction: %NXV32 = call half @llvm.vector.reduce.fadd.nxv32f16(half 0xH0000, <vscale x 32 x half> undef)
+; FP-REDUCE-ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; FP-REDUCE-ZVFHMIN-LABEL: 'reduce_ordered_fadd_half'
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 7 for instruction: %V2 = call half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 15 for instruction: %V4 = call half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 31 for instruction: %V8 = call half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 63 for instruction: %V16 = call half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 127 for instruction: %v32 = call half @llvm.vector.reduce.fadd.v32f16(half 0xH0000, <32 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 255 for instruction: %V64 = call half @llvm.vector.reduce.fadd.v64f16(half 0xH0000, <64 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 510 for instruction: %V128 = call half @llvm.vector.reduce.fadd.v128f16(half 0xH0000, <128 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %NXV1 = call half @llvm.vector.reduce.fadd.nxv1f16(half 0xH0000, <vscale x 1 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %NXV2 = call half @llvm.vector.reduce.fadd.nxv2f16(half 0xH0000, <vscale x 2 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %NXV4 = call half @llvm.vector.reduce.fadd.nxv4f16(half 0xH0000, <vscale x 4 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %NXV8 = call half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %NXV16 = call half @llvm.vector.reduce.fadd.nxv16f16(half 0xH0000, <vscale x 16 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Invalid cost for instruction: %NXV32 = call half @llvm.vector.reduce.fadd.nxv32f16(half 0xH0000, <vscale x 32 x half> undef)
+; FP-REDUCE-ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SIZE-LABEL: 'reduce_ordered_fadd_half'
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V1 = call half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)