[llvm] 20864d2 - [ValueTypes][RISCV] Add v1bf16 type (#111112)
Sun Oct 6 07:20:55 PDT 2024
Author: Luke Lau
Date: 2024-10-06T22:20:51+08:00
New Revision: 20864d2cf610639a70e43aa417f90b457f8e3c90
URL: https://github.com/llvm/llvm-project/commit/20864d2cf610639a70e43aa417f90b457f8e3c90
DIFF: https://github.com/llvm/llvm-project/commit/20864d2cf610639a70e43aa417f90b457f8e3c90.diff
LOG: [ValueTypes][RISCV] Add v1bf16 type (#111112)
When trying to add RISC-V fadd reduction cost model tests for bf16, I
noticed a crash when the vector type was <1 x bfloat>.
It turns out the reduction was being scalarized because, unlike
f16/f32/f64, there is no v1bf16 value type, and the existing cost model
code assumed that the legalized type would always be a vector.
This adds v1bf16 to bring bf16 in line with the other fp types.
It also adds some more RISC-V bf16 reduction tests that previously
crashed, including tests to ensure that SLP won't emit fadd/fmul
reductions for bf16, or for f16 with zvfhmin, after #111000.
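For context, a minimal reproducer in the spirit of the new cost model tests
(the function name below is illustrative and not part of the patch) is a
single-element bf16 fadd reduction:

  declare bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat, <1 x bfloat>)

  define bfloat @fadd_v1bf16(<1 x bfloat> %v) {
    ; Without a v1bf16 MVT the operand was scalarized, and the reduction
    ; cost code crashed because it expected the legalized type to be a
    ; vector. With this patch the query returns an Invalid cost instead.
    %r = call fast bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0.0, <1 x bfloat> %v)
    ret bfloat %r
  }

Running this through something like
  opt -mtriple=riscv64 -mattr=+v,+zfbfmin,+zvfbfmin -passes="print<cost-model>" -disable-output
should now report an Invalid cost for %r rather than crashing, matching the
new checks in reduce-fadd.ll below.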
Added:
Modified:
llvm/include/llvm/CodeGen/ValueTypes.td
llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll
llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll
llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll
llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/ValueTypes.td b/llvm/include/llvm/CodeGen/ValueTypes.td
index ea2c80eaf95836..493c0cfcab60ce 100644
--- a/llvm/include/llvm/CodeGen/ValueTypes.td
+++ b/llvm/include/llvm/CodeGen/ValueTypes.td
@@ -179,157 +179,158 @@ def v128f16 : VTVec<128, f16, 96>; // 128 x f16 vector value
def v256f16 : VTVec<256, f16, 97>; // 256 x f16 vector value
def v512f16 : VTVec<512, f16, 98>; // 512 x f16 vector value
-def v2bf16 : VTVec<2, bf16, 99>; // 2 x bf16 vector value
-def v3bf16 : VTVec<3, bf16, 100>; // 3 x bf16 vector value
-def v4bf16 : VTVec<4, bf16, 101>; // 4 x bf16 vector value
-def v8bf16 : VTVec<8, bf16, 102>; // 8 x bf16 vector value
-def v16bf16 : VTVec<16, bf16, 103>; // 16 x bf16 vector value
-def v32bf16 : VTVec<32, bf16, 104>; // 32 x bf16 vector value
-def v64bf16 : VTVec<64, bf16, 105>; // 64 x bf16 vector value
-def v128bf16 : VTVec<128, bf16, 106>; // 128 x bf16 vector value
-
-def v1f32 : VTVec<1, f32, 107>; // 1 x f32 vector value
-def v2f32 : VTVec<2, f32, 108>; // 2 x f32 vector value
-def v3f32 : VTVec<3, f32, 109>; // 3 x f32 vector value
-def v4f32 : VTVec<4, f32, 110>; // 4 x f32 vector value
-def v5f32 : VTVec<5, f32, 111>; // 5 x f32 vector value
-def v6f32 : VTVec<6, f32, 112>; // 6 x f32 vector value
-def v7f32 : VTVec<7, f32, 113>; // 7 x f32 vector value
-def v8f32 : VTVec<8, f32, 114>; // 8 x f32 vector value
-def v9f32 : VTVec<9, f32, 115>; // 9 x f32 vector value
-def v10f32 : VTVec<10, f32, 116>; // 10 x f32 vector value
-def v11f32 : VTVec<11, f32, 117>; // 11 x f32 vector value
-def v12f32 : VTVec<12, f32, 118>; // 12 x f32 vector value
-def v16f32 : VTVec<16, f32, 119>; // 16 x f32 vector value
-def v32f32 : VTVec<32, f32, 120>; // 32 x f32 vector value
-def v64f32 : VTVec<64, f32, 121>; // 64 x f32 vector value
-def v128f32 : VTVec<128, f32, 122>; // 128 x f32 vector value
-def v256f32 : VTVec<256, f32, 123>; // 256 x f32 vector value
-def v512f32 : VTVec<512, f32, 124>; // 512 x f32 vector value
-def v1024f32 : VTVec<1024, f32, 125>; // 1024 x f32 vector value
-def v2048f32 : VTVec<2048, f32, 126>; // 2048 x f32 vector value
-
-def v1f64 : VTVec<1, f64, 127>; // 1 x f64 vector value
-def v2f64 : VTVec<2, f64, 128>; // 2 x f64 vector value
-def v3f64 : VTVec<3, f64, 129>; // 3 x f64 vector value
-def v4f64 : VTVec<4, f64, 130>; // 4 x f64 vector value
-def v8f64 : VTVec<8, f64, 131>; // 8 x f64 vector value
-def v16f64 : VTVec<16, f64, 132>; // 16 x f64 vector value
-def v32f64 : VTVec<32, f64, 133>; // 32 x f64 vector value
-def v64f64 : VTVec<64, f64, 134>; // 64 x f64 vector value
-def v128f64 : VTVec<128, f64, 135>; // 128 x f64 vector value
-def v256f64 : VTVec<256, f64, 136>; // 256 x f64 vector value
-
-def nxv1i1 : VTScalableVec<1, i1, 137>; // n x 1 x i1 vector value
-def nxv2i1 : VTScalableVec<2, i1, 138>; // n x 2 x i1 vector value
-def nxv4i1 : VTScalableVec<4, i1, 139>; // n x 4 x i1 vector value
-def nxv8i1 : VTScalableVec<8, i1, 140>; // n x 8 x i1 vector value
-def nxv16i1 : VTScalableVec<16, i1, 141>; // n x 16 x i1 vector value
-def nxv32i1 : VTScalableVec<32, i1, 142>; // n x 32 x i1 vector value
-def nxv64i1 : VTScalableVec<64, i1, 143>; // n x 64 x i1 vector value
-
-def nxv1i8 : VTScalableVec<1, i8, 144>; // n x 1 x i8 vector value
-def nxv2i8 : VTScalableVec<2, i8, 145>; // n x 2 x i8 vector value
-def nxv4i8 : VTScalableVec<4, i8, 146>; // n x 4 x i8 vector value
-def nxv8i8 : VTScalableVec<8, i8, 147>; // n x 8 x i8 vector value
-def nxv16i8 : VTScalableVec<16, i8, 148>; // n x 16 x i8 vector value
-def nxv32i8 : VTScalableVec<32, i8, 149>; // n x 32 x i8 vector value
-def nxv64i8 : VTScalableVec<64, i8, 150>; // n x 64 x i8 vector value
-
-def nxv1i16 : VTScalableVec<1, i16, 151>; // n x 1 x i16 vector value
-def nxv2i16 : VTScalableVec<2, i16, 152>; // n x 2 x i16 vector value
-def nxv4i16 : VTScalableVec<4, i16, 153>; // n x 4 x i16 vector value
-def nxv8i16 : VTScalableVec<8, i16, 154>; // n x 8 x i16 vector value
-def nxv16i16 : VTScalableVec<16, i16, 155>; // n x 16 x i16 vector value
-def nxv32i16 : VTScalableVec<32, i16, 156>; // n x 32 x i16 vector value
-
-def nxv1i32 : VTScalableVec<1, i32, 157>; // n x 1 x i32 vector value
-def nxv2i32 : VTScalableVec<2, i32, 158>; // n x 2 x i32 vector value
-def nxv4i32 : VTScalableVec<4, i32, 159>; // n x 4 x i32 vector value
-def nxv8i32 : VTScalableVec<8, i32, 160>; // n x 8 x i32 vector value
-def nxv16i32 : VTScalableVec<16, i32, 161>; // n x 16 x i32 vector value
-def nxv32i32 : VTScalableVec<32, i32, 162>; // n x 32 x i32 vector value
-
-def nxv1i64 : VTScalableVec<1, i64, 163>; // n x 1 x i64 vector value
-def nxv2i64 : VTScalableVec<2, i64, 164>; // n x 2 x i64 vector value
-def nxv4i64 : VTScalableVec<4, i64, 165>; // n x 4 x i64 vector value
-def nxv8i64 : VTScalableVec<8, i64, 166>; // n x 8 x i64 vector value
-def nxv16i64 : VTScalableVec<16, i64, 167>; // n x 16 x i64 vector value
-def nxv32i64 : VTScalableVec<32, i64, 168>; // n x 32 x i64 vector value
-
-def nxv1f16 : VTScalableVec<1, f16, 169>; // n x 1 x f16 vector value
-def nxv2f16 : VTScalableVec<2, f16, 170>; // n x 2 x f16 vector value
-def nxv4f16 : VTScalableVec<4, f16, 171>; // n x 4 x f16 vector value
-def nxv8f16 : VTScalableVec<8, f16, 172>; // n x 8 x f16 vector value
-def nxv16f16 : VTScalableVec<16, f16, 173>; // n x 16 x f16 vector value
-def nxv32f16 : VTScalableVec<32, f16, 174>; // n x 32 x f16 vector value
-
-def nxv1bf16 : VTScalableVec<1, bf16, 175>; // n x 1 x bf16 vector value
-def nxv2bf16 : VTScalableVec<2, bf16, 176>; // n x 2 x bf16 vector value
-def nxv4bf16 : VTScalableVec<4, bf16, 177>; // n x 4 x bf16 vector value
-def nxv8bf16 : VTScalableVec<8, bf16, 178>; // n x 8 x bf16 vector value
-def nxv16bf16 : VTScalableVec<16, bf16, 179>; // n x 16 x bf16 vector value
-def nxv32bf16 : VTScalableVec<32, bf16, 180>; // n x 32 x bf16 vector value
-
-def nxv1f32 : VTScalableVec<1, f32, 181>; // n x 1 x f32 vector value
-def nxv2f32 : VTScalableVec<2, f32, 182>; // n x 2 x f32 vector value
-def nxv4f32 : VTScalableVec<4, f32, 183>; // n x 4 x f32 vector value
-def nxv8f32 : VTScalableVec<8, f32, 184>; // n x 8 x f32 vector value
-def nxv16f32 : VTScalableVec<16, f32, 185>; // n x 16 x f32 vector value
-
-def nxv1f64 : VTScalableVec<1, f64, 186>; // n x 1 x f64 vector value
-def nxv2f64 : VTScalableVec<2, f64, 187>; // n x 2 x f64 vector value
-def nxv4f64 : VTScalableVec<4, f64, 188>; // n x 4 x f64 vector value
-def nxv8f64 : VTScalableVec<8, f64, 189>; // n x 8 x f64 vector value
+def v1bf16 : VTVec<1, bf16, 99>; // 1 x bf16 vector value
+def v2bf16 : VTVec<2, bf16, 100>; // 2 x bf16 vector value
+def v3bf16 : VTVec<3, bf16, 101>; // 3 x bf16 vector value
+def v4bf16 : VTVec<4, bf16, 102>; // 4 x bf16 vector value
+def v8bf16 : VTVec<8, bf16, 103>; // 8 x bf16 vector value
+def v16bf16 : VTVec<16, bf16, 104>; // 16 x bf16 vector value
+def v32bf16 : VTVec<32, bf16, 105>; // 32 x bf16 vector value
+def v64bf16 : VTVec<64, bf16, 106>; // 64 x bf16 vector value
+def v128bf16 : VTVec<128, bf16, 107>; // 128 x bf16 vector value
+
+def v1f32 : VTVec<1, f32, 108>; // 1 x f32 vector value
+def v2f32 : VTVec<2, f32, 109>; // 2 x f32 vector value
+def v3f32 : VTVec<3, f32, 110>; // 3 x f32 vector value
+def v4f32 : VTVec<4, f32, 111>; // 4 x f32 vector value
+def v5f32 : VTVec<5, f32, 112>; // 5 x f32 vector value
+def v6f32 : VTVec<6, f32, 113>; // 6 x f32 vector value
+def v7f32 : VTVec<7, f32, 114>; // 7 x f32 vector value
+def v8f32 : VTVec<8, f32, 115>; // 8 x f32 vector value
+def v9f32 : VTVec<9, f32, 116>; // 9 x f32 vector value
+def v10f32 : VTVec<10, f32, 117>; // 10 x f32 vector value
+def v11f32 : VTVec<11, f32, 118>; // 11 x f32 vector value
+def v12f32 : VTVec<12, f32, 119>; // 12 x f32 vector value
+def v16f32 : VTVec<16, f32, 120>; // 16 x f32 vector value
+def v32f32 : VTVec<32, f32, 121>; // 32 x f32 vector value
+def v64f32 : VTVec<64, f32, 122>; // 64 x f32 vector value
+def v128f32 : VTVec<128, f32, 123>; // 128 x f32 vector value
+def v256f32 : VTVec<256, f32, 124>; // 256 x f32 vector value
+def v512f32 : VTVec<512, f32, 125>; // 512 x f32 vector value
+def v1024f32 : VTVec<1024, f32, 126>; // 1024 x f32 vector value
+def v2048f32 : VTVec<2048, f32, 127>; // 2048 x f32 vector value
+
+def v1f64 : VTVec<1, f64, 128>; // 1 x f64 vector value
+def v2f64 : VTVec<2, f64, 129>; // 2 x f64 vector value
+def v3f64 : VTVec<3, f64, 130>; // 3 x f64 vector value
+def v4f64 : VTVec<4, f64, 131>; // 4 x f64 vector value
+def v8f64 : VTVec<8, f64, 132>; // 8 x f64 vector value
+def v16f64 : VTVec<16, f64, 133>; // 16 x f64 vector value
+def v32f64 : VTVec<32, f64, 134>; // 32 x f64 vector value
+def v64f64 : VTVec<64, f64, 135>; // 64 x f64 vector value
+def v128f64 : VTVec<128, f64, 136>; // 128 x f64 vector value
+def v256f64 : VTVec<256, f64, 137>; // 256 x f64 vector value
+
+def nxv1i1 : VTScalableVec<1, i1, 138>; // n x 1 x i1 vector value
+def nxv2i1 : VTScalableVec<2, i1, 139>; // n x 2 x i1 vector value
+def nxv4i1 : VTScalableVec<4, i1, 140>; // n x 4 x i1 vector value
+def nxv8i1 : VTScalableVec<8, i1, 141>; // n x 8 x i1 vector value
+def nxv16i1 : VTScalableVec<16, i1, 142>; // n x 16 x i1 vector value
+def nxv32i1 : VTScalableVec<32, i1, 143>; // n x 32 x i1 vector value
+def nxv64i1 : VTScalableVec<64, i1, 144>; // n x 64 x i1 vector value
+
+def nxv1i8 : VTScalableVec<1, i8, 145>; // n x 1 x i8 vector value
+def nxv2i8 : VTScalableVec<2, i8, 146>; // n x 2 x i8 vector value
+def nxv4i8 : VTScalableVec<4, i8, 147>; // n x 4 x i8 vector value
+def nxv8i8 : VTScalableVec<8, i8, 148>; // n x 8 x i8 vector value
+def nxv16i8 : VTScalableVec<16, i8, 149>; // n x 16 x i8 vector value
+def nxv32i8 : VTScalableVec<32, i8, 150>; // n x 32 x i8 vector value
+def nxv64i8 : VTScalableVec<64, i8, 151>; // n x 64 x i8 vector value
+
+def nxv1i16 : VTScalableVec<1, i16, 152>; // n x 1 x i16 vector value
+def nxv2i16 : VTScalableVec<2, i16, 153>; // n x 2 x i16 vector value
+def nxv4i16 : VTScalableVec<4, i16, 154>; // n x 4 x i16 vector value
+def nxv8i16 : VTScalableVec<8, i16, 155>; // n x 8 x i16 vector value
+def nxv16i16 : VTScalableVec<16, i16, 156>; // n x 16 x i16 vector value
+def nxv32i16 : VTScalableVec<32, i16, 157>; // n x 32 x i16 vector value
+
+def nxv1i32 : VTScalableVec<1, i32, 158>; // n x 1 x i32 vector value
+def nxv2i32 : VTScalableVec<2, i32, 159>; // n x 2 x i32 vector value
+def nxv4i32 : VTScalableVec<4, i32, 160>; // n x 4 x i32 vector value
+def nxv8i32 : VTScalableVec<8, i32, 161>; // n x 8 x i32 vector value
+def nxv16i32 : VTScalableVec<16, i32, 162>; // n x 16 x i32 vector value
+def nxv32i32 : VTScalableVec<32, i32, 163>; // n x 32 x i32 vector value
+
+def nxv1i64 : VTScalableVec<1, i64, 164>; // n x 1 x i64 vector value
+def nxv2i64 : VTScalableVec<2, i64, 165>; // n x 2 x i64 vector value
+def nxv4i64 : VTScalableVec<4, i64, 166>; // n x 4 x i64 vector value
+def nxv8i64 : VTScalableVec<8, i64, 167>; // n x 8 x i64 vector value
+def nxv16i64 : VTScalableVec<16, i64, 168>; // n x 16 x i64 vector value
+def nxv32i64 : VTScalableVec<32, i64, 169>; // n x 32 x i64 vector value
+
+def nxv1f16 : VTScalableVec<1, f16, 170>; // n x 1 x f16 vector value
+def nxv2f16 : VTScalableVec<2, f16, 171>; // n x 2 x f16 vector value
+def nxv4f16 : VTScalableVec<4, f16, 172>; // n x 4 x f16 vector value
+def nxv8f16 : VTScalableVec<8, f16, 173>; // n x 8 x f16 vector value
+def nxv16f16 : VTScalableVec<16, f16, 174>; // n x 16 x f16 vector value
+def nxv32f16 : VTScalableVec<32, f16, 175>; // n x 32 x f16 vector value
+
+def nxv1bf16 : VTScalableVec<1, bf16, 176>; // n x 1 x bf16 vector value
+def nxv2bf16 : VTScalableVec<2, bf16, 177>; // n x 2 x bf16 vector value
+def nxv4bf16 : VTScalableVec<4, bf16, 178>; // n x 4 x bf16 vector value
+def nxv8bf16 : VTScalableVec<8, bf16, 179>; // n x 8 x bf16 vector value
+def nxv16bf16 : VTScalableVec<16, bf16, 180>; // n x 16 x bf16 vector value
+def nxv32bf16 : VTScalableVec<32, bf16, 181>; // n x 32 x bf16 vector value
+
+def nxv1f32 : VTScalableVec<1, f32, 182>; // n x 1 x f32 vector value
+def nxv2f32 : VTScalableVec<2, f32, 183>; // n x 2 x f32 vector value
+def nxv4f32 : VTScalableVec<4, f32, 184>; // n x 4 x f32 vector value
+def nxv8f32 : VTScalableVec<8, f32, 185>; // n x 8 x f32 vector value
+def nxv16f32 : VTScalableVec<16, f32, 186>; // n x 16 x f32 vector value
+
+def nxv1f64 : VTScalableVec<1, f64, 187>; // n x 1 x f64 vector value
+def nxv2f64 : VTScalableVec<2, f64, 188>; // n x 2 x f64 vector value
+def nxv4f64 : VTScalableVec<4, f64, 189>; // n x 4 x f64 vector value
+def nxv8f64 : VTScalableVec<8, f64, 190>; // n x 8 x f64 vector value
// Sz = NF * MinNumElts * 8(bits)
-def riscv_nxv1i8x2 : VTVecTup<16, 2, i8, 190>; // RISCV vector tuple(min_num_elts=1, nf=2)
-def riscv_nxv1i8x3 : VTVecTup<24, 3, i8, 191>; // RISCV vector tuple(min_num_elts=1, nf=3)
-def riscv_nxv1i8x4 : VTVecTup<32, 4, i8, 192>; // RISCV vector tuple(min_num_elts=1, nf=4)
-def riscv_nxv1i8x5 : VTVecTup<40, 5, i8, 193>; // RISCV vector tuple(min_num_elts=1, nf=5)
-def riscv_nxv1i8x6 : VTVecTup<48, 6, i8, 194>; // RISCV vector tuple(min_num_elts=1, nf=6)
-def riscv_nxv1i8x7 : VTVecTup<56, 7, i8, 195>; // RISCV vector tuple(min_num_elts=1, nf=7)
-def riscv_nxv1i8x8 : VTVecTup<64, 8, i8, 196>; // RISCV vector tuple(min_num_elts=1, nf=8)
-def riscv_nxv2i8x2 : VTVecTup<32, 2, i8, 197>; // RISCV vector tuple(min_num_elts=2, nf=2)
-def riscv_nxv2i8x3 : VTVecTup<48, 3, i8, 198>; // RISCV vector tuple(min_num_elts=2, nf=3)
-def riscv_nxv2i8x4 : VTVecTup<64, 4, i8, 199>; // RISCV vector tuple(min_num_elts=2, nf=4)
-def riscv_nxv2i8x5 : VTVecTup<80, 5, i8, 200>; // RISCV vector tuple(min_num_elts=2, nf=5)
-def riscv_nxv2i8x6 : VTVecTup<96, 6, i8, 201>; // RISCV vector tuple(min_num_elts=2, nf=6)
-def riscv_nxv2i8x7 : VTVecTup<112, 7, i8, 202>; // RISCV vector tuple(min_num_elts=2, nf=7)
-def riscv_nxv2i8x8 : VTVecTup<128, 8, i8, 203>; // RISCV vector tuple(min_num_elts=2, nf=8)
-def riscv_nxv4i8x2 : VTVecTup<64, 2, i8, 204>; // RISCV vector tuple(min_num_elts=4, nf=2)
-def riscv_nxv4i8x3 : VTVecTup<96, 3, i8, 205>; // RISCV vector tuple(min_num_elts=4, nf=3)
-def riscv_nxv4i8x4 : VTVecTup<128, 4, i8, 206>; // RISCV vector tuple(min_num_elts=4, nf=4)
-def riscv_nxv4i8x5 : VTVecTup<160, 5, i8, 207>; // RISCV vector tuple(min_num_elts=4, nf=5)
-def riscv_nxv4i8x6 : VTVecTup<192, 6, i8, 208>; // RISCV vector tuple(min_num_elts=4, nf=6)
-def riscv_nxv4i8x7 : VTVecTup<224, 7, i8, 209>; // RISCV vector tuple(min_num_elts=4, nf=7)
-def riscv_nxv4i8x8 : VTVecTup<256, 8, i8, 210>; // RISCV vector tuple(min_num_elts=4, nf=8)
-def riscv_nxv8i8x2 : VTVecTup<128, 2, i8, 211>; // RISCV vector tuple(min_num_elts=8, nf=2)
-def riscv_nxv8i8x3 : VTVecTup<192, 3, i8, 212>; // RISCV vector tuple(min_num_elts=8, nf=3)
-def riscv_nxv8i8x4 : VTVecTup<256, 4, i8, 213>; // RISCV vector tuple(min_num_elts=8, nf=4)
-def riscv_nxv8i8x5 : VTVecTup<320, 5, i8, 214>; // RISCV vector tuple(min_num_elts=8, nf=5)
-def riscv_nxv8i8x6 : VTVecTup<384, 6, i8, 215>; // RISCV vector tuple(min_num_elts=8, nf=6)
-def riscv_nxv8i8x7 : VTVecTup<448, 7, i8, 216>; // RISCV vector tuple(min_num_elts=8, nf=7)
-def riscv_nxv8i8x8 : VTVecTup<512, 8, i8, 217>; // RISCV vector tuple(min_num_elts=8, nf=8)
-def riscv_nxv16i8x2 : VTVecTup<256, 2, i8, 218>; // RISCV vector tuple(min_num_elts=16, nf=2)
-def riscv_nxv16i8x3 : VTVecTup<384, 3, i8, 219>; // RISCV vector tuple(min_num_elts=16, nf=3)
-def riscv_nxv16i8x4 : VTVecTup<512, 4, i8, 220>; // RISCV vector tuple(min_num_elts=16, nf=4)
-def riscv_nxv32i8x2 : VTVecTup<512, 2, i8, 221>; // RISCV vector tuple(min_num_elts=32, nf=2)
-
-def x86mmx : ValueType<64, 222>; // X86 MMX value
-def Glue : ValueType<0, 223>; // Pre-RA sched glue
-def isVoid : ValueType<0, 224>; // Produces no value
-def untyped : ValueType<8, 225> { // Produces an untyped value
+def riscv_nxv1i8x2 : VTVecTup<16, 2, i8, 191>; // RISCV vector tuple(min_num_elts=1, nf=2)
+def riscv_nxv1i8x3 : VTVecTup<24, 3, i8, 192>; // RISCV vector tuple(min_num_elts=1, nf=3)
+def riscv_nxv1i8x4 : VTVecTup<32, 4, i8, 193>; // RISCV vector tuple(min_num_elts=1, nf=4)
+def riscv_nxv1i8x5 : VTVecTup<40, 5, i8, 194>; // RISCV vector tuple(min_num_elts=1, nf=5)
+def riscv_nxv1i8x6 : VTVecTup<48, 6, i8, 195>; // RISCV vector tuple(min_num_elts=1, nf=6)
+def riscv_nxv1i8x7 : VTVecTup<56, 7, i8, 196>; // RISCV vector tuple(min_num_elts=1, nf=7)
+def riscv_nxv1i8x8 : VTVecTup<64, 8, i8, 197>; // RISCV vector tuple(min_num_elts=1, nf=8)
+def riscv_nxv2i8x2 : VTVecTup<32, 2, i8, 198>; // RISCV vector tuple(min_num_elts=2, nf=2)
+def riscv_nxv2i8x3 : VTVecTup<48, 3, i8, 199>; // RISCV vector tuple(min_num_elts=2, nf=3)
+def riscv_nxv2i8x4 : VTVecTup<64, 4, i8, 200>; // RISCV vector tuple(min_num_elts=2, nf=4)
+def riscv_nxv2i8x5 : VTVecTup<80, 5, i8, 201>; // RISCV vector tuple(min_num_elts=2, nf=5)
+def riscv_nxv2i8x6 : VTVecTup<96, 6, i8, 202>; // RISCV vector tuple(min_num_elts=2, nf=6)
+def riscv_nxv2i8x7 : VTVecTup<112, 7, i8, 203>; // RISCV vector tuple(min_num_elts=2, nf=7)
+def riscv_nxv2i8x8 : VTVecTup<128, 8, i8, 204>; // RISCV vector tuple(min_num_elts=2, nf=8)
+def riscv_nxv4i8x2 : VTVecTup<64, 2, i8, 205>; // RISCV vector tuple(min_num_elts=4, nf=2)
+def riscv_nxv4i8x3 : VTVecTup<96, 3, i8, 206>; // RISCV vector tuple(min_num_elts=4, nf=3)
+def riscv_nxv4i8x4 : VTVecTup<128, 4, i8, 207>; // RISCV vector tuple(min_num_elts=4, nf=4)
+def riscv_nxv4i8x5 : VTVecTup<160, 5, i8, 208>; // RISCV vector tuple(min_num_elts=4, nf=5)
+def riscv_nxv4i8x6 : VTVecTup<192, 6, i8, 209>; // RISCV vector tuple(min_num_elts=4, nf=6)
+def riscv_nxv4i8x7 : VTVecTup<224, 7, i8, 210>; // RISCV vector tuple(min_num_elts=4, nf=7)
+def riscv_nxv4i8x8 : VTVecTup<256, 8, i8, 211>; // RISCV vector tuple(min_num_elts=4, nf=8)
+def riscv_nxv8i8x2 : VTVecTup<128, 2, i8, 212>; // RISCV vector tuple(min_num_elts=8, nf=2)
+def riscv_nxv8i8x3 : VTVecTup<192, 3, i8, 213>; // RISCV vector tuple(min_num_elts=8, nf=3)
+def riscv_nxv8i8x4 : VTVecTup<256, 4, i8, 214>; // RISCV vector tuple(min_num_elts=8, nf=4)
+def riscv_nxv8i8x5 : VTVecTup<320, 5, i8, 215>; // RISCV vector tuple(min_num_elts=8, nf=5)
+def riscv_nxv8i8x6 : VTVecTup<384, 6, i8, 216>; // RISCV vector tuple(min_num_elts=8, nf=6)
+def riscv_nxv8i8x7 : VTVecTup<448, 7, i8, 217>; // RISCV vector tuple(min_num_elts=8, nf=7)
+def riscv_nxv8i8x8 : VTVecTup<512, 8, i8, 218>; // RISCV vector tuple(min_num_elts=8, nf=8)
+def riscv_nxv16i8x2 : VTVecTup<256, 2, i8, 219>; // RISCV vector tuple(min_num_elts=16, nf=2)
+def riscv_nxv16i8x3 : VTVecTup<384, 3, i8, 220>; // RISCV vector tuple(min_num_elts=16, nf=3)
+def riscv_nxv16i8x4 : VTVecTup<512, 4, i8, 221>; // RISCV vector tuple(min_num_elts=16, nf=4)
+def riscv_nxv32i8x2 : VTVecTup<512, 2, i8, 222>; // RISCV vector tuple(min_num_elts=32, nf=2)
+
+def x86mmx : ValueType<64, 223>; // X86 MMX value
+def Glue : ValueType<0, 224>; // Pre-RA sched glue
+def isVoid : ValueType<0, 225>; // Produces no value
+def untyped : ValueType<8, 226> { // Produces an untyped value
let LLVMName = "Untyped";
}
-def funcref : ValueType<0, 226>; // WebAssembly's funcref type
-def externref : ValueType<0, 227>; // WebAssembly's externref type
-def exnref : ValueType<0, 228>; // WebAssembly's exnref type
-def x86amx : ValueType<8192, 229>; // X86 AMX value
-def i64x8 : ValueType<512, 230>; // 8 Consecutive GPRs (AArch64)
+def funcref : ValueType<0, 227>; // WebAssembly's funcref type
+def externref : ValueType<0, 228>; // WebAssembly's externref type
+def exnref : ValueType<0, 229>; // WebAssembly's exnref type
+def x86amx : ValueType<8192, 230>; // X86 AMX value
+def i64x8 : ValueType<512, 231>; // 8 Consecutive GPRs (AArch64)
def aarch64svcount
- : ValueType<16, 231>; // AArch64 predicate-as-counter
-def spirvbuiltin : ValueType<0, 232>; // SPIR-V's builtin type
+ : ValueType<16, 232>; // AArch64 predicate-as-counter
+def spirvbuiltin : ValueType<0, 233>; // SPIR-V's builtin type
let isNormalValueType = false in {
def token : ValueType<0, 504>; // TokenTy
diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
index b3e66ccc705f8f..35619db0b49900 100644
--- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll
@@ -521,7 +521,7 @@ define void @frem() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = frem bfloat undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = frem float undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = frem double undef, undef
-; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = frem <1 x bfloat> undef, undef
+; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1BF16 = frem <1 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2BF16 = frem <2 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4BF16 = frem <4 x bfloat> undef, undef
; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8BF16 = frem <8 x bfloat> undef, undef
@@ -761,7 +761,7 @@ define void @fcopysign() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.copysign.bf16(bfloat undef, bfloat undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.copysign.f32(float undef, float undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.copysign.f64(double undef, double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.copysign.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V1BF16 = call <1 x bfloat> @llvm.copysign.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef)
@@ -889,7 +889,7 @@ define void @fma() {
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.fma.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = call <1 x bfloat> @llvm.fma.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef)
; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef)
diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll
index afb2b644645218..588d852d7f26e2 100644
--- a/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/reduce-fadd.ll
@@ -1,7 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFH
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFHMIN
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck %s --check-prefix=SIZE
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+zfbfmin,+zvfbfmin -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFH
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFHMIN
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+zfbfmin,+zvfbfmin -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck %s --check-prefix=SIZE
+
+define void @reduce_fadd_bfloat() {
+; FP-REDUCE-LABEL: 'reduce_fadd_bfloat'
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Invalid cost for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'reduce_fadd_bfloat'
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Invalid cost for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %V1 = call fast bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0.0, <1 x bfloat> undef)
+ %V2 = call fast bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0.0, <2 x bfloat> undef)
+ %V4 = call fast bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0.0, <4 x bfloat> undef)
+ %V8 = call fast bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0.0, <8 x bfloat> undef)
+ %V16 = call fast bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0.0, <16 x bfloat> undef)
+ %v32 = call fast bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0.0, <32 x bfloat> undef)
+ %V64 = call fast bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0.0, <64 x bfloat> undef)
+ %V128 = call fast bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0.0, <128 x bfloat> undef)
+ ret void
+}
define void @reduce_fadd_half() {
; FP-REDUCE-ZVFH-LABEL: 'reduce_fadd_half'
@@ -116,6 +150,40 @@ define void @reduce_fadd_double() {
ret void
}
+define void @reduce_oredered_fadd_bfloat() {
+; FP-REDUCE-LABEL: 'reduce_oredered_fadd_bfloat'
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2 = call bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V8 = call bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16 = call bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %v32 = call bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 66 for instruction: %V64 = call bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 130 for instruction: %V128 = call bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'reduce_oredered_fadd_bfloat'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2 = call bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8 = call bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16 = call bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v32 = call bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V64 = call bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V128 = call bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %V1 = call bfloat @llvm.vector.reduce.fadd.v1bf16(bfloat 0.0, <1 x bfloat> undef)
+ %V2 = call bfloat @llvm.vector.reduce.fadd.v2bf16(bfloat 0.0, <2 x bfloat> undef)
+ %V4 = call bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0.0, <4 x bfloat> undef)
+ %V8 = call bfloat @llvm.vector.reduce.fadd.v8bf16(bfloat 0.0, <8 x bfloat> undef)
+ %V16 = call bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0.0, <16 x bfloat> undef)
+ %v32 = call bfloat @llvm.vector.reduce.fadd.v32bf16(bfloat 0.0, <32 x bfloat> undef)
+ %V64 = call bfloat @llvm.vector.reduce.fadd.v64bf16(bfloat 0.0, <64 x bfloat> undef)
+ %V128 = call bfloat @llvm.vector.reduce.fadd.v128bf16(bfloat 0.0, <128 x bfloat> undef)
+ ret void
+}
+
define void @reduce_oredered_fadd_half() {
; FP-REDUCE-LABEL: 'reduce_oredered_fadd_half'
; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call half @llvm.vector.reduce.fadd.v1f16(half 0xH0000, <1 x half> undef)
diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll
index da1336aa724c95..913ce40f133da2 100644
--- a/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll
@@ -1,7 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFH
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFHMIN
-; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck %s --check-prefix=SIZE
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+zfbfmin,+zvfbfmin -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFH
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfhmin,+zfbfmin,+zvfbfmin -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s --check-prefixes=FP-REDUCE,FP-REDUCE-ZVFHMIN
+; RUN: opt < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh,+zfbfmin,+zvfbfmin -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck %s --check-prefix=SIZE
+
+define void @reduce_fmul_bfloat() {
+; FP-REDUCE-LABEL: 'reduce_fmul_bfloat'
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fmul.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 541 for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fmul.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 573 for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fmul.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'reduce_fmul_bfloat'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fmul.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 26 for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fmul.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 34 for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fmul.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %V1 = call fast bfloat @llvm.vector.reduce.fmul.v1bf16(bfloat 0.0, <1 x bfloat> undef)
+ %V2 = call fast bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0.0, <2 x bfloat> undef)
+ %V4 = call fast bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0.0, <4 x bfloat> undef)
+ %V8 = call fast bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0.0, <8 x bfloat> undef)
+ %V16 = call fast bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0.0, <16 x bfloat> undef)
+ %v32 = call fast bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0.0, <32 x bfloat> undef)
+ %V64 = call fast bfloat @llvm.vector.reduce.fmul.v64bf16(bfloat 0.0, <64 x bfloat> undef)
+ %V128 = call fast bfloat @llvm.vector.reduce.fmul.v128bf16(bfloat 0.0, <128 x bfloat> undef)
+ ret void
+}
define void @reduce_fmul_half() {
; FP-REDUCE-ZVFH-LABEL: 'reduce_fmul_half'
@@ -116,6 +150,40 @@ define void @reduce_fmul_double() {
ret void
}
+define void @reduce_ordered_fmul_bfloat() {
+; FP-REDUCE-LABEL: 'reduce_ordered_fmul_bfloat'
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call bfloat @llvm.vector.reduce.fmul.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 31 for instruction: %V8 = call bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16 = call bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %v32 = call bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 255 for instruction: %V64 = call bfloat @llvm.vector.reduce.fmul.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 510 for instruction: %V128 = call bfloat @llvm.vector.reduce.fmul.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; SIZE-LABEL: 'reduce_ordered_fmul_bfloat'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1 = call bfloat @llvm.vector.reduce.fmul.v1bf16(bfloat 0xR0000, <1 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2 = call bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0xR0000, <2 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %V4 = call bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0xR0000, <4 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 23 for instruction: %V8 = call bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0xR0000, <8 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16 = call bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0xR0000, <16 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %v32 = call bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0xR0000, <32 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %V64 = call bfloat @llvm.vector.reduce.fmul.v64bf16(bfloat 0xR0000, <64 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 382 for instruction: %V128 = call bfloat @llvm.vector.reduce.fmul.v128bf16(bfloat 0xR0000, <128 x bfloat> undef)
+; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+ %V1 = call bfloat @llvm.vector.reduce.fmul.v1bf16(bfloat 0.0, <1 x bfloat> undef)
+ %V2 = call bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0.0, <2 x bfloat> undef)
+ %V4 = call bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0.0, <4 x bfloat> undef)
+ %V8 = call bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0.0, <8 x bfloat> undef)
+ %V16 = call bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0.0, <16 x bfloat> undef)
+ %v32 = call bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0.0, <32 x bfloat> undef)
+ %V64 = call bfloat @llvm.vector.reduce.fmul.v64bf16(bfloat 0.0, <64 x bfloat> undef)
+ %V128 = call bfloat @llvm.vector.reduce.fmul.v128bf16(bfloat 0.0, <128 x bfloat> undef)
+ ret void
+}
+
define void @reduce_ordered_fmul_half() {
; FP-REDUCE-LABEL: 'reduce_ordered_fmul_half'
; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1 = call half @llvm.vector.reduce.fmul.v1f16(half 0xH0000, <1 x half> undef)
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll
index b953cf1f5bed88..1639f21f243d86 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-vfwmaccbf16.ll
@@ -1,112 +1,24 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfbfwma -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFWMA
; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfbfwma -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFWMA
-; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefixes=ZVFBFMIN,ZVFBMIN32
-; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefixes=ZVFBFMIN,ZVFBMIN64
+; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFMIN
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfbfmin -verify-machineinstrs | FileCheck %s --check-prefix=ZVFBFMIN
define <1 x float> @vfwmaccbf16_vv_v1f32(<1 x float> %a, <1 x bfloat> %b, <1 x bfloat> %c) {
; ZVFBFWMA-LABEL: vfwmaccbf16_vv_v1f32:
; ZVFBFWMA: # %bb.0:
-; ZVFBFWMA-NEXT: addi sp, sp, -16
-; ZVFBFWMA-NEXT: .cfi_def_cfa_offset 16
-; ZVFBFWMA-NEXT: fcvt.s.bf16 fa5, fa0
-; ZVFBFWMA-NEXT: fsw fa5, 8(sp)
-; ZVFBFWMA-NEXT: addi a0, sp, 8
-; ZVFBFWMA-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; ZVFBFWMA-NEXT: vle32.v v9, (a0)
-; ZVFBFWMA-NEXT: fcvt.s.bf16 fa5, fa1
-; ZVFBFWMA-NEXT: fsw fa5, 12(sp)
-; ZVFBFWMA-NEXT: addi a0, sp, 12
-; ZVFBFWMA-NEXT: vle32.v v10, (a0)
-; ZVFBFWMA-NEXT: vfmacc.vv v8, v9, v10
-; ZVFBFWMA-NEXT: addi sp, sp, 16
+; ZVFBFWMA-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v9, v10
; ZVFBFWMA-NEXT: ret
;
-; ZVFBMIN32-LABEL: vfwmaccbf16_vv_v1f32:
-; ZVFBMIN32: # %bb.0:
-; ZVFBMIN32-NEXT: addi sp, sp, -32
-; ZVFBMIN32-NEXT: .cfi_def_cfa_offset 32
-; ZVFBMIN32-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
-; ZVFBMIN32-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
-; ZVFBMIN32-NEXT: fsd fs0, 16(sp) # 8-byte Folded Spill
-; ZVFBMIN32-NEXT: .cfi_offset ra, -4
-; ZVFBMIN32-NEXT: .cfi_offset s0, -8
-; ZVFBMIN32-NEXT: .cfi_offset fs0, -16
-; ZVFBMIN32-NEXT: csrr a0, vlenb
-; ZVFBMIN32-NEXT: sub sp, sp, a0
-; ZVFBMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x20, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 + 1 * vlenb
-; ZVFBMIN32-NEXT: fmv.s fs0, fa0
-; ZVFBMIN32-NEXT: addi a0, sp, 16
-; ZVFBMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFBMIN32-NEXT: fmv.s fa0, fa1
-; ZVFBMIN32-NEXT: call __truncsfbf2
-; ZVFBMIN32-NEXT: fmv.x.w s0, fa0
-; ZVFBMIN32-NEXT: fmv.s fa0, fs0
-; ZVFBMIN32-NEXT: call __truncsfbf2
-; ZVFBMIN32-NEXT: fmv.x.w a0, fa0
-; ZVFBMIN32-NEXT: slli a0, a0, 16
-; ZVFBMIN32-NEXT: sw a0, 8(sp)
-; ZVFBMIN32-NEXT: addi a0, sp, 8
-; ZVFBMIN32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; ZVFBMIN32-NEXT: vle32.v v10, (a0)
-; ZVFBMIN32-NEXT: slli s0, s0, 16
-; ZVFBMIN32-NEXT: sw s0, 12(sp)
-; ZVFBMIN32-NEXT: addi a0, sp, 12
-; ZVFBMIN32-NEXT: vle32.v v9, (a0)
-; ZVFBMIN32-NEXT: addi a0, sp, 16
-; ZVFBMIN32-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFBMIN32-NEXT: vfmacc.vv v8, v10, v9
-; ZVFBMIN32-NEXT: csrr a0, vlenb
-; ZVFBMIN32-NEXT: add sp, sp, a0
-; ZVFBMIN32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
-; ZVFBMIN32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
-; ZVFBMIN32-NEXT: fld fs0, 16(sp) # 8-byte Folded Reload
-; ZVFBMIN32-NEXT: addi sp, sp, 32
-; ZVFBMIN32-NEXT: ret
-;
-; ZVFBMIN64-LABEL: vfwmaccbf16_vv_v1f32:
-; ZVFBMIN64: # %bb.0:
-; ZVFBMIN64-NEXT: addi sp, sp, -64
-; ZVFBMIN64-NEXT: .cfi_def_cfa_offset 64
-; ZVFBMIN64-NEXT: sd ra, 56(sp) # 8-byte Folded Spill
-; ZVFBMIN64-NEXT: sd s0, 48(sp) # 8-byte Folded Spill
-; ZVFBMIN64-NEXT: fsd fs0, 40(sp) # 8-byte Folded Spill
-; ZVFBMIN64-NEXT: .cfi_offset ra, -8
-; ZVFBMIN64-NEXT: .cfi_offset s0, -16
-; ZVFBMIN64-NEXT: .cfi_offset fs0, -24
-; ZVFBMIN64-NEXT: csrr a0, vlenb
-; ZVFBMIN64-NEXT: sub sp, sp, a0
-; ZVFBMIN64-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xc0, 0x00, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 64 + 1 * vlenb
-; ZVFBMIN64-NEXT: fmv.s fs0, fa0
-; ZVFBMIN64-NEXT: addi a0, sp, 32
-; ZVFBMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFBMIN64-NEXT: fmv.s fa0, fa1
-; ZVFBMIN64-NEXT: call __truncsfbf2
-; ZVFBMIN64-NEXT: fmv.x.w s0, fa0
-; ZVFBMIN64-NEXT: fmv.s fa0, fs0
-; ZVFBMIN64-NEXT: call __truncsfbf2
-; ZVFBMIN64-NEXT: fmv.x.w a0, fa0
-; ZVFBMIN64-NEXT: slli a0, a0, 16
-; ZVFBMIN64-NEXT: fmv.w.x fa5, a0
-; ZVFBMIN64-NEXT: fsw fa5, 16(sp)
-; ZVFBMIN64-NEXT: addi a0, sp, 16
-; ZVFBMIN64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; ZVFBMIN64-NEXT: vle32.v v10, (a0)
-; ZVFBMIN64-NEXT: slli s0, s0, 16
-; ZVFBMIN64-NEXT: fmv.w.x fa5, s0
-; ZVFBMIN64-NEXT: fsw fa5, 20(sp)
-; ZVFBMIN64-NEXT: addi a0, sp, 20
-; ZVFBMIN64-NEXT: vle32.v v9, (a0)
-; ZVFBMIN64-NEXT: addi a0, sp, 32
-; ZVFBMIN64-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFBMIN64-NEXT: vfmacc.vv v8, v10, v9
-; ZVFBMIN64-NEXT: csrr a0, vlenb
-; ZVFBMIN64-NEXT: add sp, sp, a0
-; ZVFBMIN64-NEXT: ld ra, 56(sp) # 8-byte Folded Reload
-; ZVFBMIN64-NEXT: ld s0, 48(sp) # 8-byte Folded Reload
-; ZVFBMIN64-NEXT: fld fs0, 40(sp) # 8-byte Folded Reload
-; ZVFBMIN64-NEXT: addi sp, sp, 64
-; ZVFBMIN64-NEXT: ret
+; ZVFBFMIN-LABEL: vfwmaccbf16_vv_v1f32:
+; ZVFBFMIN: # %bb.0:
+; ZVFBFMIN-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v11, v9
+; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v9, v10
+; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFMIN-NEXT: vfmacc.vv v8, v11, v9
+; ZVFBFMIN-NEXT: ret
%b.ext = fpext <1 x bfloat> %b to <1 x float>
%c.ext = fpext <1 x bfloat> %c to <1 x float>
%res = call <1 x float> @llvm.fma.v1f32(<1 x float> %b.ext, <1 x float> %c.ext, <1 x float> %a)
@@ -116,96 +28,22 @@ define <1 x float> @vfwmaccbf16_vv_v1f32(<1 x float> %a, <1 x bfloat> %b, <1 x b
define <1 x float> @vfwmaccbf16_vf_v1f32(<1 x float> %a, bfloat %b, <1 x bfloat> %c) {
; ZVFBFWMA-LABEL: vfwmaccbf16_vf_v1f32:
; ZVFBFWMA: # %bb.0:
-; ZVFBFWMA-NEXT: addi sp, sp, -16
-; ZVFBFWMA-NEXT: .cfi_def_cfa_offset 16
-; ZVFBFWMA-NEXT: fcvt.s.bf16 fa5, fa0
-; ZVFBFWMA-NEXT: fsw fa5, 8(sp)
-; ZVFBFWMA-NEXT: addi a0, sp, 8
-; ZVFBFWMA-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; ZVFBFWMA-NEXT: vle32.v v9, (a0)
-; ZVFBFWMA-NEXT: fcvt.s.bf16 fa5, fa1
-; ZVFBFWMA-NEXT: fsw fa5, 12(sp)
-; ZVFBFWMA-NEXT: addi a0, sp, 12
-; ZVFBFWMA-NEXT: vle32.v v10, (a0)
-; ZVFBFWMA-NEXT: vfmacc.vv v8, v9, v10
-; ZVFBFWMA-NEXT: addi sp, sp, 16
+; ZVFBFWMA-NEXT: fmv.x.h a0, fa0
+; ZVFBFWMA-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFBFWMA-NEXT: vmv.s.x v10, a0
+; ZVFBFWMA-NEXT: vfwmaccbf16.vv v8, v10, v9
; ZVFBFWMA-NEXT: ret
;
-; ZVFBMIN32-LABEL: vfwmaccbf16_vf_v1f32:
-; ZVFBMIN32: # %bb.0:
-; ZVFBMIN32-NEXT: addi sp, sp, -48
-; ZVFBMIN32-NEXT: .cfi_def_cfa_offset 48
-; ZVFBMIN32-NEXT: sw ra, 44(sp) # 4-byte Folded Spill
-; ZVFBMIN32-NEXT: fsd fs0, 32(sp) # 8-byte Folded Spill
-; ZVFBMIN32-NEXT: .cfi_offset ra, -4
-; ZVFBMIN32-NEXT: .cfi_offset fs0, -16
-; ZVFBMIN32-NEXT: csrr a0, vlenb
-; ZVFBMIN32-NEXT: sub sp, sp, a0
-; ZVFBMIN32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb
-; ZVFBMIN32-NEXT: fmv.s fs0, fa0
-; ZVFBMIN32-NEXT: addi a0, sp, 32
-; ZVFBMIN32-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFBMIN32-NEXT: fmv.s fa0, fa1
-; ZVFBMIN32-NEXT: call __truncsfbf2
-; ZVFBMIN32-NEXT: fmv.x.w a0, fa0
-; ZVFBMIN32-NEXT: fmv.x.w a1, fs0
-; ZVFBMIN32-NEXT: slli a1, a1, 16
-; ZVFBMIN32-NEXT: sw a1, 8(sp)
-; ZVFBMIN32-NEXT: addi a1, sp, 8
-; ZVFBMIN32-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; ZVFBMIN32-NEXT: vle32.v v10, (a1)
-; ZVFBMIN32-NEXT: slli a0, a0, 16
-; ZVFBMIN32-NEXT: sw a0, 12(sp)
-; ZVFBMIN32-NEXT: addi a0, sp, 12
-; ZVFBMIN32-NEXT: vle32.v v9, (a0)
-; ZVFBMIN32-NEXT: addi a0, sp, 32
-; ZVFBMIN32-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFBMIN32-NEXT: vfmacc.vv v8, v10, v9
-; ZVFBMIN32-NEXT: csrr a0, vlenb
-; ZVFBMIN32-NEXT: add sp, sp, a0
-; ZVFBMIN32-NEXT: lw ra, 44(sp) # 4-byte Folded Reload
-; ZVFBMIN32-NEXT: fld fs0, 32(sp) # 8-byte Folded Reload
-; ZVFBMIN32-NEXT: addi sp, sp, 48
-; ZVFBMIN32-NEXT: ret
-;
-; ZVFBMIN64-LABEL: vfwmaccbf16_vf_v1f32:
-; ZVFBMIN64: # %bb.0:
-; ZVFBMIN64-NEXT: addi sp, sp, -48
-; ZVFBMIN64-NEXT: .cfi_def_cfa_offset 48
-; ZVFBMIN64-NEXT: sd ra, 40(sp) # 8-byte Folded Spill
-; ZVFBMIN64-NEXT: fsd fs0, 32(sp) # 8-byte Folded Spill
-; ZVFBMIN64-NEXT: .cfi_offset ra, -8
-; ZVFBMIN64-NEXT: .cfi_offset fs0, -16
-; ZVFBMIN64-NEXT: csrr a0, vlenb
-; ZVFBMIN64-NEXT: sub sp, sp, a0
-; ZVFBMIN64-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x30, 0x22, 0x11, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 48 + 1 * vlenb
-; ZVFBMIN64-NEXT: fmv.s fs0, fa0
-; ZVFBMIN64-NEXT: addi a0, sp, 32
-; ZVFBMIN64-NEXT: vs1r.v v8, (a0) # Unknown-size Folded Spill
-; ZVFBMIN64-NEXT: fmv.s fa0, fa1
-; ZVFBMIN64-NEXT: call __truncsfbf2
-; ZVFBMIN64-NEXT: fmv.x.w a0, fa0
-; ZVFBMIN64-NEXT: fmv.x.w a1, fs0
-; ZVFBMIN64-NEXT: slli a1, a1, 16
-; ZVFBMIN64-NEXT: fmv.w.x fa5, a1
-; ZVFBMIN64-NEXT: fsw fa5, 24(sp)
-; ZVFBMIN64-NEXT: addi a1, sp, 24
-; ZVFBMIN64-NEXT: vsetivli zero, 1, e32, mf2, ta, ma
-; ZVFBMIN64-NEXT: vle32.v v10, (a1)
-; ZVFBMIN64-NEXT: slli a0, a0, 16
-; ZVFBMIN64-NEXT: fmv.w.x fa5, a0
-; ZVFBMIN64-NEXT: fsw fa5, 28(sp)
-; ZVFBMIN64-NEXT: addi a0, sp, 28
-; ZVFBMIN64-NEXT: vle32.v v9, (a0)
-; ZVFBMIN64-NEXT: addi a0, sp, 32
-; ZVFBMIN64-NEXT: vl1r.v v8, (a0) # Unknown-size Folded Reload
-; ZVFBMIN64-NEXT: vfmacc.vv v8, v10, v9
-; ZVFBMIN64-NEXT: csrr a0, vlenb
-; ZVFBMIN64-NEXT: add sp, sp, a0
-; ZVFBMIN64-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
-; ZVFBMIN64-NEXT: fld fs0, 32(sp) # 8-byte Folded Reload
-; ZVFBMIN64-NEXT: addi sp, sp, 48
-; ZVFBMIN64-NEXT: ret
+; ZVFBFMIN-LABEL: vfwmaccbf16_vf_v1f32:
+; ZVFBFMIN: # %bb.0:
+; ZVFBFMIN-NEXT: fmv.x.w a0, fa0
+; ZVFBFMIN-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFBFMIN-NEXT: vmv.s.x v10, a0
+; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v11, v10
+; ZVFBFMIN-NEXT: vfwcvtbf16.f.f.v v10, v9
+; ZVFBFMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFBFMIN-NEXT: vfmacc.vv v8, v11, v10
+; ZVFBFMIN-NEXT: ret
%b.head = insertelement <1 x bfloat> poison, bfloat %b, i32 0
%b.splat = shufflevector <1 x bfloat> %b.head, <1 x bfloat> poison, <1 x i32> zeroinitializer
%b.ext = fpext <1 x bfloat> %b.splat to <1 x float>
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
index 151b91184bf428..4cc9a0124337d9 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll
@@ -1,13 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v \
-; RUN: -riscv-v-slp-max-vf=0 -S \
-; RUN: | FileCheck %s --check-prefixes=CHECK,ZVL128
-; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v,+zvl256b \
-; RUN: -riscv-v-slp-max-vf=0 -S \
-; RUN: | FileCheck %s --check-prefixes=CHECK,ZVL256
-; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 -mattr=+v,+zvl512b \
-; RUN: -riscv-v-slp-max-vf=0 -S \
-; RUN: | FileCheck %s --check-prefixes=CHECK,ZVL512
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
+; RUN: -mattr=+v,+zvfhmin,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
+; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
+; RUN: -mattr=+v,+zvfh,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
+; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVL128
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
+; RUN: -mattr=+v,+zvl256b,+zvfh,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
+; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVL256
+; RUN: opt < %s -passes=slp-vectorizer -mtriple=riscv64 \
+; RUN: -mattr=+v,+zvl512b,+zvfh,+zvfbfmin -riscv-v-slp-max-vf=0 -S \
+; RUN: | FileCheck %s --check-prefixes=CHECK,ZVFH,ZVL512
target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128"
target triple = "riscv64"
@@ -332,6 +335,21 @@ entry:
}
define void @reduce_or_2() {
+; ZVFHMIN-LABEL: @reduce_or_2(
+; ZVFHMIN-NEXT: [[TMP1:%.*]] = shl i64 0, 0
+; ZVFHMIN-NEXT: [[TMP2:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison>, i64 [[TMP1]], i32 15
+; ZVFHMIN-NEXT: [[TMP3:%.*]] = icmp ult <16 x i64> [[TMP2]], zeroinitializer
+; ZVFHMIN-NEXT: [[TMP4:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0>, i64 [[TMP1]], i32 6
+; ZVFHMIN-NEXT: [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer
+; ZVFHMIN-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP3]])
+; ZVFHMIN-NEXT: [[TMP7:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[TMP5]])
+; ZVFHMIN-NEXT: [[OP_RDX:%.*]] = or i1 [[TMP6]], [[TMP7]]
+; ZVFHMIN-NEXT: br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]]
+; ZVFHMIN: 8:
+; ZVFHMIN-NEXT: ret void
+; ZVFHMIN: 9:
+; ZVFHMIN-NEXT: ret void
+;
; ZVL128-LABEL: @reduce_or_2(
; ZVL128-NEXT: [[TMP1:%.*]] = shl i64 0, 0
; ZVL128-NEXT: [[TMP2:%.*]] = insertelement <16 x i64> <i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 0, i64 poison>, i64 [[TMP1]], i32 15
@@ -1152,3 +1170,128 @@ define i32 @reduce_sum_2arrays_b(ptr noalias noundef %x, ptr noalias %y) {
%add10.3 = add nuw nsw i32 %add10.2, %conv9.3
ret i32 %add10.3
}
+
+; Shouldn't vectorize to a reduction because we can't promote it
+define bfloat @fadd_4xbf16(ptr %p) {
+; CHECK-LABEL: @fadd_4xbf16(
+; CHECK-NEXT: [[X0:%.*]] = load bfloat, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[P1:%.*]] = getelementptr bfloat, ptr [[P]], i32 1
+; CHECK-NEXT: [[X1:%.*]] = load bfloat, ptr [[P1]], align 2
+; CHECK-NEXT: [[P2:%.*]] = getelementptr bfloat, ptr [[P]], i32 2
+; CHECK-NEXT: [[X2:%.*]] = load bfloat, ptr [[P2]], align 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr bfloat, ptr [[P]], i32 3
+; CHECK-NEXT: [[X3:%.*]] = load bfloat, ptr [[P3]], align 2
+; CHECK-NEXT: [[R0:%.*]] = fadd fast bfloat [[X0]], [[X1]]
+; CHECK-NEXT: [[R1:%.*]] = fadd fast bfloat [[R0]], [[X2]]
+; CHECK-NEXT: [[R2:%.*]] = fadd fast bfloat [[R1]], [[X3]]
+; CHECK-NEXT: ret bfloat [[R2]]
+;
+ %x0 = load bfloat, ptr %p
+ %p1 = getelementptr bfloat, ptr %p, i32 1
+ %x1 = load bfloat, ptr %p1
+ %p2 = getelementptr bfloat, ptr %p, i32 2
+ %x2 = load bfloat, ptr %p2
+ %p3 = getelementptr bfloat, ptr %p, i32 3
+ %x3 = load bfloat, ptr %p3
+
+ %r0 = fadd fast bfloat %x0, %x1
+ %r1 = fadd fast bfloat %r0, %x2
+ %r2 = fadd fast bfloat %r1, %x3
+
+ ret bfloat %r2
+}
+
+; Shouldn't vectorize to a reduction because there's no vfred{u,o}mul.vs
+define bfloat @fmul_4xbf16(ptr %p) {
+; CHECK-LABEL: @fmul_4xbf16(
+; CHECK-NEXT: [[X0:%.*]] = load bfloat, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[P1:%.*]] = getelementptr bfloat, ptr [[P]], i32 1
+; CHECK-NEXT: [[X1:%.*]] = load bfloat, ptr [[P1]], align 2
+; CHECK-NEXT: [[P2:%.*]] = getelementptr bfloat, ptr [[P]], i32 2
+; CHECK-NEXT: [[X2:%.*]] = load bfloat, ptr [[P2]], align 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr bfloat, ptr [[P]], i32 3
+; CHECK-NEXT: [[X3:%.*]] = load bfloat, ptr [[P3]], align 2
+; CHECK-NEXT: [[R0:%.*]] = fmul fast bfloat [[X0]], [[X1]]
+; CHECK-NEXT: [[R1:%.*]] = fmul fast bfloat [[R0]], [[X2]]
+; CHECK-NEXT: [[R2:%.*]] = fmul fast bfloat [[R1]], [[X3]]
+; CHECK-NEXT: ret bfloat [[R2]]
+;
+ %x0 = load bfloat, ptr %p
+ %p1 = getelementptr bfloat, ptr %p, i32 1
+ %x1 = load bfloat, ptr %p1
+ %p2 = getelementptr bfloat, ptr %p, i32 2
+ %x2 = load bfloat, ptr %p2
+ %p3 = getelementptr bfloat, ptr %p, i32 3
+ %x3 = load bfloat, ptr %p3
+
+ %r0 = fmul fast bfloat %x0, %x1
+ %r1 = fmul fast bfloat %r0, %x2
+ %r2 = fmul fast bfloat %r1, %x3
+
+ ret bfloat %r2
+}
+
+; Shouldn't vectorize to a reduction on zvfhmin because we can't promote it
+define half @fadd_4xf16(ptr %p) {
+; ZVFHMIN-LABEL: @fadd_4xf16(
+; ZVFHMIN-NEXT: [[X0:%.*]] = load half, ptr [[P:%.*]], align 2
+; ZVFHMIN-NEXT: [[P1:%.*]] = getelementptr half, ptr [[P]], i32 1
+; ZVFHMIN-NEXT: [[X1:%.*]] = load half, ptr [[P1]], align 2
+; ZVFHMIN-NEXT: [[P2:%.*]] = getelementptr half, ptr [[P]], i32 2
+; ZVFHMIN-NEXT: [[X2:%.*]] = load half, ptr [[P2]], align 2
+; ZVFHMIN-NEXT: [[P3:%.*]] = getelementptr half, ptr [[P]], i32 3
+; ZVFHMIN-NEXT: [[X3:%.*]] = load half, ptr [[P3]], align 2
+; ZVFHMIN-NEXT: [[R0:%.*]] = fadd fast half [[X0]], [[X1]]
+; ZVFHMIN-NEXT: [[R1:%.*]] = fadd fast half [[R0]], [[X2]]
+; ZVFHMIN-NEXT: [[R2:%.*]] = fadd fast half [[R1]], [[X3]]
+; ZVFHMIN-NEXT: ret half [[R2]]
+;
+; ZVFH-LABEL: @fadd_4xf16(
+; ZVFH-NEXT: [[TMP1:%.*]] = load <4 x half>, ptr [[P:%.*]], align 2
+; ZVFH-NEXT: [[TMP2:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP1]])
+; ZVFH-NEXT: ret half [[TMP2]]
+;
+ %x0 = load half, ptr %p
+ %p1 = getelementptr half, ptr %p, i32 1
+ %x1 = load half, ptr %p1
+ %p2 = getelementptr half, ptr %p, i32 2
+ %x2 = load half, ptr %p2
+ %p3 = getelementptr half, ptr %p, i32 3
+ %x3 = load half, ptr %p3
+
+ %r0 = fadd fast half %x0, %x1
+ %r1 = fadd fast half %r0, %x2
+ %r2 = fadd fast half %r1, %x3
+
+ ret half %r2
+}
+
+; Shouldn't vectorize to a reduction because there's no vfred{u,o}mul.vs
+define half @fmul_4xf16(ptr %p) {
+; CHECK-LABEL: @fmul_4xf16(
+; CHECK-NEXT: [[X0:%.*]] = load half, ptr [[P:%.*]], align 2
+; CHECK-NEXT: [[P1:%.*]] = getelementptr half, ptr [[P]], i32 1
+; CHECK-NEXT: [[X1:%.*]] = load half, ptr [[P1]], align 2
+; CHECK-NEXT: [[P2:%.*]] = getelementptr half, ptr [[P]], i32 2
+; CHECK-NEXT: [[X2:%.*]] = load half, ptr [[P2]], align 2
+; CHECK-NEXT: [[P3:%.*]] = getelementptr half, ptr [[P]], i32 3
+; CHECK-NEXT: [[X3:%.*]] = load half, ptr [[P3]], align 2
+; CHECK-NEXT: [[R0:%.*]] = fmul fast half [[X0]], [[X1]]
+; CHECK-NEXT: [[R1:%.*]] = fmul fast half [[R0]], [[X2]]
+; CHECK-NEXT: [[R2:%.*]] = fmul fast half [[R1]], [[X3]]
+; CHECK-NEXT: ret half [[R2]]
+;
+ %x0 = load half, ptr %p
+ %p1 = getelementptr half, ptr %p, i32 1
+ %x1 = load half, ptr %p1
+ %p2 = getelementptr half, ptr %p, i32 2
+ %x2 = load half, ptr %p2
+ %p3 = getelementptr half, ptr %p, i32 3
+ %x3 = load half, ptr %p3
+
+ %r0 = fmul fast half %x0, %x1
+ %r1 = fmul fast half %r0, %x2
+ %r2 = fmul fast half %r1, %x3
+
+ ret half %r2
+}