[llvm] [RISCV] Enable scalable loop vectorization for zvfhmin/zvfbfmin (PR #115272)
Luke Lau via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 6 22:45:58 PST 2024
https://github.com/lukel97 created https://github.com/llvm/llvm-project/pull/115272
This PR enables scalable loop vectorization for f16 with zvfhmin and bf16 with zvfbfmin.
Enabling this depended on filling in the gaps in scalable zvfhmin/zvfbfmin codegen, but everything that the loop vectorizer might emit should now be handled.
It does this by marking f16 and bf16 as legal in `isLegalElementTypeForRVV`. Several users of `isLegalElementTypeForRVV` have already been enabled in other PRs:
- `isLegalStridedLoadStore` #115264
- `isLegalInterleavedAccessType` #115257
- `isLegalMaskedLoadStore` #115145
- `isLegalMaskedGatherScatter` #114945
The remaining user is `isLegalToVectorizeReduction`. We can't promote f16/bf16 fadd reductions to f32, so they need to stay disabled for scalable vectors. The cost model already marks these as having an invalid cost, but for out-of-tree reductions `ComputeReductionResult` doesn't actually get costed and would end up emitting a reduction intrinsic regardless, so we still need to mark them as illegal.
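To make the legality logic above concrete, here is a minimal standalone C++ sketch of the two decisions involved. It is not the real LLVM interface: the subtarget feature queries are reduced to plain booleans and the type/recurrence enums are local stand-ins, but the shape of the checks mirrors the actual changes to `RISCVISelLowering.cpp` and `RISCVTargetTransformInfo.h` in the diff below.

```cpp
// Standalone model of the legality decisions (not the actual LLVM classes).
// Subtarget feature queries are modeled as plain booleans; the real code is
// in RISCVTargetLowering::isLegalElementTypeForRVV and
// RISCVTTIImpl::isLegalToVectorizeReduction.
#include <cstdio>

struct Subtarget {
  bool HasVInstructionsF16Minimal;  // zvfhmin (implied by zvfh)
  bool HasVInstructionsF16;         // zvfh: full f16 vector arithmetic
  bool HasVInstructionsBF16Minimal; // zvfbfmin
};

enum class ElemTy { F16, BF16, F32, F64 };
enum class RecurKind { Add, FAdd, FMulAdd };

// After this patch, f16/bf16 only need the *Minimal extensions to be legal
// RVV element types: loads, stores and widening converts are all the loop
// vectorizer needs from them.
bool isLegalElementTypeForRVV(ElemTy Ty, const Subtarget &ST) {
  switch (Ty) {
  case ElemTy::F16:  return ST.HasVInstructionsF16Minimal;
  case ElemTy::BF16: return ST.HasVInstructionsBF16Minimal;
  default:           return true; // other element types are unchanged here
  }
}

// fadd/fmuladd reductions of bf16, or of f16 without zvfh, can't be promoted
// to f32, and scalable reductions can't be expanded, so they remain illegal
// for scalable vectorization (fixed-length vectorization still applies).
bool isLegalToVectorizeScalableReduction(RecurKind Kind, ElemTy Ty,
                                         const Subtarget &ST) {
  if ((Kind == RecurKind::FAdd || Kind == RecurKind::FMulAdd) &&
      (Ty == ElemTy::BF16 ||
       (Ty == ElemTy::F16 && !ST.HasVInstructionsF16)))
    return false;
  return isLegalElementTypeForRVV(Ty, ST);
}

int main() {
  Subtarget Zvfbfmin{/*F16Min=*/false, /*F16=*/false, /*BF16Min=*/true};
  std::printf("bf16 element legal:             %d\n",
              isLegalElementTypeForRVV(ElemTy::BF16, Zvfbfmin));
  std::printf("bf16 fadd reduction (scalable): %d\n",
              isLegalToVectorizeScalableReduction(RecurKind::FAdd,
                                                  ElemTy::BF16, Zvfbfmin));
}
```

In the real patch the reduction restriction only applies to scalable vectorization; the new tests in `scalable-reductions.ll` check that these reductions still vectorize with fixed-length vectors.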
After this PR, the following loop compiled with `-march=rv64gv_zvfbfwma -O2`:
```c
void f(float * restrict dst, __bf16 * restrict a, __bf16 * restrict b, int n) {
for (int i = 0; i < n; i++)
dst[i] += ((float)a[i] * (float)b[i]);
}
```
Goes from this in llvm-19.1.0:
```asm
.LBB0_4:
addi t1, t0, 16
vsetivli zero, 8, e16, m1, ta, ma
vle16.v v8, (t0)
vle16.v v9, (t1)
addi t1, a6, 32
addi t2, a7, 16
vfwcvtbf16.f.f.v v10, v8
vfwcvtbf16.f.f.v v12, v9
vle16.v v8, (a7)
vle16.v v9, (t2)
vle32.v v14, (a6)
vle32.v v16, (t1)
vfwcvtbf16.f.f.v v18, v8
vfwcvtbf16.f.f.v v20, v9
vsetvli zero, zero, e32, m2, ta, ma
vfmacc.vv v14, v10, v18
vfmacc.vv v16, v12, v20
vse32.v v14, (a6)
vse32.v v16, (t1)
addi t0, t0, 32
addi a7, a7, 32
addi a6, a6, 64
bne t0, a5, .LBB0_4
```
To:
```asm
vsetvli t4, zero, e16, m1, ta, ma
.LBB0_4:
vl1re16.v v8, (t3)
vl1re16.v v9, (t2)
vl2re32.v v10, (t1)
vfwmaccbf16.vv v10, v8, v9
vs2r.v v10, (t1)
add t3, t3, a4
add t2, t2, a4
sub t0, t0, a6
add t1, t1, a7
bnez t0, .LBB0_4
```
From ba94220a0d8e4232dd9e6cc73615f985f96cec80 Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 4 Nov 2024 10:52:47 +0800
Subject: [PATCH 1/2] Precommit tests
---
.../Transforms/LoopVectorize/RISCV/bf16.ll | 231 ++++++++++++++++++
.../Transforms/LoopVectorize/RISCV/f16.ll | 88 +++++++
2 files changed, 319 insertions(+)
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
new file mode 100644
index 00000000000000..487f6cd7e89629
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
@@ -0,0 +1,231 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=NO-ZVFBFMIN
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -S | FileCheck %s -check-prefix=ZVFBFMIN
+
+define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
+; NO-ZVFBFMIN-LABEL: define void @fadd(
+; NO-ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-ZVFBFMIN-NEXT: [[ENTRY:.*]]:
+; NO-ZVFBFMIN-NEXT: br label %[[LOOP:.*]]
+; NO-ZVFBFMIN: [[LOOP]]:
+; NO-ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; NO-ZVFBFMIN-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; NO-ZVFBFMIN-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; NO-ZVFBFMIN-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
+; NO-ZVFBFMIN-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
+; NO-ZVFBFMIN-NEXT: [[Z:%.*]] = fadd bfloat [[X]], [[Y]]
+; NO-ZVFBFMIN-NEXT: store bfloat [[Z]], ptr [[A_GEP]], align 2
+; NO-ZVFBFMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1
+; NO-ZVFBFMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; NO-ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-ZVFBFMIN: [[EXIT]]:
+; NO-ZVFBFMIN-NEXT: ret void
+;
+; ZVFBFMIN-LABEL: define void @fadd(
+; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; ZVFBFMIN-NEXT: [[ENTRY:.*]]:
+; ZVFBFMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; ZVFBFMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; ZVFBFMIN: [[VECTOR_PH]]:
+; ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; ZVFBFMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; ZVFBFMIN-NEXT: br label %[[VECTOR_BODY:.*]]
+; ZVFBFMIN: [[VECTOR_BODY]]:
+; ZVFBFMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ZVFBFMIN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; ZVFBFMIN-NEXT: [[TMP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP0]]
+; ZVFBFMIN-NEXT: [[TMP2:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP0]]
+; ZVFBFMIN-NEXT: [[TMP3:%.*]] = getelementptr bfloat, ptr [[TMP1]], i32 0
+; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <16 x bfloat>, ptr [[TMP3]], align 2
+; ZVFBFMIN-NEXT: [[TMP4:%.*]] = getelementptr bfloat, ptr [[TMP2]], i32 0
+; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x bfloat>, ptr [[TMP4]], align 2
+; ZVFBFMIN-NEXT: [[TMP5:%.*]] = fadd <16 x bfloat> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; ZVFBFMIN-NEXT: store <16 x bfloat> [[TMP5]], ptr [[TMP3]], align 2
+; ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; ZVFBFMIN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; ZVFBFMIN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; ZVFBFMIN: [[MIDDLE_BLOCK]]:
+; ZVFBFMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; ZVFBFMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; ZVFBFMIN: [[SCALAR_PH]]:
+; ZVFBFMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; ZVFBFMIN-NEXT: br label %[[LOOP:.*]]
+; ZVFBFMIN: [[LOOP]]:
+; ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; ZVFBFMIN-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; ZVFBFMIN-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; ZVFBFMIN-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
+; ZVFBFMIN-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
+; ZVFBFMIN-NEXT: [[Z:%.*]] = fadd bfloat [[X]], [[Y]]
+; ZVFBFMIN-NEXT: store bfloat [[Z]], ptr [[A_GEP]], align 2
+; ZVFBFMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1
+; ZVFBFMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; ZVFBFMIN: [[EXIT]]:
+; ZVFBFMIN-NEXT: ret void
+;
+entry:
+ br label %loop
+loop:
+ %i = phi i64 [0, %entry], [%i.next, %loop]
+ %a.gep = getelementptr bfloat, ptr %a, i64 %i
+ %b.gep = getelementptr bfloat, ptr %b, i64 %i
+ %x = load bfloat, ptr %a.gep
+ %y = load bfloat, ptr %b.gep
+ %z = fadd bfloat %x, %y
+ store bfloat %z, ptr %a.gep
+ %i.next = add i64 %i, 1
+ %done = icmp eq i64 %i.next, %n
+ br i1 %done, label %exit, label %loop
+exit:
+ ret void
+}
+
+define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
+; NO-ZVFBFMIN-LABEL: define void @vfwmaccbf16.vv(
+; NO-ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; NO-ZVFBFMIN-NEXT: [[ENTRY:.*]]:
+; NO-ZVFBFMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; NO-ZVFBFMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-ZVFBFMIN: [[VECTOR_PH]]:
+; NO-ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; NO-ZVFBFMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-ZVFBFMIN-NEXT: br label %[[VECTOR_BODY:.*]]
+; NO-ZVFBFMIN: [[VECTOR_BODY]]:
+; NO-ZVFBFMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-ZVFBFMIN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-ZVFBFMIN-NEXT: [[TMP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP0]]
+; NO-ZVFBFMIN-NEXT: [[TMP2:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP0]]
+; NO-ZVFBFMIN-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP0]]
+; NO-ZVFBFMIN-NEXT: [[TMP4:%.*]] = getelementptr bfloat, ptr [[TMP1]], i32 0
+; NO-ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <8 x bfloat>, ptr [[TMP4]], align 2
+; NO-ZVFBFMIN-NEXT: [[TMP5:%.*]] = getelementptr bfloat, ptr [[TMP2]], i32 0
+; NO-ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x bfloat>, ptr [[TMP5]], align 2
+; NO-ZVFBFMIN-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[TMP3]], i32 0
+; NO-ZVFBFMIN-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP6]], align 4
+; NO-ZVFBFMIN-NEXT: [[TMP7:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD]] to <8 x float>
+; NO-ZVFBFMIN-NEXT: [[TMP8:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD1]] to <8 x float>
+; NO-ZVFBFMIN-NEXT: [[TMP9:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP7]], <8 x float> [[TMP8]], <8 x float> [[WIDE_LOAD2]])
+; NO-ZVFBFMIN-NEXT: store <8 x float> [[TMP9]], ptr [[TMP6]], align 4
+; NO-ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; NO-ZVFBFMIN-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-ZVFBFMIN-NEXT: br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-ZVFBFMIN: [[MIDDLE_BLOCK]]:
+; NO-ZVFBFMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-ZVFBFMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-ZVFBFMIN: [[SCALAR_PH]]:
+; NO-ZVFBFMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; NO-ZVFBFMIN-NEXT: br label %[[LOOP:.*]]
+; NO-ZVFBFMIN: [[LOOP]]:
+; NO-ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; NO-ZVFBFMIN-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; NO-ZVFBFMIN-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; NO-ZVFBFMIN-NEXT: [[C_GEP:%.*]] = getelementptr float, ptr [[C]], i64 [[I]]
+; NO-ZVFBFMIN-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
+; NO-ZVFBFMIN-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
+; NO-ZVFBFMIN-NEXT: [[Z:%.*]] = load float, ptr [[C_GEP]], align 4
+; NO-ZVFBFMIN-NEXT: [[X_EXT:%.*]] = fpext bfloat [[X]] to float
+; NO-ZVFBFMIN-NEXT: [[Y_EXT:%.*]] = fpext bfloat [[Y]] to float
+; NO-ZVFBFMIN-NEXT: [[FMULADD:%.*]] = call float @llvm.fmuladd.f32(float [[X_EXT]], float [[Y_EXT]], float [[Z]])
+; NO-ZVFBFMIN-NEXT: store float [[FMULADD]], ptr [[C_GEP]], align 4
+; NO-ZVFBFMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1
+; NO-ZVFBFMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; NO-ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-ZVFBFMIN: [[EXIT]]:
+; NO-ZVFBFMIN-NEXT: ret void
+;
+; ZVFBFMIN-LABEL: define void @vfwmaccbf16.vv(
+; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; ZVFBFMIN-NEXT: [[ENTRY:.*]]:
+; ZVFBFMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; ZVFBFMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; ZVFBFMIN: [[VECTOR_PH]]:
+; ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; ZVFBFMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; ZVFBFMIN-NEXT: br label %[[VECTOR_BODY:.*]]
+; ZVFBFMIN: [[VECTOR_BODY]]:
+; ZVFBFMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ZVFBFMIN-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; ZVFBFMIN-NEXT: [[TMP7:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP6]]
+; ZVFBFMIN-NEXT: [[TMP8:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP6]]
+; ZVFBFMIN-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP6]]
+; ZVFBFMIN-NEXT: [[TMP10:%.*]] = getelementptr bfloat, ptr [[TMP7]], i32 0
+; ZVFBFMIN-NEXT: [[TMP5:%.*]] = getelementptr bfloat, ptr [[TMP7]], i32 8
+; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <8 x bfloat>, ptr [[TMP10]], align 2
+; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x bfloat>, ptr [[TMP5]], align 2
+; ZVFBFMIN-NEXT: [[TMP11:%.*]] = getelementptr bfloat, ptr [[TMP8]], i32 0
+; ZVFBFMIN-NEXT: [[TMP17:%.*]] = getelementptr bfloat, ptr [[TMP8]], i32 8
+; ZVFBFMIN-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x bfloat>, ptr [[TMP11]], align 2
+; ZVFBFMIN-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x bfloat>, ptr [[TMP17]], align 2
+; ZVFBFMIN-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[TMP9]], i32 0
+; ZVFBFMIN-NEXT: [[TMP18:%.*]] = getelementptr float, ptr [[TMP9]], i32 8
+; ZVFBFMIN-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr [[TMP12]], align 4
+; ZVFBFMIN-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, ptr [[TMP18]], align 4
+; ZVFBFMIN-NEXT: [[TMP19:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD]] to <8 x float>
+; ZVFBFMIN-NEXT: [[TMP20:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD1]] to <8 x float>
+; ZVFBFMIN-NEXT: [[TMP21:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD2]] to <8 x float>
+; ZVFBFMIN-NEXT: [[TMP13:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD3]] to <8 x float>
+; ZVFBFMIN-NEXT: [[TMP14:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP19]], <8 x float> [[TMP21]], <8 x float> [[WIDE_LOAD4]])
+; ZVFBFMIN-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP20]], <8 x float> [[TMP13]], <8 x float> [[WIDE_LOAD5]])
+; ZVFBFMIN-NEXT: store <8 x float> [[TMP14]], ptr [[TMP12]], align 4
+; ZVFBFMIN-NEXT: store <8 x float> [[TMP15]], ptr [[TMP18]], align 4
+; ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; ZVFBFMIN-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; ZVFBFMIN-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; ZVFBFMIN: [[MIDDLE_BLOCK]]:
+; ZVFBFMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; ZVFBFMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; ZVFBFMIN: [[SCALAR_PH]]:
+; ZVFBFMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; ZVFBFMIN-NEXT: br label %[[LOOP:.*]]
+; ZVFBFMIN: [[LOOP]]:
+; ZVFBFMIN-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; ZVFBFMIN-NEXT: [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; ZVFBFMIN-NEXT: [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; ZVFBFMIN-NEXT: [[C_GEP:%.*]] = getelementptr float, ptr [[C]], i64 [[I]]
+; ZVFBFMIN-NEXT: [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
+; ZVFBFMIN-NEXT: [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
+; ZVFBFMIN-NEXT: [[Z:%.*]] = load float, ptr [[C_GEP]], align 4
+; ZVFBFMIN-NEXT: [[X_EXT:%.*]] = fpext bfloat [[X]] to float
+; ZVFBFMIN-NEXT: [[Y_EXT:%.*]] = fpext bfloat [[Y]] to float
+; ZVFBFMIN-NEXT: [[FMULADD:%.*]] = call float @llvm.fmuladd.f32(float [[X_EXT]], float [[Y_EXT]], float [[Z]])
+; ZVFBFMIN-NEXT: store float [[FMULADD]], ptr [[C_GEP]], align 4
+; ZVFBFMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1
+; ZVFBFMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; ZVFBFMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; ZVFBFMIN: [[EXIT]]:
+; ZVFBFMIN-NEXT: ret void
+;
+entry:
+ br label %loop
+loop:
+ %i = phi i64 [0, %entry], [%i.next, %loop]
+ %a.gep = getelementptr bfloat, ptr %a, i64 %i
+ %b.gep = getelementptr bfloat, ptr %b, i64 %i
+ %c.gep = getelementptr float, ptr %c, i64 %i
+ %x = load bfloat, ptr %a.gep
+ %y = load bfloat, ptr %b.gep
+ %z = load float, ptr %c.gep
+ %x.ext = fpext bfloat %x to float
+ %y.ext = fpext bfloat %y to float
+ %fmuladd = call float @llvm.fmuladd.f32(float %x.ext, float %y.ext, float %z)
+ store float %fmuladd, ptr %c.gep
+ %i.next = add i64 %i, 1
+ %done = icmp eq i64 %i.next, %n
+ br i1 %done, label %exit, label %loop
+exit:
+ ret void
+}
+;.
+; NO-ZVFBFMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; NO-ZVFBFMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO-ZVFBFMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO-ZVFBFMIN: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
+; ZVFBFMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; ZVFBFMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; ZVFBFMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; ZVFBFMIN: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; ZVFBFMIN: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; ZVFBFMIN: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
new file mode 100644
index 00000000000000..be9151e04776e0
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=NO-ZVFHMIN
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfhmin -S | FileCheck %s -check-prefix=ZVFHMIN
+
+define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
+; NO-ZVFHMIN-LABEL: define void @fadd(
+; NO-ZVFHMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-ZVFHMIN-NEXT: [[ENTRY:.*]]:
+; NO-ZVFHMIN-NEXT: br label %[[LOOP:.*]]
+; NO-ZVFHMIN: [[LOOP]]:
+; NO-ZVFHMIN-NEXT: [[I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; NO-ZVFHMIN-NEXT: [[A_GEP:%.*]] = getelementptr half, ptr [[A]], i64 [[I]]
+; NO-ZVFHMIN-NEXT: [[B_GEP:%.*]] = getelementptr half, ptr [[B]], i64 [[I]]
+; NO-ZVFHMIN-NEXT: [[X:%.*]] = load half, ptr [[A_GEP]], align 2
+; NO-ZVFHMIN-NEXT: [[Y:%.*]] = load half, ptr [[B_GEP]], align 2
+; NO-ZVFHMIN-NEXT: [[Z:%.*]] = fadd half [[X]], [[Y]]
+; NO-ZVFHMIN-NEXT: store half [[Z]], ptr [[A_GEP]], align 2
+; NO-ZVFHMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1
+; NO-ZVFHMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; NO-ZVFHMIN-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-ZVFHMIN: [[EXIT]]:
+; NO-ZVFHMIN-NEXT: ret void
+;
+; ZVFHMIN-LABEL: define void @fadd(
+; ZVFHMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; ZVFHMIN-NEXT: [[ENTRY:.*]]:
+; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; ZVFHMIN: [[VECTOR_PH]]:
+; ZVFHMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; ZVFHMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; ZVFHMIN-NEXT: br label %[[VECTOR_BODY:.*]]
+; ZVFHMIN: [[VECTOR_BODY]]:
+; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ZVFHMIN-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; ZVFHMIN-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[A]], i64 [[TMP0]]
+; ZVFHMIN-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[TMP0]]
+; ZVFHMIN-NEXT: [[TMP3:%.*]] = getelementptr half, ptr [[TMP1]], i32 0
+; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <16 x half>, ptr [[TMP3]], align 2
+; ZVFHMIN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[TMP2]], i32 0
+; ZVFHMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x half>, ptr [[TMP4]], align 2
+; ZVFHMIN-NEXT: [[TMP5:%.*]] = fadd <16 x half> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; ZVFHMIN-NEXT: store <16 x half> [[TMP5]], ptr [[TMP3]], align 2
+; ZVFHMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; ZVFHMIN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; ZVFHMIN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; ZVFHMIN: [[MIDDLE_BLOCK]]:
+; ZVFHMIN-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; ZVFHMIN-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; ZVFHMIN: [[SCALAR_PH]]:
+; ZVFHMIN-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; ZVFHMIN-NEXT: br label %[[LOOP:.*]]
+; ZVFHMIN: [[LOOP]]:
+; ZVFHMIN-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; ZVFHMIN-NEXT: [[A_GEP:%.*]] = getelementptr half, ptr [[A]], i64 [[I]]
+; ZVFHMIN-NEXT: [[B_GEP:%.*]] = getelementptr half, ptr [[B]], i64 [[I]]
+; ZVFHMIN-NEXT: [[X:%.*]] = load half, ptr [[A_GEP]], align 2
+; ZVFHMIN-NEXT: [[Y:%.*]] = load half, ptr [[B_GEP]], align 2
+; ZVFHMIN-NEXT: [[Z:%.*]] = fadd half [[X]], [[Y]]
+; ZVFHMIN-NEXT: store half [[Z]], ptr [[A_GEP]], align 2
+; ZVFHMIN-NEXT: [[I_NEXT]] = add i64 [[I]], 1
+; ZVFHMIN-NEXT: [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; ZVFHMIN-NEXT: br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; ZVFHMIN: [[EXIT]]:
+; ZVFHMIN-NEXT: ret void
+;
+entry:
+ br label %loop
+loop:
+ %i = phi i64 [0, %entry], [%i.next, %loop]
+ %a.gep = getelementptr half, ptr %a, i64 %i
+ %b.gep = getelementptr half, ptr %b, i64 %i
+ %x = load half, ptr %a.gep
+ %y = load half, ptr %b.gep
+ %z = fadd half %x, %y
+ store half %z, ptr %a.gep
+ %i.next = add i64 %i, 1
+ %done = icmp eq i64 %i.next, %n
+ br i1 %done, label %exit, label %loop
+exit:
+ ret void
+}
+;.
+; ZVFHMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; ZVFHMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; ZVFHMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; ZVFHMIN: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
From 64ed1a7babe5aeba9f868a8ae376307c5822ac3a Mon Sep 17 00:00:00 2001
From: Luke Lau <luke at igalia.com>
Date: Mon, 4 Nov 2024 11:11:36 +0800
Subject: [PATCH 2/2] [RISCV] Enable scalable loop vectorization for
zvfhmin/zvfbfmin
---
llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 16 +-
.../Target/RISCV/RISCVTargetTransformInfo.h | 23 +--
.../Transforms/LoopVectorize/RISCV/bf16.ll | 56 +++---
.../Transforms/LoopVectorize/RISCV/f16.ll | 20 ++-
.../RISCV/scalable-reductions.ll | 163 +++++++++++++++++-
5 files changed, 218 insertions(+), 60 deletions(-)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5600524b69a620..7a27afa17c9319 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2516,7 +2516,9 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const {
case MVT::i64:
return Subtarget.hasVInstructionsI64();
case MVT::f16:
- return Subtarget.hasVInstructionsF16();
+ return Subtarget.hasVInstructionsF16Minimal();
+ case MVT::bf16:
+ return Subtarget.hasVInstructionsBF16Minimal();
case MVT::f32:
return Subtarget.hasVInstructionsF32();
case MVT::f64:
@@ -21509,12 +21511,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType(
if (!isTypeLegal(VT))
return false;
- // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
- if (!(isLegalElementTypeForRVV(VT.getScalarType()) ||
- (VT.getScalarType() == MVT::bf16 &&
- Subtarget.hasVInstructionsBF16Minimal()) ||
- (VT.getScalarType() == MVT::f16 &&
- Subtarget.hasVInstructionsF16Minimal())) ||
+ if (!isLegalElementTypeForRVV(VT.getScalarType()) ||
!allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace,
Alignment))
return false;
@@ -21554,10 +21551,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType,
return false;
EVT ScalarType = DataType.getScalarType();
- // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
- if (!(isLegalElementTypeForRVV(ScalarType) ||
- (ScalarType == MVT::bf16 && Subtarget.hasVInstructionsBF16Minimal()) ||
- (ScalarType == MVT::f16 && Subtarget.hasVInstructionsF16Minimal())))
+ if (!isLegalElementTypeForRVV(ScalarType))
return false;
if (!Subtarget.enableUnalignedVectorMem() &&
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 4c01c1679cd818..bbbe101745f0e3 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -239,12 +239,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
return false;
- // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
- return TLI->isLegalElementTypeForRVV(ElemType) ||
- (DataTypeVT.getVectorElementType() == MVT::bf16 &&
- ST->hasVInstructionsBF16Minimal()) ||
- (DataTypeVT.getVectorElementType() == MVT::f16 &&
- ST->hasVInstructionsF16Minimal());
+ return TLI->isLegalElementTypeForRVV(ElemType);
}
bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
@@ -274,12 +269,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
return false;
- // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
- return TLI->isLegalElementTypeForRVV(ElemType) ||
- (DataTypeVT.getVectorElementType() == MVT::bf16 &&
- ST->hasVInstructionsBF16Minimal()) ||
- (DataTypeVT.getVectorElementType() == MVT::f16 &&
- ST->hasVInstructionsF16Minimal());
+ return TLI->isLegalElementTypeForRVV(ElemType);
}
bool isLegalMaskedGather(Type *DataType, Align Alignment) {
@@ -342,8 +332,14 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
return false;
switch (RdxDesc.getRecurrenceKind()) {
- case RecurKind::Add:
case RecurKind::FAdd:
+ case RecurKind::FMulAdd:
+ // We can't promote f16/bf16 fadd reductions and scalable vectors can't be
+ // expanded.
+ if (Ty->isBFloatTy() || (Ty->isHalfTy() && !ST->hasVInstructionsF16()))
+ return false;
+ [[fallthrough]];
+ case RecurKind::Add:
case RecurKind::And:
case RecurKind::Or:
case RecurKind::Xor:
@@ -353,7 +349,6 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
case RecurKind::UMax:
case RecurKind::FMin:
case RecurKind::FMax:
- case RecurKind::FMulAdd:
case RecurKind::IAnyOf:
case RecurKind::FAnyOf:
return true;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
index 487f6cd7e89629..27923f82411d00 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
@@ -24,11 +24,17 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
; ZVFBFMIN-LABEL: define void @fadd(
; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; ZVFBFMIN-NEXT: [[ENTRY:.*]]:
-; ZVFBFMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; ZVFBFMIN-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; ZVFBFMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP8]]
; ZVFBFMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; ZVFBFMIN: [[VECTOR_PH]]:
-; ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; ZVFBFMIN-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP10]]
; ZVFBFMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; ZVFBFMIN-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT: [[TMP5:%.*]] = mul i64 [[TMP12]], 8
; ZVFBFMIN-NEXT: br label %[[VECTOR_BODY:.*]]
; ZVFBFMIN: [[VECTOR_BODY]]:
; ZVFBFMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -36,12 +42,12 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
; ZVFBFMIN-NEXT: [[TMP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP0]]
; ZVFBFMIN-NEXT: [[TMP2:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP0]]
; ZVFBFMIN-NEXT: [[TMP3:%.*]] = getelementptr bfloat, ptr [[TMP1]], i32 0
-; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <16 x bfloat>, ptr [[TMP3]], align 2
+; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x bfloat>, ptr [[TMP3]], align 2
; ZVFBFMIN-NEXT: [[TMP4:%.*]] = getelementptr bfloat, ptr [[TMP2]], i32 0
-; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x bfloat>, ptr [[TMP4]], align 2
-; ZVFBFMIN-NEXT: [[TMP5:%.*]] = fadd <16 x bfloat> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; ZVFBFMIN-NEXT: store <16 x bfloat> [[TMP5]], ptr [[TMP3]], align 2
-; ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x bfloat>, ptr [[TMP4]], align 2
+; ZVFBFMIN-NEXT: [[TMP11:%.*]] = fadd <vscale x 8 x bfloat> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; ZVFBFMIN-NEXT: store <vscale x 8 x bfloat> [[TMP11]], ptr [[TMP3]], align 2
+; ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; ZVFBFMIN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; ZVFBFMIN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; ZVFBFMIN: [[MIDDLE_BLOCK]]:
@@ -137,11 +143,17 @@ define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64
; ZVFBFMIN-LABEL: define void @vfwmaccbf16.vv(
; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
; ZVFBFMIN-NEXT: [[ENTRY:.*]]:
-; ZVFBFMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; ZVFBFMIN-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVFBFMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
; ZVFBFMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; ZVFBFMIN: [[VECTOR_PH]]:
-; ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; ZVFBFMIN-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; ZVFBFMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; ZVFBFMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; ZVFBFMIN-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4
; ZVFBFMIN-NEXT: br label %[[VECTOR_BODY:.*]]
; ZVFBFMIN: [[VECTOR_BODY]]:
; ZVFBFMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -150,26 +162,16 @@ define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64
; ZVFBFMIN-NEXT: [[TMP8:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP6]]
; ZVFBFMIN-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP6]]
; ZVFBFMIN-NEXT: [[TMP10:%.*]] = getelementptr bfloat, ptr [[TMP7]], i32 0
-; ZVFBFMIN-NEXT: [[TMP5:%.*]] = getelementptr bfloat, ptr [[TMP7]], i32 8
-; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <8 x bfloat>, ptr [[TMP10]], align 2
-; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x bfloat>, ptr [[TMP5]], align 2
+; ZVFBFMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x bfloat>, ptr [[TMP10]], align 2
; ZVFBFMIN-NEXT: [[TMP11:%.*]] = getelementptr bfloat, ptr [[TMP8]], i32 0
-; ZVFBFMIN-NEXT: [[TMP17:%.*]] = getelementptr bfloat, ptr [[TMP8]], i32 8
-; ZVFBFMIN-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x bfloat>, ptr [[TMP11]], align 2
-; ZVFBFMIN-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x bfloat>, ptr [[TMP17]], align 2
+; ZVFBFMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 4 x bfloat>, ptr [[TMP11]], align 2
; ZVFBFMIN-NEXT: [[TMP12:%.*]] = getelementptr float, ptr [[TMP9]], i32 0
-; ZVFBFMIN-NEXT: [[TMP18:%.*]] = getelementptr float, ptr [[TMP9]], i32 8
-; ZVFBFMIN-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x float>, ptr [[TMP12]], align 4
-; ZVFBFMIN-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, ptr [[TMP18]], align 4
-; ZVFBFMIN-NEXT: [[TMP19:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD]] to <8 x float>
-; ZVFBFMIN-NEXT: [[TMP20:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD1]] to <8 x float>
-; ZVFBFMIN-NEXT: [[TMP21:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD2]] to <8 x float>
-; ZVFBFMIN-NEXT: [[TMP13:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD3]] to <8 x float>
-; ZVFBFMIN-NEXT: [[TMP14:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP19]], <8 x float> [[TMP21]], <8 x float> [[WIDE_LOAD4]])
-; ZVFBFMIN-NEXT: [[TMP15:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP20]], <8 x float> [[TMP13]], <8 x float> [[WIDE_LOAD5]])
-; ZVFBFMIN-NEXT: store <8 x float> [[TMP14]], ptr [[TMP12]], align 4
-; ZVFBFMIN-NEXT: store <8 x float> [[TMP15]], ptr [[TMP18]], align 4
-; ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; ZVFBFMIN-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; ZVFBFMIN-NEXT: [[TMP13:%.*]] = fpext <vscale x 4 x bfloat> [[WIDE_LOAD]] to <vscale x 4 x float>
+; ZVFBFMIN-NEXT: [[TMP14:%.*]] = fpext <vscale x 4 x bfloat> [[WIDE_LOAD1]] to <vscale x 4 x float>
+; ZVFBFMIN-NEXT: [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> [[TMP13]], <vscale x 4 x float> [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD2]])
+; ZVFBFMIN-NEXT: store <vscale x 4 x float> [[TMP15]], ptr [[TMP12]], align 4
+; ZVFBFMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; ZVFBFMIN-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; ZVFBFMIN-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; ZVFBFMIN: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
index be9151e04776e0..2b267f6a2a9778 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
@@ -24,11 +24,17 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
; ZVFHMIN-LABEL: define void @fadd(
; ZVFHMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
; ZVFHMIN-NEXT: [[ENTRY:.*]]:
-; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 16
+; ZVFHMIN-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFHMIN-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; ZVFHMIN-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP8]]
; ZVFHMIN-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; ZVFHMIN: [[VECTOR_PH]]:
-; ZVFHMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], 16
+; ZVFHMIN-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFHMIN-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; ZVFHMIN-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP10]]
; ZVFHMIN-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; ZVFHMIN-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFHMIN-NEXT: [[TMP5:%.*]] = mul i64 [[TMP12]], 8
; ZVFHMIN-NEXT: br label %[[VECTOR_BODY:.*]]
; ZVFHMIN: [[VECTOR_BODY]]:
; ZVFHMIN-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
@@ -36,12 +42,12 @@ define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
; ZVFHMIN-NEXT: [[TMP1:%.*]] = getelementptr half, ptr [[A]], i64 [[TMP0]]
; ZVFHMIN-NEXT: [[TMP2:%.*]] = getelementptr half, ptr [[B]], i64 [[TMP0]]
; ZVFHMIN-NEXT: [[TMP3:%.*]] = getelementptr half, ptr [[TMP1]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <16 x half>, ptr [[TMP3]], align 2
+; ZVFHMIN-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>, ptr [[TMP3]], align 2
; ZVFHMIN-NEXT: [[TMP4:%.*]] = getelementptr half, ptr [[TMP2]], i32 0
-; ZVFHMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x half>, ptr [[TMP4]], align 2
-; ZVFHMIN-NEXT: [[TMP5:%.*]] = fadd <16 x half> [[WIDE_LOAD]], [[WIDE_LOAD1]]
-; ZVFHMIN-NEXT: store <16 x half> [[TMP5]], ptr [[TMP3]], align 2
-; ZVFHMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; ZVFHMIN-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x half>, ptr [[TMP4]], align 2
+; ZVFHMIN-NEXT: [[TMP11:%.*]] = fadd <vscale x 8 x half> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; ZVFHMIN-NEXT: store <vscale x 8 x half> [[TMP11]], ptr [[TMP3]], align 2
+; ZVFHMIN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
; ZVFHMIN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; ZVFHMIN-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; ZVFHMIN: [[MIDDLE_BLOCK]]:
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
index 8e7cd7f6d530dd..01a2a757dea5dd 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll
@@ -224,9 +224,66 @@ for.end:
ret float %add
}
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
+define half @fadd_fast_half_zvfh(ptr noalias nocapture readonly %a, i64 %n) "target-features"="+zvfh" {
+; CHECK-LABEL: @fadd_fast_half_zvfh
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <vscale x 8 x half>
+; CHECK: %[[LOAD2:.*]] = load <vscale x 8 x half>
+; CHECK: %[[FADD1:.*]] = fadd fast <vscale x 8 x half> %[[LOAD1]]
+; CHECK: %[[FADD2:.*]] = fadd fast <vscale x 8 x half> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[RDX:.*]] = fadd fast <vscale x 8 x half> %[[FADD2]], %[[FADD1]]
+; CHECK: call fast half @llvm.vector.reduce.fadd.nxv8f16(half 0xH0000, <vscale x 8 x half> %[[RDX]])
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi half [ 0.000000e+00, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
+ %0 = load half, ptr %arrayidx, align 4
+ %add = fadd fast half %0, %sum.07
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret half %add
+}
+
; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
; CHECK-REMARK: vectorized loop (vectorization width: 16, interleaved count: 2)
-define bfloat @fadd_fast_bfloat(ptr noalias nocapture readonly %a, i64 %n) {
+define half @fadd_fast_half_zvfhmin(ptr noalias nocapture readonly %a, i64 %n) "target-features"="+zvfhmin" {
+; CHECK-LABEL: @fadd_fast_half_zvfhmin
+; CHECK: vector.body:
+; CHECK: %[[LOAD1:.*]] = load <16 x half>
+; CHECK: %[[LOAD2:.*]] = load <16 x half>
+; CHECK: %[[FADD1:.*]] = fadd fast <16 x half> %[[LOAD1]]
+; CHECK: %[[FADD2:.*]] = fadd fast <16 x half> %[[LOAD2]]
+; CHECK: middle.block:
+; CHECK: %[[RDX:.*]] = fadd fast <16 x half> %[[FADD2]], %[[FADD1]]
+; CHECK: call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> %[[RDX]])
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi half [ 0.000000e+00, %entry ], [ %add, %for.body ]
+ %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
+ %0 = load half, ptr %arrayidx, align 4
+ %add = fadd fast half %0, %sum.07
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+ ret half %add
+}
+
+; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
+; CHECK-REMARK: vectorized loop (vectorization width: 16, interleaved count: 2)
+define bfloat @fadd_fast_bfloat(ptr noalias nocapture readonly %a, i64 %n) "target-features"="+zvfbfmin" {
; CHECK-LABEL: @fadd_fast_bfloat
; CHECK: vector.body:
; CHECK: %[[LOAD1:.*]] = load <16 x bfloat>
@@ -427,6 +484,110 @@ for.end:
ret float %muladd
}
+; CHECK-REMARK: vectorized loop (vectorization width: vscale x 8, interleaved count: 2)
+define half @fmuladd_f16_zvfh(ptr %a, ptr %b, i64 %n) "target-features"="+zvfh" {
+; CHECK-LABEL: @fmuladd_f16_zvfh(
+; CHECK: vector.body:
+; CHECK: [[WIDE_LOAD:%.*]] = load <vscale x 8 x half>
+; CHECK: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x half>
+; CHECK: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x half>
+; CHECK: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x half>
+; CHECK: [[MULADD1:%.*]] = call reassoc <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD]], <vscale x 8 x half> [[WIDE_LOAD3]],
+; CHECK: [[MULADD2:%.*]] = call reassoc <vscale x 8 x half> @llvm.fmuladd.nxv8f16(<vscale x 8 x half> [[WIDE_LOAD2]], <vscale x 8 x half> [[WIDE_LOAD4]],
+; CHECK: middle.block:
+; CHECK: [[BIN_RDX:%.*]] = fadd reassoc <vscale x 8 x half> [[MULADD2]], [[MULADD1]]
+; CHECK: call reassoc half @llvm.vector.reduce.fadd.nxv8f16(half 0xH8000, <vscale x 8 x half> [[BIN_RDX]])
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi half [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+ %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
+ %0 = load half, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds half, ptr %b, i64 %iv
+ %1 = load half, ptr %arrayidx2, align 4
+ %muladd = tail call reassoc half @llvm.fmuladd.f16(half %0, half %1, half %sum.07)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret half %muladd
+}
+
+
+; We can't scalably vectorize reductions of f16 with zvfhmin or bf16 with zvfbfmin, so make sure we use fixed-length vectors instead.
+
+; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
+; CHECK-REMARK: vectorized loop (vectorization width: 16, interleaved count: 2)
+define half @fmuladd_f16_zvfhmin(ptr %a, ptr %b, i64 %n) "target-features"="+zvfhmin" {
+; CHECK-LABEL: @fmuladd_f16_zvfhmin(
+; CHECK: vector.body:
+; CHECK: [[WIDE_LOAD:%.*]] = load <16 x half>
+; CHECK: [[WIDE_LOAD2:%.*]] = load <16 x half>
+; CHECK: [[WIDE_LOAD3:%.*]] = load <16 x half>
+; CHECK: [[WIDE_LOAD4:%.*]] = load <16 x half>
+; CHECK: [[MULADD1:%.*]] = call reassoc <16 x half> @llvm.fmuladd.v16f16(<16 x half> [[WIDE_LOAD]], <16 x half> [[WIDE_LOAD3]],
+; CHECK: [[MULADD2:%.*]] = call reassoc <16 x half> @llvm.fmuladd.v16f16(<16 x half> [[WIDE_LOAD2]], <16 x half> [[WIDE_LOAD4]],
+; CHECK: middle.block:
+; CHECK: [[BIN_RDX:%.*]] = fadd reassoc <16 x half> [[MULADD2]], [[MULADD1]]
+; CHECK: call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH8000, <16 x half> [[BIN_RDX]])
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi half [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+ %arrayidx = getelementptr inbounds half, ptr %a, i64 %iv
+ %0 = load half, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds half, ptr %b, i64 %iv
+ %1 = load half, ptr %arrayidx2, align 4
+ %muladd = tail call reassoc half @llvm.fmuladd.f16(half %0, half %1, half %sum.07)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret half %muladd
+}
+
+; CHECK-REMARK: Scalable vectorization not supported for the reduction operations found in this loop.
+; CHECK-REMARK: vectorized loop (vectorization width: 16, interleaved count: 2)
+define bfloat @fmuladd_bf16(ptr %a, ptr %b, i64 %n) "target-features"="+zvfbfmin" {
+; CHECK-LABEL: @fmuladd_bf16(
+; CHECK: vector.body:
+; CHECK: [[WIDE_LOAD:%.*]] = load <16 x bfloat>
+; CHECK: [[WIDE_LOAD2:%.*]] = load <16 x bfloat>
+; CHECK: [[WIDE_LOAD3:%.*]] = load <16 x bfloat>
+; CHECK: [[WIDE_LOAD4:%.*]] = load <16 x bfloat>
+; CHECK: [[MULADD1:%.*]] = call reassoc <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> [[WIDE_LOAD]], <16 x bfloat> [[WIDE_LOAD3]],
+; CHECK: [[MULADD2:%.*]] = call reassoc <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> [[WIDE_LOAD2]], <16 x bfloat> [[WIDE_LOAD4]],
+; CHECK: middle.block:
+; CHECK: [[BIN_RDX:%.*]] = fadd reassoc <16 x bfloat> [[MULADD2]], [[MULADD1]]
+; CHECK: call reassoc bfloat @llvm.vector.reduce.fadd.v16bf16(bfloat 0xR8000, <16 x bfloat> [[BIN_RDX]])
+;
+entry:
+ br label %for.body
+
+for.body:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+ %sum.07 = phi bfloat [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+ %arrayidx = getelementptr inbounds bfloat, ptr %a, i64 %iv
+ %0 = load bfloat, ptr %arrayidx, align 4
+ %arrayidx2 = getelementptr inbounds bfloat, ptr %b, i64 %iv
+ %1 = load bfloat, ptr %arrayidx2, align 4
+ %muladd = tail call reassoc bfloat @llvm.fmuladd.bf16(bfloat %0, bfloat %1, bfloat %sum.07)
+ %iv.next = add nuw nsw i64 %iv, 1
+ %exitcond.not = icmp eq i64 %iv.next, %n
+ br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+ ret bfloat %muladd
+}
+
declare float @llvm.fmuladd.f32(float, float, float)
attributes #0 = { "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" }