[llvm] [RISCV] Enable scalable loop vectorization for zvfhmin/zvfbfmin (PR #115272)

via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 6 22:46:29 PST 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-risc-v

Author: Luke Lau (lukel97)

<details>
<summary>Changes</summary>

This PR enables scalable loop vectorization for f16 with zvfhmin and bf16 with zvfbfmin. 

Enabling this depended on filling in the gaps in scalable zvfhmin/zvfbfmin codegen, but everything the loop vectorizer might emit should now be handled.

It does this by marking f16 and bf16 as legal in `isLegalElementTypeForRVV`. A few users of `isLegalElementTypeForRVV` have already been enabled for these types in other PRs (a small illustrative sketch follows the list):

- `isLegalStridedLoadStore` #115264
- `isLegalInterleavedAccessType` #115257
- `isLegalMaskedLoadStore` #115145
- `isLegalMaskedGatherScatter` #114945
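
To illustrate (this sketch is not taken from the patch and the names are made up), this is roughly the shape of masked scalable bf16 access that hooks like `isLegalMaskedLoadStore` now allow the vectorizer to emit with zvfbfmin:

```llvm
; Illustrative sketch only, not from this patch: a masked scalable bf16
; load of the shape the loop vectorizer can emit for a conditional load
; once bf16 is reported as a legal element type with zvfbfmin.
declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16.p0(ptr, i32 immarg, <vscale x 8 x i1>, <vscale x 8 x bfloat>)

define <vscale x 8 x bfloat> @masked_bf16_load(ptr %p, <vscale x 8 x i1> %mask) {
  ; Align 2 is the natural alignment of bfloat elements.
  %v = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16.p0(ptr %p, i32 2, <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> zeroinitializer)
  ret <vscale x 8 x bfloat> %v
}
```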

The remaining user is `isLegalToVectorizeReduction`. We can't promote f16/bf16 fadd reductions to f32, and scalable vector reductions can't be expanded, so they need to be disabled for scalable vectors. The cost model already marks these as having an invalid cost, but for out-of-loop reductions `ComputeReductionResult` doesn't get costed and a reduction intrinsic would be emitted regardless, so we still need to mark them as illegal.
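
For concreteness, here is a minimal sketch (not from the patch, names illustrative) of the kind of bf16 fadd reduction that `isLegalToVectorizeReduction` now rejects for scalable vectors when only zvfbfmin is available:

```llvm
; Illustrative sketch only: a bf16 fadd reduction loop. The reduction can't
; be promoted to f32 and a scalable vector reduction can't be expanded, so
; scalable vectorization is disabled for it under zvfbfmin.
define bfloat @bf16_fadd_reduction(ptr %a, i64 %n) {
entry:
  br label %loop
loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %sum = phi bfloat [ 0xR0000, %entry ], [ %sum.next, %loop ]   ; 0xR0000 is bf16 0.0
  %a.gep = getelementptr bfloat, ptr %a, i64 %i
  %x = load bfloat, ptr %a.gep, align 2
  %sum.next = fadd bfloat %sum, %x
  %i.next = add i64 %i, 1
  %done = icmp eq i64 %i.next, %n
  br i1 %done, label %exit, label %loop
exit:
  ret bfloat %sum.next
}
```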

After this PR, the following loop compiled with `-march=rv64gv_zvfbfwma -O2`:
```c
void f(float * restrict dst, __bf16 * restrict a, __bf16 * restrict b, int n) {
  for (int i = 0; i < n; i++)
    dst[i] += ((float)a[i] * (float)b[i]);
}
```

Goes from this in llvm-19.1.0:
```asm
.LBB0_4:
        addi    t1, t0, 16
        vsetivli        zero, 8, e16, m1, ta, ma
        vle16.v v8, (t0)
        vle16.v v9, (t1)
        addi    t1, a6, 32
        addi    t2, a7, 16
        vfwcvtbf16.f.f.v        v10, v8
        vfwcvtbf16.f.f.v        v12, v9
        vle16.v v8, (a7)
        vle16.v v9, (t2)
        vle32.v v14, (a6)
        vle32.v v16, (t1)
        vfwcvtbf16.f.f.v        v18, v8
        vfwcvtbf16.f.f.v        v20, v9
        vsetvli zero, zero, e32, m2, ta, ma
        vfmacc.vv       v14, v10, v18
        vfmacc.vv       v16, v12, v20
        vse32.v v14, (a6)
        vse32.v v16, (t1)
        addi    t0, t0, 32
        addi    a7, a7, 32
        addi    a6, a6, 64
        bne     t0, a5, .LBB0_4
```

To:
```asm
	vsetvli	t4, zero, e16, m1, ta, ma
.LBB0_4:
	vl1re16.v	v8, (t3)
	vl1re16.v	v9, (t2)
	vl2re32.v	v10, (t1)
	vfwmaccbf16.vv	v10, v8, v9
	vs2r.v	v10, (t1)
	add	t3, t3, a4
	add	t2, t2, a4
	sub	t0, t0, a6
	add	t1, t1, a7
	bnez	t0, .LBB0_4
```

---

Patch is 32.46 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115272.diff


5 Files Affected:

- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+5-11) 
- (modified) llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h (+9-14) 
- (added) llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll (+233) 
- (added) llvm/test/Transforms/LoopVectorize/RISCV/f16.ll (+94) 
- (modified) llvm/test/Transforms/LoopVectorize/RISCV/scalable-reductions.ll (+162-1) 


``````````diff
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5600524b69a620..7a27afa17c9319 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -2516,7 +2516,9 @@ bool RISCVTargetLowering::isLegalElementTypeForRVV(EVT ScalarTy) const {
   case MVT::i64:
     return Subtarget.hasVInstructionsI64();
   case MVT::f16:
-    return Subtarget.hasVInstructionsF16();
+    return Subtarget.hasVInstructionsF16Minimal();
+  case MVT::bf16:
+    return Subtarget.hasVInstructionsBF16Minimal();
   case MVT::f32:
     return Subtarget.hasVInstructionsF32();
   case MVT::f64:
@@ -21509,12 +21511,7 @@ bool RISCVTargetLowering::isLegalInterleavedAccessType(
   if (!isTypeLegal(VT))
     return false;
 
-  // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
-  if (!(isLegalElementTypeForRVV(VT.getScalarType()) ||
-        (VT.getScalarType() == MVT::bf16 &&
-         Subtarget.hasVInstructionsBF16Minimal()) ||
-        (VT.getScalarType() == MVT::f16 &&
-         Subtarget.hasVInstructionsF16Minimal())) ||
+  if (!isLegalElementTypeForRVV(VT.getScalarType()) ||
       !allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace,
                                       Alignment))
     return false;
@@ -21554,10 +21551,7 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType,
     return false;
 
   EVT ScalarType = DataType.getScalarType();
-  // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
-  if (!(isLegalElementTypeForRVV(ScalarType) ||
-        (ScalarType == MVT::bf16 && Subtarget.hasVInstructionsBF16Minimal()) ||
-        (ScalarType == MVT::f16 && Subtarget.hasVInstructionsF16Minimal())))
+  if (!isLegalElementTypeForRVV(ScalarType))
     return false;
 
   if (!Subtarget.enableUnalignedVectorMem() &&
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index 4c01c1679cd818..bbbe101745f0e3 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -239,12 +239,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
       return false;
 
-    // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
-    return TLI->isLegalElementTypeForRVV(ElemType) ||
-           (DataTypeVT.getVectorElementType() == MVT::bf16 &&
-            ST->hasVInstructionsBF16Minimal()) ||
-           (DataTypeVT.getVectorElementType() == MVT::f16 &&
-            ST->hasVInstructionsF16Minimal());
+    return TLI->isLegalElementTypeForRVV(ElemType);
   }
 
   bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
@@ -274,12 +269,7 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
       return false;
 
-    // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
-    return TLI->isLegalElementTypeForRVV(ElemType) ||
-           (DataTypeVT.getVectorElementType() == MVT::bf16 &&
-            ST->hasVInstructionsBF16Minimal()) ||
-           (DataTypeVT.getVectorElementType() == MVT::f16 &&
-            ST->hasVInstructionsF16Minimal());
+    return TLI->isLegalElementTypeForRVV(ElemType);
   }
 
   bool isLegalMaskedGather(Type *DataType, Align Alignment) {
@@ -342,8 +332,14 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
       return false;
 
     switch (RdxDesc.getRecurrenceKind()) {
-    case RecurKind::Add:
     case RecurKind::FAdd:
+    case RecurKind::FMulAdd:
+      // We can't promote f16/bf16 fadd reductions and scalable vectors can't be
+      // expanded.
+      if (Ty->isBFloatTy() || (Ty->isHalfTy() && !ST->hasVInstructionsF16()))
+        return false;
+      [[fallthrough]];
+    case RecurKind::Add:
     case RecurKind::And:
     case RecurKind::Or:
     case RecurKind::Xor:
@@ -353,7 +349,6 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     case RecurKind::UMax:
     case RecurKind::FMin:
     case RecurKind::FMax:
-    case RecurKind::FMulAdd:
     case RecurKind::IAnyOf:
     case RecurKind::FAnyOf:
       return true;
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
new file mode 100644
index 00000000000000..27923f82411d00
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/bf16.ll
@@ -0,0 +1,233 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=NO-ZVFBFMIN
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfbfmin -S | FileCheck %s -check-prefix=ZVFBFMIN
+
+define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
+; NO-ZVFBFMIN-LABEL: define void @fadd(
+; NO-ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-ZVFBFMIN-NEXT:  [[ENTRY:.*]]:
+; NO-ZVFBFMIN-NEXT:    br label %[[LOOP:.*]]
+; NO-ZVFBFMIN:       [[LOOP]]:
+; NO-ZVFBFMIN-NEXT:    [[I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; NO-ZVFBFMIN-NEXT:    [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; NO-ZVFBFMIN-NEXT:    [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; NO-ZVFBFMIN-NEXT:    [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
+; NO-ZVFBFMIN-NEXT:    [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
+; NO-ZVFBFMIN-NEXT:    [[Z:%.*]] = fadd bfloat [[X]], [[Y]]
+; NO-ZVFBFMIN-NEXT:    store bfloat [[Z]], ptr [[A_GEP]], align 2
+; NO-ZVFBFMIN-NEXT:    [[I_NEXT]] = add i64 [[I]], 1
+; NO-ZVFBFMIN-NEXT:    [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; NO-ZVFBFMIN-NEXT:    br i1 [[DONE]], label %[[EXIT:.*]], label %[[LOOP]]
+; NO-ZVFBFMIN:       [[EXIT]]:
+; NO-ZVFBFMIN-NEXT:    ret void
+;
+; ZVFBFMIN-LABEL: define void @fadd(
+; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; ZVFBFMIN-NEXT:  [[ENTRY:.*]]:
+; ZVFBFMIN-NEXT:    [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP7]], 8
+; ZVFBFMIN-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP8]]
+; ZVFBFMIN-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; ZVFBFMIN:       [[VECTOR_PH]]:
+; ZVFBFMIN-NEXT:    [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP9]], 8
+; ZVFBFMIN-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP10]]
+; ZVFBFMIN-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; ZVFBFMIN-NEXT:    [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP12]], 8
+; ZVFBFMIN-NEXT:    br label %[[VECTOR_BODY:.*]]
+; ZVFBFMIN:       [[VECTOR_BODY]]:
+; ZVFBFMIN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ZVFBFMIN-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; ZVFBFMIN-NEXT:    [[TMP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP0]]
+; ZVFBFMIN-NEXT:    [[TMP2:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP0]]
+; ZVFBFMIN-NEXT:    [[TMP3:%.*]] = getelementptr bfloat, ptr [[TMP1]], i32 0
+; ZVFBFMIN-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x bfloat>, ptr [[TMP3]], align 2
+; ZVFBFMIN-NEXT:    [[TMP4:%.*]] = getelementptr bfloat, ptr [[TMP2]], i32 0
+; ZVFBFMIN-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 8 x bfloat>, ptr [[TMP4]], align 2
+; ZVFBFMIN-NEXT:    [[TMP11:%.*]] = fadd <vscale x 8 x bfloat> [[WIDE_LOAD]], [[WIDE_LOAD1]]
+; ZVFBFMIN-NEXT:    store <vscale x 8 x bfloat> [[TMP11]], ptr [[TMP3]], align 2
+; ZVFBFMIN-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; ZVFBFMIN-NEXT:    [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; ZVFBFMIN-NEXT:    br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; ZVFBFMIN:       [[MIDDLE_BLOCK]]:
+; ZVFBFMIN-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; ZVFBFMIN-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; ZVFBFMIN:       [[SCALAR_PH]]:
+; ZVFBFMIN-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; ZVFBFMIN-NEXT:    br label %[[LOOP:.*]]
+; ZVFBFMIN:       [[LOOP]]:
+; ZVFBFMIN-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; ZVFBFMIN-NEXT:    [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; ZVFBFMIN-NEXT:    [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; ZVFBFMIN-NEXT:    [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
+; ZVFBFMIN-NEXT:    [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
+; ZVFBFMIN-NEXT:    [[Z:%.*]] = fadd bfloat [[X]], [[Y]]
+; ZVFBFMIN-NEXT:    store bfloat [[Z]], ptr [[A_GEP]], align 2
+; ZVFBFMIN-NEXT:    [[I_NEXT]] = add i64 [[I]], 1
+; ZVFBFMIN-NEXT:    [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; ZVFBFMIN-NEXT:    br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; ZVFBFMIN:       [[EXIT]]:
+; ZVFBFMIN-NEXT:    ret void
+;
+entry:
+  br label %loop
+loop:
+  %i = phi i64 [0, %entry], [%i.next, %loop]
+  %a.gep = getelementptr bfloat, ptr %a, i64 %i
+  %b.gep = getelementptr bfloat, ptr %b, i64 %i
+  %x = load bfloat, ptr %a.gep
+  %y = load bfloat, ptr %b.gep
+  %z = fadd bfloat %x, %y
+  store bfloat %z, ptr %a.gep
+  %i.next = add i64 %i, 1
+  %done = icmp eq i64 %i.next, %n
+  br i1 %done, label %exit, label %loop
+exit:
+  ret void
+}
+
+define void @vfwmaccbf16.vv(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) {
+; NO-ZVFBFMIN-LABEL: define void @vfwmaccbf16.vv(
+; NO-ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; NO-ZVFBFMIN-NEXT:  [[ENTRY:.*]]:
+; NO-ZVFBFMIN-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 8
+; NO-ZVFBFMIN-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; NO-ZVFBFMIN:       [[VECTOR_PH]]:
+; NO-ZVFBFMIN-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 8
+; NO-ZVFBFMIN-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; NO-ZVFBFMIN-NEXT:    br label %[[VECTOR_BODY:.*]]
+; NO-ZVFBFMIN:       [[VECTOR_BODY]]:
+; NO-ZVFBFMIN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; NO-ZVFBFMIN-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
+; NO-ZVFBFMIN-NEXT:    [[TMP1:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP0]]
+; NO-ZVFBFMIN-NEXT:    [[TMP2:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP0]]
+; NO-ZVFBFMIN-NEXT:    [[TMP3:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP0]]
+; NO-ZVFBFMIN-NEXT:    [[TMP4:%.*]] = getelementptr bfloat, ptr [[TMP1]], i32 0
+; NO-ZVFBFMIN-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x bfloat>, ptr [[TMP4]], align 2
+; NO-ZVFBFMIN-NEXT:    [[TMP5:%.*]] = getelementptr bfloat, ptr [[TMP2]], i32 0
+; NO-ZVFBFMIN-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x bfloat>, ptr [[TMP5]], align 2
+; NO-ZVFBFMIN-NEXT:    [[TMP6:%.*]] = getelementptr float, ptr [[TMP3]], i32 0
+; NO-ZVFBFMIN-NEXT:    [[WIDE_LOAD2:%.*]] = load <8 x float>, ptr [[TMP6]], align 4
+; NO-ZVFBFMIN-NEXT:    [[TMP7:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD]] to <8 x float>
+; NO-ZVFBFMIN-NEXT:    [[TMP8:%.*]] = fpext <8 x bfloat> [[WIDE_LOAD1]] to <8 x float>
+; NO-ZVFBFMIN-NEXT:    [[TMP9:%.*]] = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> [[TMP7]], <8 x float> [[TMP8]], <8 x float> [[WIDE_LOAD2]])
+; NO-ZVFBFMIN-NEXT:    store <8 x float> [[TMP9]], ptr [[TMP6]], align 4
+; NO-ZVFBFMIN-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; NO-ZVFBFMIN-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NO-ZVFBFMIN-NEXT:    br i1 [[TMP10]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; NO-ZVFBFMIN:       [[MIDDLE_BLOCK]]:
+; NO-ZVFBFMIN-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; NO-ZVFBFMIN-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; NO-ZVFBFMIN:       [[SCALAR_PH]]:
+; NO-ZVFBFMIN-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; NO-ZVFBFMIN-NEXT:    br label %[[LOOP:.*]]
+; NO-ZVFBFMIN:       [[LOOP]]:
+; NO-ZVFBFMIN-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; NO-ZVFBFMIN-NEXT:    [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; NO-ZVFBFMIN-NEXT:    [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; NO-ZVFBFMIN-NEXT:    [[C_GEP:%.*]] = getelementptr float, ptr [[C]], i64 [[I]]
+; NO-ZVFBFMIN-NEXT:    [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
+; NO-ZVFBFMIN-NEXT:    [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
+; NO-ZVFBFMIN-NEXT:    [[Z:%.*]] = load float, ptr [[C_GEP]], align 4
+; NO-ZVFBFMIN-NEXT:    [[X_EXT:%.*]] = fpext bfloat [[X]] to float
+; NO-ZVFBFMIN-NEXT:    [[Y_EXT:%.*]] = fpext bfloat [[Y]] to float
+; NO-ZVFBFMIN-NEXT:    [[FMULADD:%.*]] = call float @llvm.fmuladd.f32(float [[X_EXT]], float [[Y_EXT]], float [[Z]])
+; NO-ZVFBFMIN-NEXT:    store float [[FMULADD]], ptr [[C_GEP]], align 4
+; NO-ZVFBFMIN-NEXT:    [[I_NEXT]] = add i64 [[I]], 1
+; NO-ZVFBFMIN-NEXT:    [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; NO-ZVFBFMIN-NEXT:    br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; NO-ZVFBFMIN:       [[EXIT]]:
+; NO-ZVFBFMIN-NEXT:    ret void
+;
+; ZVFBFMIN-LABEL: define void @vfwmaccbf16.vv(
+; ZVFBFMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; ZVFBFMIN-NEXT:  [[ENTRY:.*]]:
+; ZVFBFMIN-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; ZVFBFMIN-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP1]]
+; ZVFBFMIN-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; ZVFBFMIN:       [[VECTOR_PH]]:
+; ZVFBFMIN-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; ZVFBFMIN-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
+; ZVFBFMIN-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; ZVFBFMIN-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; ZVFBFMIN-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; ZVFBFMIN-NEXT:    br label %[[VECTOR_BODY:.*]]
+; ZVFBFMIN:       [[VECTOR_BODY]]:
+; ZVFBFMIN-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; ZVFBFMIN-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; ZVFBFMIN-NEXT:    [[TMP7:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[TMP6]]
+; ZVFBFMIN-NEXT:    [[TMP8:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[TMP6]]
+; ZVFBFMIN-NEXT:    [[TMP9:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP6]]
+; ZVFBFMIN-NEXT:    [[TMP10:%.*]] = getelementptr bfloat, ptr [[TMP7]], i32 0
+; ZVFBFMIN-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x bfloat>, ptr [[TMP10]], align 2
+; ZVFBFMIN-NEXT:    [[TMP11:%.*]] = getelementptr bfloat, ptr [[TMP8]], i32 0
+; ZVFBFMIN-NEXT:    [[WIDE_LOAD1:%.*]] = load <vscale x 4 x bfloat>, ptr [[TMP11]], align 2
+; ZVFBFMIN-NEXT:    [[TMP12:%.*]] = getelementptr float, ptr [[TMP9]], i32 0
+; ZVFBFMIN-NEXT:    [[WIDE_LOAD2:%.*]] = load <vscale x 4 x float>, ptr [[TMP12]], align 4
+; ZVFBFMIN-NEXT:    [[TMP13:%.*]] = fpext <vscale x 4 x bfloat> [[WIDE_LOAD]] to <vscale x 4 x float>
+; ZVFBFMIN-NEXT:    [[TMP14:%.*]] = fpext <vscale x 4 x bfloat> [[WIDE_LOAD1]] to <vscale x 4 x float>
+; ZVFBFMIN-NEXT:    [[TMP15:%.*]] = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> [[TMP13]], <vscale x 4 x float> [[TMP14]], <vscale x 4 x float> [[WIDE_LOAD2]])
+; ZVFBFMIN-NEXT:    store <vscale x 4 x float> [[TMP15]], ptr [[TMP12]], align 4
+; ZVFBFMIN-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; ZVFBFMIN-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; ZVFBFMIN-NEXT:    br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; ZVFBFMIN:       [[MIDDLE_BLOCK]]:
+; ZVFBFMIN-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
+; ZVFBFMIN-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; ZVFBFMIN:       [[SCALAR_PH]]:
+; ZVFBFMIN-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; ZVFBFMIN-NEXT:    br label %[[LOOP:.*]]
+; ZVFBFMIN:       [[LOOP]]:
+; ZVFBFMIN-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; ZVFBFMIN-NEXT:    [[A_GEP:%.*]] = getelementptr bfloat, ptr [[A]], i64 [[I]]
+; ZVFBFMIN-NEXT:    [[B_GEP:%.*]] = getelementptr bfloat, ptr [[B]], i64 [[I]]
+; ZVFBFMIN-NEXT:    [[C_GEP:%.*]] = getelementptr float, ptr [[C]], i64 [[I]]
+; ZVFBFMIN-NEXT:    [[X:%.*]] = load bfloat, ptr [[A_GEP]], align 2
+; ZVFBFMIN-NEXT:    [[Y:%.*]] = load bfloat, ptr [[B_GEP]], align 2
+; ZVFBFMIN-NEXT:    [[Z:%.*]] = load float, ptr [[C_GEP]], align 4
+; ZVFBFMIN-NEXT:    [[X_EXT:%.*]] = fpext bfloat [[X]] to float
+; ZVFBFMIN-NEXT:    [[Y_EXT:%.*]] = fpext bfloat [[Y]] to float
+; ZVFBFMIN-NEXT:    [[FMULADD:%.*]] = call float @llvm.fmuladd.f32(float [[X_EXT]], float [[Y_EXT]], float [[Z]])
+; ZVFBFMIN-NEXT:    store float [[FMULADD]], ptr [[C_GEP]], align 4
+; ZVFBFMIN-NEXT:    [[I_NEXT]] = add i64 [[I]], 1
+; ZVFBFMIN-NEXT:    [[DONE:%.*]] = icmp eq i64 [[I_NEXT]], [[N]]
+; ZVFBFMIN-NEXT:    br i1 [[DONE]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; ZVFBFMIN:       [[EXIT]]:
+; ZVFBFMIN-NEXT:    ret void
+;
+entry:
+  br label %loop
+loop:
+  %i = phi i64 [0, %entry], [%i.next, %loop]
+  %a.gep = getelementptr bfloat, ptr %a, i64 %i
+  %b.gep = getelementptr bfloat, ptr %b, i64 %i
+  %c.gep = getelementptr float, ptr %c, i64 %i
+  %x = load bfloat, ptr %a.gep
+  %y = load bfloat, ptr %b.gep
+  %z = load float, ptr %c.gep
+  %x.ext = fpext bfloat %x to float
+  %y.ext = fpext bfloat %y to float
+  %fmuladd = call float @llvm.fmuladd.f32(float %x.ext, float %y.ext, float %z)
+  store float %fmuladd, ptr %c.gep
+  %i.next = add i64 %i, 1
+  %done = icmp eq i64 %i.next, %n
+  br i1 %done, label %exit, label %loop
+exit:
+  ret void
+}
+;.
+; NO-ZVFBFMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; NO-ZVFBFMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; NO-ZVFBFMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; NO-ZVFBFMIN: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+;.
+; ZVFBFMIN: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; ZVFBFMIN: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; ZVFBFMIN: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+; ZVFBFMIN: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
+; ZVFBFMIN: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]}
+; ZVFBFMIN: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
new file mode 100644
index 00000000000000..2b267f6a2a9778
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/f16.ll
@@ -0,0 +1,94 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v -S | FileCheck %s -check-prefix=NO-ZVFHMIN
+; RUN: opt < %s -passes=loop-vectorize -mtriple riscv64 -mattr=+v,+zvfhmin -S | FileCheck %s -check-prefix=ZVFHMIN
+
+define void @fadd(ptr noalias %a, ptr noalias %b, i64 %n) {
+; NO-ZVFHMIN-LABEL: define void @fadd(
+; NO-ZVFHMIN-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; NO-ZVFHMIN-NEXT:  [[ENTRY:.*]]:
+; NO-ZVFHMIN-NEXT:    br label %[[LOOP:.*]]
+; NO-ZVFHMIN:       [[LOOP]]:
+; NO-ZVFHMIN-NEXT:    [[I:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[I_NEXT:%.*]], %[[LOOP]] ]
+; NO-ZVFHMIN-NEXT:    [[A_GEP:%.*]] = getelementptr half, ptr [[A]], i64 [[I]]
+; NO-ZVFHMIN-NEXT:    [[B_GEP:%.*]] = getelementptr half, ptr [[B]], i64 [[I]]
+; NO-ZVFHMIN-NEXT:    [[X:%.*]] = load half, ptr [[A_GEP]], align 2
+; NO-ZVFHMIN-NEXT:    [[Y:%.*]] = load half, ptr [[B_GEP]], align 2
+; NO-ZVFHMIN-NEXT:    [[Z:%.*]] = fadd half [[X]], [[Y]]
+; NO-ZVFHMIN-NEXT...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/115272

