[llvm] [RISCV] Lower fixed-length mload/mstore for zvfhmin/zvfbfmin (PR #115145)

via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 6 03:02:17 PST 2024


llvmbot wrote:


@llvm/pr-subscribers-backend-risc-v

Author: Luke Lau (lukel97)

Changes:

This is the same idea as #114945.

The tests contain a setcc that needs to be promoted, so at v64[b]f16 and above it ends up getting expanded, because the promoted type would need LMUL 16.
We could eventually handle this the same way we handle scalable f16/bf16 vectors, by custom lowering and splitting.
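For reference, the new bf16 coverage boils down to IR like the following (a reduced sketch of the test cases in the diff, shown only for the v8bf16 shape; the function name is illustrative): with zvfbfmin the masked load itself now selects to a masked `vle16.v`, while the fcmp that builds the mask is first promoted to f32.

```llvm
; Reduced form of the added bf16 tests: an fcmp (a setcc after lowering)
; builds the mask that feeds a fixed-length masked load.
define <8 x bfloat> @masked_load_v8bf16_sketch(ptr %a, <8 x bfloat> %m) {
  %mask = fcmp oeq <8 x bfloat> %m, zeroinitializer
  %load = call <8 x bfloat> @llvm.masked.load.v8bf16(ptr %a, i32 8, <8 x i1> %mask, <8 x bfloat> undef)
  ret <8 x bfloat> %load
}
declare <8 x bfloat> @llvm.masked.load.v8bf16(ptr, i32, <8 x i1>, <8 x bfloat>)
```

At <64 x bfloat> the promoted compare would need an e32 register group larger than LMUL 8, which is why the v64 tests in the diff below fall back to building the mask element by element.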


---

Patch is 363.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115145.diff


4 Files Affected:

- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+3-5) 
- (modified) llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h (+6-2) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll (+4573-54) 
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll (+4607-88) 


``````````diff
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index aaa10aaeb22d37..a625e9d5efeb55 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1339,9 +1339,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                             ISD::VECTOR_SHUFFLE, ISD::VECTOR_COMPRESS},
                            VT, Custom);
 
-        // FIXME: mload, mstore, vp_gather/scatter can be
-        // hoisted to here.
-        setOperationAction({ISD::LOAD, ISD::STORE, ISD::MGATHER, ISD::MSCATTER},
+        // FIXME: vp_gather/scatter can be hoisted to here.
+        setOperationAction({ISD::LOAD, ISD::STORE, ISD::MLOAD, ISD::MSTORE,
+                            ISD::MGATHER, ISD::MSCATTER},
                            VT, Custom);
         setOperationAction({ISD::VP_LOAD, ISD::VP_STORE,
                             ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
@@ -1409,8 +1409,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
         setOperationAction({ISD::BUILD_VECTOR, ISD::SCALAR_TO_VECTOR}, VT,
                            Custom);
 
-        setOperationAction({ISD::MLOAD, ISD::MSTORE}, VT, Custom);
-
         setOperationAction({ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom);
 
         setOperationAction({ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index eaaa035710facc..4c01c1679cd818 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -239,8 +239,12 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
     if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
       return false;
 
-    return TLI->isLegalElementTypeForRVV(ElemType);
-
+    // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
+    return TLI->isLegalElementTypeForRVV(ElemType) ||
+           (DataTypeVT.getVectorElementType() == MVT::bf16 &&
+            ST->hasVInstructionsBF16Minimal()) ||
+           (DataTypeVT.getVectorElementType() == MVT::f16 &&
+            ST->hasVInstructionsF16Minimal());
   }
 
   bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
index f1d300b300a646..ede0939a928f51 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
@@ -1,17 +1,51 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh,+zvfbfmin,+zfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32,RV32-ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh,+zvfbfmin,+zfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64,RV64-ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfhmin,+zvfbfmin,+zfhmin,+zfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32,RV32-ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfhmin,+zvfbfmin,+zfhmin,+zfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64,RV64-ZVFHMIN
 
-define void @masked_load_v1f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v1f16:
+define void @masked_load_v1bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v1bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    fmv.h.x fa5, zero
-; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    fmv.w.x fa5, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vmfeq.vf v0, v9, fa5
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
 ; CHECK-NEXT:    vse16.v v8, (a2)
 ; CHECK-NEXT:    ret
+  %m = load <1 x bfloat>, ptr %m_ptr
+  %mask = fcmp oeq <1 x bfloat> %m, zeroinitializer
+  %load = call <1 x bfloat> @llvm.masked.load.v1bf16(ptr %a, i32 8, <1 x i1> %mask, <1 x bfloat> undef)
+  store <1 x bfloat> %load, ptr %res_ptr
+  ret void
+}
+declare <1 x bfloat> @llvm.masked.load.v1bf16(ptr, i32, <1 x i1>, <1 x bfloat>)
+
+define void @masked_load_v1f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v1f16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFH-NEXT:    vle16.v v8, (a1)
+; ZVFH-NEXT:    fmv.h.x fa5, zero
+; ZVFH-NEXT:    vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT:    vle16.v v8, (a0), v0.t
+; ZVFH-NEXT:    vse16.v v8, (a2)
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: masked_load_v1f16:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vle16.v v8, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    fmv.w.x fa5, zero
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vf v0, v9, fa5
+; ZVFHMIN-NEXT:    vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT:    vse16.v v8, (a2)
+; ZVFHMIN-NEXT:    ret
   %m = load <1 x half>, ptr %m_ptr
   %mask = fcmp oeq <1 x half> %m, zeroinitializer
   %load = call <1 x half> @llvm.masked.load.v1f16(ptr %a, i32 8, <1 x i1> %mask, <1 x half> undef)
@@ -66,16 +100,48 @@ define void @masked_load_v1f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
 }
 declare <1 x double> @llvm.masked.load.v1f64(ptr, i32, <1 x i1>, <1 x double>)
 
-define void @masked_load_v2f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v2f16:
+define void @masked_load_v2bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v2bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    fmv.h.x fa5, zero
-; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    fmv.w.x fa5, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT:    vmfeq.vf v0, v9, fa5
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
 ; CHECK-NEXT:    vse16.v v8, (a2)
 ; CHECK-NEXT:    ret
+  %m = load <2 x bfloat>, ptr %m_ptr
+  %mask = fcmp oeq <2 x bfloat> %m, zeroinitializer
+  %load = call <2 x bfloat> @llvm.masked.load.v2bf16(ptr %a, i32 8, <2 x i1> %mask, <2 x bfloat> undef)
+  store <2 x bfloat> %load, ptr %res_ptr
+  ret void
+}
+declare <2 x bfloat> @llvm.masked.load.v2bf16(ptr, i32, <2 x i1>, <2 x bfloat>)
+
+define void @masked_load_v2f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v2f16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFH-NEXT:    vle16.v v8, (a1)
+; ZVFH-NEXT:    fmv.h.x fa5, zero
+; ZVFH-NEXT:    vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT:    vle16.v v8, (a0), v0.t
+; ZVFH-NEXT:    vse16.v v8, (a2)
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: masked_load_v2f16:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT:    vle16.v v8, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    fmv.w.x fa5, zero
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vf v0, v9, fa5
+; ZVFHMIN-NEXT:    vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT:    vse16.v v8, (a2)
+; ZVFHMIN-NEXT:    ret
   %m = load <2 x half>, ptr %m_ptr
   %mask = fcmp oeq <2 x half> %m, zeroinitializer
   %load = call <2 x half> @llvm.masked.load.v2f16(ptr %a, i32 8, <2 x i1> %mask, <2 x half> undef)
@@ -130,16 +196,48 @@ define void @masked_load_v2f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
 }
 declare <2 x double> @llvm.masked.load.v2f64(ptr, i32, <2 x i1>, <2 x double>)
 
-define void @masked_load_v4f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v4f16:
+define void @masked_load_v4bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v4bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    fmv.h.x fa5, zero
-; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT:    fmv.w.x fa5, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT:    vmfeq.vf v0, v9, fa5
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
 ; CHECK-NEXT:    vse16.v v8, (a2)
 ; CHECK-NEXT:    ret
+  %m = load <4 x bfloat>, ptr %m_ptr
+  %mask = fcmp oeq <4 x bfloat> %m, zeroinitializer
+  %load = call <4 x bfloat> @llvm.masked.load.v4bf16(ptr %a, i32 8, <4 x i1> %mask, <4 x bfloat> undef)
+  store <4 x bfloat> %load, ptr %res_ptr
+  ret void
+}
+declare <4 x bfloat> @llvm.masked.load.v4bf16(ptr, i32, <4 x i1>, <4 x bfloat>)
+
+define void @masked_load_v4f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v4f16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT:    vle16.v v8, (a1)
+; ZVFH-NEXT:    fmv.h.x fa5, zero
+; ZVFH-NEXT:    vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT:    vle16.v v8, (a0), v0.t
+; ZVFH-NEXT:    vse16.v v8, (a2)
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: masked_load_v4f16:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT:    vle16.v v8, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT:    fmv.w.x fa5, zero
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vf v0, v9, fa5
+; ZVFHMIN-NEXT:    vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT:    vse16.v v8, (a2)
+; ZVFHMIN-NEXT:    ret
   %m = load <4 x half>, ptr %m_ptr
   %mask = fcmp oeq <4 x half> %m, zeroinitializer
   %load = call <4 x half> @llvm.masked.load.v4f16(ptr %a, i32 8, <4 x i1> %mask, <4 x half> undef)
@@ -194,16 +292,48 @@ define void @masked_load_v4f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
 }
 declare <4 x double> @llvm.masked.load.v4f64(ptr, i32, <4 x i1>, <4 x double>)
 
-define void @masked_load_v8f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v8f16:
+define void @masked_load_v8bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v8bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    fmv.h.x fa5, zero
-; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT:    fmv.w.x fa5, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vmfeq.vf v0, v10, fa5
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
 ; CHECK-NEXT:    vse16.v v8, (a2)
 ; CHECK-NEXT:    ret
+  %m = load <8 x bfloat>, ptr %m_ptr
+  %mask = fcmp oeq <8 x bfloat> %m, zeroinitializer
+  %load = call <8 x bfloat> @llvm.masked.load.v8bf16(ptr %a, i32 8, <8 x i1> %mask, <8 x bfloat> undef)
+  store <8 x bfloat> %load, ptr %res_ptr
+  ret void
+}
+declare <8 x bfloat> @llvm.masked.load.v8bf16(ptr, i32, <8 x i1>, <8 x bfloat>)
+
+define void @masked_load_v8f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v8f16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFH-NEXT:    vle16.v v8, (a1)
+; ZVFH-NEXT:    fmv.h.x fa5, zero
+; ZVFH-NEXT:    vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT:    vle16.v v8, (a0), v0.t
+; ZVFH-NEXT:    vse16.v v8, (a2)
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: masked_load_v8f16:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT:    vle16.v v8, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT:    fmv.w.x fa5, zero
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vf v0, v10, fa5
+; ZVFHMIN-NEXT:    vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT:    vse16.v v8, (a2)
+; ZVFHMIN-NEXT:    ret
   %m = load <8 x half>, ptr %m_ptr
   %mask = fcmp oeq <8 x half> %m, zeroinitializer
   %load = call <8 x half> @llvm.masked.load.v8f16(ptr %a, i32 8, <8 x i1> %mask, <8 x half> undef)
@@ -258,16 +388,48 @@ define void @masked_load_v8f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
 }
 declare <8 x double> @llvm.masked.load.v8f64(ptr, i32, <8 x i1>, <8 x double>)
 
-define void @masked_load_v16f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v16f16:
+define void @masked_load_v16bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v16bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    fmv.h.x fa5, zero
-; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT:    fmv.w.x fa5, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT:    vmfeq.vf v0, v12, fa5
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
 ; CHECK-NEXT:    vse16.v v8, (a2)
 ; CHECK-NEXT:    ret
+  %m = load <16 x bfloat>, ptr %m_ptr
+  %mask = fcmp oeq <16 x bfloat> %m, zeroinitializer
+  %load = call <16 x bfloat> @llvm.masked.load.v16bf16(ptr %a, i32 8, <16 x i1> %mask, <16 x bfloat> undef)
+  store <16 x bfloat> %load, ptr %res_ptr
+  ret void
+}
+declare <16 x bfloat> @llvm.masked.load.v16bf16(ptr, i32, <16 x i1>, <16 x bfloat>)
+
+define void @masked_load_v16f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v16f16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFH-NEXT:    vle16.v v8, (a1)
+; ZVFH-NEXT:    fmv.h.x fa5, zero
+; ZVFH-NEXT:    vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT:    vle16.v v8, (a0), v0.t
+; ZVFH-NEXT:    vse16.v v8, (a2)
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: masked_load_v16f16:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT:    vle16.v v8, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT:    fmv.w.x fa5, zero
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vf v0, v12, fa5
+; ZVFHMIN-NEXT:    vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT:    vse16.v v8, (a2)
+; ZVFHMIN-NEXT:    ret
   %m = load <16 x half>, ptr %m_ptr
   %mask = fcmp oeq <16 x half> %m, zeroinitializer
   %load = call <16 x half> @llvm.masked.load.v16f16(ptr %a, i32 8, <16 x i1> %mask, <16 x half> undef)
@@ -322,17 +484,51 @@ define void @masked_load_v16f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
 }
 declare <16 x double> @llvm.masked.load.v16f64(ptr, i32, <16 x i1>, <16 x double>)
 
-define void @masked_load_v32f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v32f16:
+define void @masked_load_v32bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v32bf16:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    li a3, 32
 ; CHECK-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
 ; CHECK-NEXT:    vle16.v v8, (a1)
-; CHECK-NEXT:    fmv.h.x fa5, zero
-; CHECK-NEXT:    vmfeq.vf v0, v8, fa5
+; CHECK-NEXT:    vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT:    fmv.w.x fa5, zero
+; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT:    vmfeq.vf v0, v16, fa5
 ; CHECK-NEXT:    vle16.v v8, (a0), v0.t
 ; CHECK-NEXT:    vse16.v v8, (a2)
 ; CHECK-NEXT:    ret
+  %m = load <32 x bfloat>, ptr %m_ptr
+  %mask = fcmp oeq <32 x bfloat> %m, zeroinitializer
+  %load = call <32 x bfloat> @llvm.masked.load.v32bf16(ptr %a, i32 8, <32 x i1> %mask, <32 x bfloat> undef)
+  store <32 x bfloat> %load, ptr %res_ptr
+  ret void
+}
+declare <32 x bfloat> @llvm.masked.load.v32bf16(ptr, i32, <32 x i1>, <32 x bfloat>)
+
+define void @masked_load_v32f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v32f16:
+; ZVFH:       # %bb.0:
+; ZVFH-NEXT:    li a3, 32
+; ZVFH-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
+; ZVFH-NEXT:    vle16.v v8, (a1)
+; ZVFH-NEXT:    fmv.h.x fa5, zero
+; ZVFH-NEXT:    vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT:    vle16.v v8, (a0), v0.t
+; ZVFH-NEXT:    vse16.v v8, (a2)
+; ZVFH-NEXT:    ret
+;
+; ZVFHMIN-LABEL: masked_load_v32f16:
+; ZVFHMIN:       # %bb.0:
+; ZVFHMIN-NEXT:    li a3, 32
+; ZVFHMIN-NEXT:    vsetvli zero, a3, e16, m4, ta, ma
+; ZVFHMIN-NEXT:    vle16.v v8, (a1)
+; ZVFHMIN-NEXT:    vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT:    fmv.w.x fa5, zero
+; ZVFHMIN-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT:    vmfeq.vf v0, v16, fa5
+; ZVFHMIN-NEXT:    vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT:    vse16.v v8, (a2)
+; ZVFHMIN-NEXT:    ret
   %m = load <32 x half>, ptr %m_ptr
   %mask = fcmp oeq <32 x half> %m, zeroinitializer
   %load = call <32 x half> @llvm.masked.load.v32f16(ptr %a, i32 8, <32 x i1> %mask, <32 x half> undef)
@@ -404,17 +600,1477 @@ define void @masked_load_v32f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
 }
 declare <32 x double> @llvm.masked.load.v32f64(ptr, i32, <32 x i1>, <32 x double>)
 
+define void @masked_load_v64bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; RV32-LABEL: masked_load_v64bf16:
+; RV32:       # %bb.0:
+; RV32-NEXT:    addi sp, sp, -384
+; RV32-NEXT:    sw ra, 380(sp) # 4-byte Folded Spill
+; RV32-NEXT:    sw s0, 376(sp) # 4-byte Folded Spill
+; RV32-NEXT:    addi s0, sp, 384
+; RV32-NEXT:    andi sp, sp, -128
+; RV32-NEXT:    li a3, 64
+; RV32-NEXT:    vsetvli zero, a3, e16, m8, ta, ma
+; RV32-NEXT:    vle16.v v8, (a1)
+; RV32-NEXT:    addi a1, sp, 128
+; RV32-NEXT:    vse16.v v8, (a1)
+; RV32-NEXT:    lh a1, 192(sp)
+; RV32-NEXT:    fmv.h.x fa5, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa5
+; RV32-NEXT:    fmv.w.x fa5, zero
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 96(sp)
+; RV32-NEXT:    lh a1, 190(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa4
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 95(sp)
+; RV32-NEXT:    lh a1, 188(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa4
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 94(sp)
+; RV32-NEXT:    lh a1, 186(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa4
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 93(sp)
+; RV32-NEXT:    lh a1, 184(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa4
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 92(sp)
+; RV32-NEXT:    lh a1, 182(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa4
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 91(sp)
+; RV32-NEXT:    lh a1, 180(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa4
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 90(sp)
+; RV32-NEXT:    lh a1, 178(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa4
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 89(sp)
+; RV32-NEXT:    lh a1, 176(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa4
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 88(sp)
+; RV32-NEXT:    lh a1, 174(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa4
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 87(sp)
+; RV32-NEXT:    lh a1, 172(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa4
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 86(sp)
+; RV32-NEXT:    lh a1, 170(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa4
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 85(sp)
+; RV32-NEXT:    lh a1, 168(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa4
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 84(sp)
+; RV32-NEXT:    lh a1, 166(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa4
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 83(sp)
+; RV32-NEXT:    lh a1, 164(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa4
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 82(sp)
+; RV32-NEXT:    lh a1, 162(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 fa4, fa4
+; RV32-NEXT:    feq.s a1, fa4, fa5
+; RV32-NEXT:    sb a1, 81(sp)
+; RV32-NEXT:    lh a1, 160(sp)
+; RV32-NEXT:    fmv.h.x fa4, a1
+; RV32-NEXT:    fcvt.s.bf16 ...
[truncated]

``````````



https://github.com/llvm/llvm-project/pull/115145

