[llvm] [RISCV] Lower fixed-length mload/mstore for zvfhmin/zvfbfmin (PR #115145)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 6 03:02:17 PST 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-risc-v
Author: Luke Lau (lukel97)
This is the same idea as #114945.
The tests contain a setcc that needs to be promoted, so at v64[b]f16 and above it ends up getting expanded because it can't be promoted to LMUL 16.
We could eventually do something similar to what we do with scalable f16/bf16 vectors by custom lowering and splitting.
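As a rough illustration, a fixed-length masked load on a bf16 vector like the sketch below (mirroring the existing tests; the function and value names are hypothetical) should now be custom-lowered to a masked vle16.v under zvfbfmin instead of being expanded:

```llvm
; A minimal sketch, assuming a zvfbfmin-enabled target; names here are
; hypothetical and not taken from the patch.
define <4 x bfloat> @sketch_masked_load(ptr %p, <4 x i1> %mask) {
  ; With this change the fixed-length masked load is lowered to a masked
  ; vle16.v rather than going through expansion.
  %v = call <4 x bfloat> @llvm.masked.load.v4bf16(ptr %p, i32 2, <4 x i1> %mask, <4 x bfloat> poison)
  ret <4 x bfloat> %v
}
declare <4 x bfloat> @llvm.masked.load.v4bf16(ptr, i32, <4 x i1>, <4 x bfloat>)
```

The same applies to f16 under zvfhmin and to masked stores via llvm.masked.store.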
---
Patch is 363.19 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115145.diff
4 Files Affected:
- (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+3-5)
- (modified) llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h (+6-2)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll (+4573-54)
- (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-store-fp.ll (+4607-88)
``````````diff
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index aaa10aaeb22d37..a625e9d5efeb55 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1339,9 +1339,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
ISD::VECTOR_SHUFFLE, ISD::VECTOR_COMPRESS},
VT, Custom);
- // FIXME: mload, mstore, vp_gather/scatter can be
- // hoisted to here.
- setOperationAction({ISD::LOAD, ISD::STORE, ISD::MGATHER, ISD::MSCATTER},
+ // FIXME: vp_gather/scatter can be hoisted to here.
+ setOperationAction({ISD::LOAD, ISD::STORE, ISD::MLOAD, ISD::MSTORE,
+ ISD::MGATHER, ISD::MSCATTER},
VT, Custom);
setOperationAction({ISD::VP_LOAD, ISD::VP_STORE,
ISD::EXPERIMENTAL_VP_STRIDED_LOAD,
@@ -1409,8 +1409,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::BUILD_VECTOR, ISD::SCALAR_TO_VECTOR}, VT,
Custom);
- setOperationAction({ISD::MLOAD, ISD::MSTORE}, VT, Custom);
-
setOperationAction({ISD::VP_GATHER, ISD::VP_SCATTER}, VT, Custom);
setOperationAction({ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index eaaa035710facc..4c01c1679cd818 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -239,8 +239,12 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
if (!ST->enableUnalignedVectorMem() && Alignment < ElemType.getStoreSize())
return false;
- return TLI->isLegalElementTypeForRVV(ElemType);
-
+ // TODO: Move bf16/f16 support into isLegalElementTypeForRVV
+ return TLI->isLegalElementTypeForRVV(ElemType) ||
+ (DataTypeVT.getVectorElementType() == MVT::bf16 &&
+ ST->hasVInstructionsBF16Minimal()) ||
+ (DataTypeVT.getVectorElementType() == MVT::f16 &&
+ ST->hasVInstructionsF16Minimal());
}
bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
index f1d300b300a646..ede0939a928f51 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-masked-load-fp.ll
@@ -1,17 +1,51 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
-; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfh,+zvfbfmin,+zfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV32,RV32-ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfh,+zvfbfmin,+zfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH,RV64,RV64-ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+m,+v,+f,+d,+zvfhmin,+zvfbfmin,+zfhmin,+zfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV32,RV32-ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+f,+d,+zvfhmin,+zvfbfmin,+zfhmin,+zfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN,RV64,RV64-ZVFHMIN
-define void @masked_load_v1f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v1f16:
+define void @masked_load_v1bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v1bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: fmv.h.x fa5, zero
-; CHECK-NEXT: vmfeq.vf v0, v8, fa5
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: fmv.w.x fa5, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v9, fa5
; CHECK-NEXT: vle16.v v8, (a0), v0.t
; CHECK-NEXT: vse16.v v8, (a2)
; CHECK-NEXT: ret
+ %m = load <1 x bfloat>, ptr %m_ptr
+ %mask = fcmp oeq <1 x bfloat> %m, zeroinitializer
+ %load = call <1 x bfloat> @llvm.masked.load.v1bf16(ptr %a, i32 8, <1 x i1> %mask, <1 x bfloat> undef)
+ store <1 x bfloat> %load, ptr %res_ptr
+ ret void
+}
+declare <1 x bfloat> @llvm.masked.load.v1bf16(ptr, i32, <1 x i1>, <1 x bfloat>)
+
+define void @masked_load_v1f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v1f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a1)
+; ZVFH-NEXT: fmv.h.x fa5, zero
+; ZVFH-NEXT: vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT: vle16.v v8, (a0), v0.t
+; ZVFH-NEXT: vse16.v v8, (a2)
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: masked_load_v1f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 1, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a1)
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: fmv.w.x fa5, zero
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vf v0, v9, fa5
+; ZVFHMIN-NEXT: vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT: vse16.v v8, (a2)
+; ZVFHMIN-NEXT: ret
%m = load <1 x half>, ptr %m_ptr
%mask = fcmp oeq <1 x half> %m, zeroinitializer
%load = call <1 x half> @llvm.masked.load.v1f16(ptr %a, i32 8, <1 x i1> %mask, <1 x half> undef)
@@ -66,16 +100,48 @@ define void @masked_load_v1f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
}
declare <1 x double> @llvm.masked.load.v1f64(ptr, i32, <1 x i1>, <1 x double>)
-define void @masked_load_v2f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v2f16:
+define void @masked_load_v2bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v2bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: fmv.h.x fa5, zero
-; CHECK-NEXT: vmfeq.vf v0, v8, fa5
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: fmv.w.x fa5, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v9, fa5
; CHECK-NEXT: vle16.v v8, (a0), v0.t
; CHECK-NEXT: vse16.v v8, (a2)
; CHECK-NEXT: ret
+ %m = load <2 x bfloat>, ptr %m_ptr
+ %mask = fcmp oeq <2 x bfloat> %m, zeroinitializer
+ %load = call <2 x bfloat> @llvm.masked.load.v2bf16(ptr %a, i32 8, <2 x i1> %mask, <2 x bfloat> undef)
+ store <2 x bfloat> %load, ptr %res_ptr
+ ret void
+}
+declare <2 x bfloat> @llvm.masked.load.v2bf16(ptr, i32, <2 x i1>, <2 x bfloat>)
+
+define void @masked_load_v2f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v2f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a1)
+; ZVFH-NEXT: fmv.h.x fa5, zero
+; ZVFH-NEXT: vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT: vle16.v v8, (a0), v0.t
+; ZVFH-NEXT: vse16.v v8, (a2)
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: masked_load_v2f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a1)
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: fmv.w.x fa5, zero
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vf v0, v9, fa5
+; ZVFHMIN-NEXT: vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT: vse16.v v8, (a2)
+; ZVFHMIN-NEXT: ret
%m = load <2 x half>, ptr %m_ptr
%mask = fcmp oeq <2 x half> %m, zeroinitializer
%load = call <2 x half> @llvm.masked.load.v2f16(ptr %a, i32 8, <2 x i1> %mask, <2 x half> undef)
@@ -130,16 +196,48 @@ define void @masked_load_v2f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
}
declare <2 x double> @llvm.masked.load.v2f64(ptr, i32, <2 x i1>, <2 x double>)
-define void @masked_load_v4f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v4f16:
+define void @masked_load_v4bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v4bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: fmv.h.x fa5, zero
-; CHECK-NEXT: vmfeq.vf v0, v8, fa5
+; CHECK-NEXT: vfwcvtbf16.f.f.v v9, v8
+; CHECK-NEXT: fmv.w.x fa5, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v9, fa5
; CHECK-NEXT: vle16.v v8, (a0), v0.t
; CHECK-NEXT: vse16.v v8, (a2)
; CHECK-NEXT: ret
+ %m = load <4 x bfloat>, ptr %m_ptr
+ %mask = fcmp oeq <4 x bfloat> %m, zeroinitializer
+ %load = call <4 x bfloat> @llvm.masked.load.v4bf16(ptr %a, i32 8, <4 x i1> %mask, <4 x bfloat> undef)
+ store <4 x bfloat> %load, ptr %res_ptr
+ ret void
+}
+declare <4 x bfloat> @llvm.masked.load.v4bf16(ptr, i32, <4 x i1>, <4 x bfloat>)
+
+define void @masked_load_v4f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v4f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a1)
+; ZVFH-NEXT: fmv.h.x fa5, zero
+; ZVFH-NEXT: vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT: vle16.v v8, (a0), v0.t
+; ZVFH-NEXT: vse16.v v8, (a2)
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: masked_load_v4f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a1)
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v9, v8
+; ZVFHMIN-NEXT: fmv.w.x fa5, zero
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vf v0, v9, fa5
+; ZVFHMIN-NEXT: vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT: vse16.v v8, (a2)
+; ZVFHMIN-NEXT: ret
%m = load <4 x half>, ptr %m_ptr
%mask = fcmp oeq <4 x half> %m, zeroinitializer
%load = call <4 x half> @llvm.masked.load.v4f16(ptr %a, i32 8, <4 x i1> %mask, <4 x half> undef)
@@ -194,16 +292,48 @@ define void @masked_load_v4f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
}
declare <4 x double> @llvm.masked.load.v4f64(ptr, i32, <4 x i1>, <4 x double>)
-define void @masked_load_v8f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v8f16:
+define void @masked_load_v8bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v8bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: fmv.h.x fa5, zero
-; CHECK-NEXT: vmfeq.vf v0, v8, fa5
+; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8
+; CHECK-NEXT: fmv.w.x fa5, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v10, fa5
; CHECK-NEXT: vle16.v v8, (a0), v0.t
; CHECK-NEXT: vse16.v v8, (a2)
; CHECK-NEXT: ret
+ %m = load <8 x bfloat>, ptr %m_ptr
+ %mask = fcmp oeq <8 x bfloat> %m, zeroinitializer
+ %load = call <8 x bfloat> @llvm.masked.load.v8bf16(ptr %a, i32 8, <8 x i1> %mask, <8 x bfloat> undef)
+ store <8 x bfloat> %load, ptr %res_ptr
+ ret void
+}
+declare <8 x bfloat> @llvm.masked.load.v8bf16(ptr, i32, <8 x i1>, <8 x bfloat>)
+
+define void @masked_load_v8f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v8f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a1)
+; ZVFH-NEXT: fmv.h.x fa5, zero
+; ZVFH-NEXT: vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT: vle16.v v8, (a0), v0.t
+; ZVFH-NEXT: vse16.v v8, (a2)
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: masked_load_v8f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a1)
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: fmv.w.x fa5, zero
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vf v0, v10, fa5
+; ZVFHMIN-NEXT: vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT: vse16.v v8, (a2)
+; ZVFHMIN-NEXT: ret
%m = load <8 x half>, ptr %m_ptr
%mask = fcmp oeq <8 x half> %m, zeroinitializer
%load = call <8 x half> @llvm.masked.load.v8f16(ptr %a, i32 8, <8 x i1> %mask, <8 x half> undef)
@@ -258,16 +388,48 @@ define void @masked_load_v8f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
}
declare <8 x double> @llvm.masked.load.v8f64(ptr, i32, <8 x i1>, <8 x double>)
-define void @masked_load_v16f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v16f16:
+define void @masked_load_v16bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v16bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: fmv.h.x fa5, zero
-; CHECK-NEXT: vmfeq.vf v0, v8, fa5
+; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8
+; CHECK-NEXT: fmv.w.x fa5, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v12, fa5
; CHECK-NEXT: vle16.v v8, (a0), v0.t
; CHECK-NEXT: vse16.v v8, (a2)
; CHECK-NEXT: ret
+ %m = load <16 x bfloat>, ptr %m_ptr
+ %mask = fcmp oeq <16 x bfloat> %m, zeroinitializer
+ %load = call <16 x bfloat> @llvm.masked.load.v16bf16(ptr %a, i32 8, <16 x i1> %mask, <16 x bfloat> undef)
+ store <16 x bfloat> %load, ptr %res_ptr
+ ret void
+}
+declare <16 x bfloat> @llvm.masked.load.v16bf16(ptr, i32, <16 x i1>, <16 x bfloat>)
+
+define void @masked_load_v16f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v16f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a1)
+; ZVFH-NEXT: fmv.h.x fa5, zero
+; ZVFH-NEXT: vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT: vle16.v v8, (a0), v0.t
+; ZVFH-NEXT: vse16.v v8, (a2)
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: masked_load_v16f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a1)
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: fmv.w.x fa5, zero
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vf v0, v12, fa5
+; ZVFHMIN-NEXT: vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT: vse16.v v8, (a2)
+; ZVFHMIN-NEXT: ret
%m = load <16 x half>, ptr %m_ptr
%mask = fcmp oeq <16 x half> %m, zeroinitializer
%load = call <16 x half> @llvm.masked.load.v16f16(ptr %a, i32 8, <16 x i1> %mask, <16 x half> undef)
@@ -322,17 +484,51 @@ define void @masked_load_v16f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
}
declare <16 x double> @llvm.masked.load.v16f64(ptr, i32, <16 x i1>, <16 x double>)
-define void @masked_load_v32f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
-; CHECK-LABEL: masked_load_v32f16:
+define void @masked_load_v32bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; CHECK-LABEL: masked_load_v32bf16:
; CHECK: # %bb.0:
; CHECK-NEXT: li a3, 32
; CHECK-NEXT: vsetvli zero, a3, e16, m4, ta, ma
; CHECK-NEXT: vle16.v v8, (a1)
-; CHECK-NEXT: fmv.h.x fa5, zero
-; CHECK-NEXT: vmfeq.vf v0, v8, fa5
+; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8
+; CHECK-NEXT: fmv.w.x fa5, zero
+; CHECK-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; CHECK-NEXT: vmfeq.vf v0, v16, fa5
; CHECK-NEXT: vle16.v v8, (a0), v0.t
; CHECK-NEXT: vse16.v v8, (a2)
; CHECK-NEXT: ret
+ %m = load <32 x bfloat>, ptr %m_ptr
+ %mask = fcmp oeq <32 x bfloat> %m, zeroinitializer
+ %load = call <32 x bfloat> @llvm.masked.load.v32bf16(ptr %a, i32 8, <32 x i1> %mask, <32 x bfloat> undef)
+ store <32 x bfloat> %load, ptr %res_ptr
+ ret void
+}
+declare <32 x bfloat> @llvm.masked.load.v32bf16(ptr, i32, <32 x i1>, <32 x bfloat>)
+
+define void @masked_load_v32f16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; ZVFH-LABEL: masked_load_v32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: li a3, 32
+; ZVFH-NEXT: vsetvli zero, a3, e16, m4, ta, ma
+; ZVFH-NEXT: vle16.v v8, (a1)
+; ZVFH-NEXT: fmv.h.x fa5, zero
+; ZVFH-NEXT: vmfeq.vf v0, v8, fa5
+; ZVFH-NEXT: vle16.v v8, (a0), v0.t
+; ZVFH-NEXT: vse16.v v8, (a2)
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: masked_load_v32f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: li a3, 32
+; ZVFHMIN-NEXT: vsetvli zero, a3, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vle16.v v8, (a1)
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: fmv.w.x fa5, zero
+; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vf v0, v16, fa5
+; ZVFHMIN-NEXT: vle16.v v8, (a0), v0.t
+; ZVFHMIN-NEXT: vse16.v v8, (a2)
+; ZVFHMIN-NEXT: ret
%m = load <32 x half>, ptr %m_ptr
%mask = fcmp oeq <32 x half> %m, zeroinitializer
%load = call <32 x half> @llvm.masked.load.v32f16(ptr %a, i32 8, <32 x i1> %mask, <32 x half> undef)
@@ -404,17 +600,1477 @@ define void @masked_load_v32f64(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
}
declare <32 x double> @llvm.masked.load.v32f64(ptr, i32, <32 x i1>, <32 x double>)
+define void @masked_load_v64bf16(ptr %a, ptr %m_ptr, ptr %res_ptr) nounwind {
+; RV32-LABEL: masked_load_v64bf16:
+; RV32: # %bb.0:
+; RV32-NEXT: addi sp, sp, -384
+; RV32-NEXT: sw ra, 380(sp) # 4-byte Folded Spill
+; RV32-NEXT: sw s0, 376(sp) # 4-byte Folded Spill
+; RV32-NEXT: addi s0, sp, 384
+; RV32-NEXT: andi sp, sp, -128
+; RV32-NEXT: li a3, 64
+; RV32-NEXT: vsetvli zero, a3, e16, m8, ta, ma
+; RV32-NEXT: vle16.v v8, (a1)
+; RV32-NEXT: addi a1, sp, 128
+; RV32-NEXT: vse16.v v8, (a1)
+; RV32-NEXT: lh a1, 192(sp)
+; RV32-NEXT: fmv.h.x fa5, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa5
+; RV32-NEXT: fmv.w.x fa5, zero
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 96(sp)
+; RV32-NEXT: lh a1, 190(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 95(sp)
+; RV32-NEXT: lh a1, 188(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 94(sp)
+; RV32-NEXT: lh a1, 186(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 93(sp)
+; RV32-NEXT: lh a1, 184(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 92(sp)
+; RV32-NEXT: lh a1, 182(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 91(sp)
+; RV32-NEXT: lh a1, 180(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 90(sp)
+; RV32-NEXT: lh a1, 178(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 89(sp)
+; RV32-NEXT: lh a1, 176(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 88(sp)
+; RV32-NEXT: lh a1, 174(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 87(sp)
+; RV32-NEXT: lh a1, 172(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 86(sp)
+; RV32-NEXT: lh a1, 170(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 85(sp)
+; RV32-NEXT: lh a1, 168(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 84(sp)
+; RV32-NEXT: lh a1, 166(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 83(sp)
+; RV32-NEXT: lh a1, 164(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 82(sp)
+; RV32-NEXT: lh a1, 162(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 fa4, fa4
+; RV32-NEXT: feq.s a1, fa4, fa5
+; RV32-NEXT: sb a1, 81(sp)
+; RV32-NEXT: lh a1, 160(sp)
+; RV32-NEXT: fmv.h.x fa4, a1
+; RV32-NEXT: fcvt.s.bf16 ...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/115145
More information about the llvm-commits mailing list