[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)

Fri Dec 13 01:20:53 PST 2024

================
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve2,+fp8,+fp8fma  < %s | FileCheck %s
+; RUN: llc -mattr=+sme,+fp8,+ssve-fp8fma --force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define <vscale x 8 x half> @fmla_2way_bot(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fmla_2way_bot:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlalb z0.h, z1.b, z2.b
+; CHECK-NEXT:    ret
+    %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2)
+    ret <vscale x 8 x half> %r
+}
+
+define <vscale x 8 x half> @fmla_2way_top(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fmla_2way_top:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlalt z0.h, z1.b, z2.b
+; CHECK-NEXT:    ret
+    %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2)
+    ret <vscale x 8 x half> %r
+}
+
+define <vscale x 8 x half> @fdot_2way_bot_lane(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fdot_2way_bot_lane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlalb z0.h, z1.b, z2.b[3]
+; CHECK-NEXT:    ret
+    %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.lane.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3)
+    ret <vscale x 8 x half> %r
+}
+
+define <vscale x 8 x half> @fdot_2way_top_lane(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
----------------
momchil-velikov wrote:

Done

https://github.com/llvm/llvm-project/pull/118126