[clang] [llvm] [AArch64] Implement FP8 SVE intrinsics for fused multiply-add (PR #118126)
Momchil Velikov via cfe-commits
cfe-commits at lists.llvm.org
Fri Nov 29 11:31:24 PST 2024
https://github.com/momchil-velikov created https://github.com/llvm/llvm-project/pull/118126
None
>From 3a9643e6c2d61eae2e23df42c19b1410d4a5fcc5 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Thu, 21 Nov 2024 11:21:29 +0000
Subject: [PATCH 1/4] FP8 CVT/CVTL
---
clang/include/clang/Basic/arm_sve.td | 10 +
.../fp8-intrinsics/acle_sve2_fp8_cvt.c | 173 ++++++++++++++++++
.../aarch64-sve2-intrinsics/acle_sve2_fp8.c | 24 +++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 17 ++
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 16 +-
llvm/lib/Target/AArch64/SVEInstrFormats.td | 7 +-
.../test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll | 78 ++++++++
7 files changed, 316 insertions(+), 9 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c
create mode 100644 clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c
create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index b36e592042da0b..b9d8360843aa8e 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2447,3 +2447,13 @@ let SVETargetGuard = "sve2,faminmax", SMETargetGuard = "sme2,faminmax" in {
defm SVAMIN : SInstZPZZ<"svamin", "hfd", "aarch64_sve_famin", "aarch64_sve_famin_u">;
defm SVAMAX : SInstZPZZ<"svamax", "hfd", "aarch64_sve_famax", "aarch64_sve_famax_u">;
}
+
+let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in {
+ // 8-bit floating-point convert to BFloat16/Float16
+ def SVF1CVT : SInst<"svcvt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt1", [VerifyRuntimeMode, SetsFPMR]>;
+ def SVF2CVT : SInst<"svcvt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvt2", [VerifyRuntimeMode, SetsFPMR]>;
+
+ // 8-bit floating-point convert to BFloat16/Float16 (top)
+ def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>;
+ def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>;
+}
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c
new file mode 100644
index 00000000000000..c026b8aa216f32
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvt.c
@@ -0,0 +1,173 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX
+
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -x c++ -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#ifdef __ARM_FEATURE_SME
+#include <arm_sme.h>
+#else
+#include <arm_sve.h>
+#endif
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+#ifdef __ARM_FEATURE_SME
+#define STREAMING __arm_streaming
+#else
+#define STREAMING
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svcvt1_bf16_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvt1.nxv8bf16(<vscale x 16 x i8> [[ZN]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x bfloat> @_Z20test_svcvt1_bf16_mf8u13__SVMfloat8_tm(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvt1.nxv8bf16(<vscale x 16 x i8> [[ZN]])
+// CHECK-CXX-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svcvt1_bf16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svcvt1_bf16,_mf8,_fpm)(zn, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svcvt2_bf16_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvt2.nxv8bf16(<vscale x 16 x i8> [[ZN]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x bfloat> @_Z20test_svcvt2_bf16_mf8u13__SVMfloat8_tm(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvt2.nxv8bf16(<vscale x 16 x i8> [[ZN]])
+// CHECK-CXX-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svcvt2_bf16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svcvt2_bf16,_mf8,_fpm)(zn, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svcvtlt1_bf16_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvtlt1.nxv8bf16(<vscale x 16 x i8> [[ZN]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x bfloat> @_Z22test_svcvtlt1_bf16_mf8u13__SVMfloat8_tm(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvtlt1.nxv8bf16(<vscale x 16 x i8> [[ZN]])
+// CHECK-CXX-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svcvtlt1_bf16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svcvtlt1_bf16,_mf8,_fpm)(zn, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svcvtlt2_bf16_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvtlt2.nxv8bf16(<vscale x 16 x i8> [[ZN]])
+// CHECK-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x bfloat> @_Z22test_svcvtlt2_bf16_mf8u13__SVMfloat8_tm(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvtlt2.nxv8bf16(<vscale x 16 x i8> [[ZN]])
+// CHECK-CXX-NEXT: ret <vscale x 8 x bfloat> [[TMP0]]
+//
+svbfloat16_t test_svcvtlt2_bf16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svcvtlt2_bf16,_mf8,_fpm)(zn, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svcvt1_f16_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvt1.nxv8f16(<vscale x 16 x i8> [[ZN]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z19test_svcvt1_f16_mf8u13__SVMfloat8_tm(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvt1.nxv8f16(<vscale x 16 x i8> [[ZN]])
+// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svcvt1_f16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svcvt1_f16,_mf8,_fpm)(zn, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svcvt2_f16_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvt2.nxv8f16(<vscale x 16 x i8> [[ZN]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z19test_svcvt2_f16_mf8u13__SVMfloat8_tm(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvt2.nxv8f16(<vscale x 16 x i8> [[ZN]])
+// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svcvt2_f16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svcvt2_f16,_mf8,_fpm)(zn, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svcvtlt1_f16_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvtlt1.nxv8f16(<vscale x 16 x i8> [[ZN]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z21test_svcvtlt1_f16_mf8u13__SVMfloat8_tm(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvtlt1.nxv8f16(<vscale x 16 x i8> [[ZN]])
+// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svcvtlt1_f16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svcvtlt1_f16,_mf8,_fpm)(zn, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svcvtlt2_f16_mf8(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvtlt2.nxv8f16(<vscale x 16 x i8> [[ZN]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z21test_svcvtlt2_f16_mf8u13__SVMfloat8_tm(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZN:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvtlt2.nxv8f16(<vscale x 16 x i8> [[ZN]])
+// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svcvtlt2_f16_mf8(svmfloat8_t zn, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svcvtlt2_f16,_mf8,_fpm)(zn, fpm);
+}
diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c
new file mode 100644
index 00000000000000..aafd42f798d935
--- /dev/null
+++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c
@@ -0,0 +1,24 @@
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -emit-llvm %s
+
+#include <arm_sve.h>
+
+void test_features(svmfloat8_t zn, fpm_t fpm) {
+ svcvt1_bf16_mf8_fpm(zn, fpm);
+ // expected-error at -1 {{'svcvt1_bf16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
+ svcvt2_bf16_mf8_fpm(zn, fpm);
+ // expected-error at -1 {{'svcvt2_bf16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
+ svcvtlt1_bf16_mf8_fpm(zn, fpm);
+ // expected-error at -1 {{'svcvtlt1_bf16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
+ svcvtlt2_bf16_mf8_fpm(zn, fpm);
+ // expected-error at -1 {{'svcvtlt2_bf16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
+ svcvt1_f16_mf8_fpm(zn, fpm);
+ // expected-error at -1 {{'svcvt1_f16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
+ svcvt2_f16_mf8_fpm(zn, fpm);
+ // expected-error at -1 {{'svcvt2_f16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
+ svcvtlt1_f16_mf8_fpm(zn, fpm);
+ // expected-error at -1 {{'svcvtlt1_f16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
+ svcvtlt2_f16_mf8_fpm(zn, fpm);
+ // expected-error at -1 {{'svcvtlt2_f16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index a91616b9556828..13bc5e08d2756f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3864,3 +3864,20 @@ def int_aarch64_sve_famin_u : AdvSIMD_Pred2VectorArg_Intrinsic;
// Neon absolute maximum and minimum
def int_aarch64_neon_famax : AdvSIMD_2VectorArg_Intrinsic;
def int_aarch64_neon_famin : AdvSIMD_2VectorArg_Intrinsic;
+
+//
+// FP8 intrinsics
+//
+let TargetPrefix = "aarch64" in {
+
+// Conversions
+class SVE2_FP8_Cvt
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [llvm_nxv16i8_ty],
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
+
+def int_aarch64_sve_fp8_cvt1 : SVE2_FP8_Cvt;
+def int_aarch64_sve_fp8_cvt2 : SVE2_FP8_Cvt;
+def int_aarch64_sve_fp8_cvtlt1 : SVE2_FP8_Cvt;
+def int_aarch64_sve_fp8_cvtlt2 : SVE2_FP8_Cvt;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index fb0eb7a80c6d72..5365a00f3f42ee 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4369,14 +4369,14 @@ let Predicates = [HasNonStreamingSVE2p2orSME2p2] in {
//===----------------------------------------------------------------------===//
let Predicates = [HasSVE2orSME2, HasFP8] in {
// FP8 upconvert
-defm F1CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b00, "f1cvt">;
-defm F2CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b01, "f2cvt">;
-defm BF1CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b10, "bf1cvt">;
-defm BF2CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b11, "bf2cvt">;
-defm F1CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b00, "f1cvtlt">;
-defm F2CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b01, "f2cvtlt">;
-defm BF1CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b10, "bf1cvtlt">;
-defm BF2CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b11, "bf2cvtlt">;
+defm F1CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b00, "f1cvt", nxv8f16, int_aarch64_sve_fp8_cvt1>;
+defm F2CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b01, "f2cvt", nxv8f16, int_aarch64_sve_fp8_cvt2>;
+defm BF1CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b10, "bf1cvt", nxv8bf16, int_aarch64_sve_fp8_cvt1>;
+defm BF2CVT_ZZ : sve2_fp8_cvt_single<0b0, 0b11, "bf2cvt", nxv8bf16, int_aarch64_sve_fp8_cvt2>;
+defm F1CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b00, "f1cvtlt", nxv8f16, int_aarch64_sve_fp8_cvtlt1>;
+defm F2CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b01, "f2cvtlt", nxv8f16, int_aarch64_sve_fp8_cvtlt2>;
+defm BF1CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b10, "bf1cvtlt", nxv8bf16, int_aarch64_sve_fp8_cvtlt1>;
+defm BF2CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b11, "bf2cvtlt", nxv8bf16, int_aarch64_sve_fp8_cvtlt2>;
// FP8 downconvert
defm FCVTN_Z2Z_HtoB : sve2_fp8_down_cvt_single<0b00, "fcvtn", ZZ_h_mul_r>;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index faaaca3f28d758..58770bf6e274d3 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -10733,10 +10733,15 @@ class sve2_fp8_cvt_single<bit L, bits<2> opc, string mnemonic,
let Inst{9-5} = Zn;
let Inst{4-0} = Zd;
let Uses = [FPMR, FPCR];
+
+ let mayLoad = 1;
+ let mayStore = 0;
}
-multiclass sve2_fp8_cvt_single<bit L, bits<2> opc, string mnemonic> {
+multiclass sve2_fp8_cvt_single<bit L, bits<2> opc, string mnemonic, ValueType vtd, SDPatternOperator op> {
def _BtoH : sve2_fp8_cvt_single<L, opc, mnemonic, ZPR16, ZPR8>;
+
+ def : SVE_1_Op_Pat<vtd, op, nxv16i8, !cast<Instruction>(NAME # _BtoH)>;
}
// FP8 downconvert
diff --git a/llvm/test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll b/llvm/test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll
new file mode 100644
index 00000000000000..bf0030462db98f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp8-sve-cvt-cvtlt.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+bf16,+sve2,+fp8 < %s | FileCheck %s
+; RUN: llc -mattr=+bf16,+sme2,+fp8 --force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define <vscale x 8 x bfloat> @cvt1_bf16(<vscale x 16 x i8> %s) {
+; CHECK-LABEL: cvt1_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bf1cvt z0.h, z0.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvt1.nxv8bf16(<vscale x 16 x i8> %s)
+ ret <vscale x 8 x bfloat> %r
+}
+
+define <vscale x 8 x bfloat> @cvt2_bf16(<vscale x 16 x i8> %s) {
+; CHECK-LABEL: cvt2_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bf2cvt z0.h, z0.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvt2.nxv8bf16(<vscale x 16 x i8> %s)
+ ret <vscale x 8 x bfloat> %r
+}
+
+define <vscale x 8 x bfloat> @cvtlt1_bf16(<vscale x 16 x i8> %s) {
+; CHECK-LABEL: cvtlt1_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bf1cvtlt z0.h, z0.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvtlt1.nxv8bf16(<vscale x 16 x i8> %s)
+ ret <vscale x 8 x bfloat> %r
+}
+
+define <vscale x 8 x bfloat> @cvtlt2_bf16(<vscale x 16 x i8> %s) {
+; CHECK-LABEL: cvtlt2_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bf2cvtlt z0.h, z0.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x bfloat> @llvm.aarch64.sve.fp8.cvtlt2.nxv8bf16(<vscale x 16 x i8> %s)
+ ret <vscale x 8 x bfloat> %r
+}
+
+define <vscale x 8 x half> @cvt1_f16(<vscale x 16 x i8> %s) {
+; CHECK-LABEL: cvt1_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: f1cvt z0.h, z0.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvt1.nxv8f16(<vscale x 16 x i8> %s)
+ ret <vscale x 8 x half> %r
+}
+
+define <vscale x 8 x half> @cvt2_f16(<vscale x 16 x i8> %s) {
+; CHECK-LABEL: cvt2_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: f2cvt z0.h, z0.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvt2.nxv8f16(<vscale x 16 x i8> %s)
+ ret <vscale x 8 x half> %r
+}
+
+
+define <vscale x 8 x half> @cvtlt1_f16(<vscale x 16 x i8> %s) {
+; CHECK-LABEL: cvtlt1_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: f1cvtlt z0.h, z0.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvtlt1.nxv8f16(<vscale x 16 x i8> %s)
+ ret <vscale x 8 x half> %r
+}
+
+define <vscale x 8 x half> @cvtlt2_f16(<vscale x 16 x i8> %s) {
+; CHECK-LABEL: cvtlt2_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: f2cvtlt z0.h, z0.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.cvtlt2.nxv8f16(<vscale x 16 x i8> %s)
+ ret <vscale x 8 x half> %r
+}
>From 12582745c36d948bb37f717d6967f2d71ba372e8 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Mon, 25 Nov 2024 09:47:41 +0000
Subject: [PATCH 2/4] FP8 CVTN
---
clang/include/clang/Basic/arm_sve.td | 7 ++
.../fp8-intrinsics/acle_sve2_fp8_cvtn.c | 101 ++++++++++++++++++
.../aarch64-sve2-intrinsics/acle_sve2_fp8.c | 11 +-
llvm/include/llvm/IR/IntrinsicsAArch64.td | 13 +++
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 9 +-
llvm/lib/Target/AArch64/SVEInstrFormats.td | 37 ++++++-
llvm/test/CodeGen/AArch64/fp8-sve-cvtn.ll | 49 +++++++++
7 files changed, 221 insertions(+), 6 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvtn.c
create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-cvtn.ll
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index b9d8360843aa8e..efba2d4671d819 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2456,4 +2456,11 @@ let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in {
// 8-bit floating-point convert to BFloat16/Float16 (top)
def SVF1CVTLT : SInst<"svcvtlt1_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt1", [VerifyRuntimeMode, SetsFPMR]>;
def SVF2CVTLT : SInst<"svcvtlt2_{d}[_mf8]_fpm", "d~>", "bh", MergeNone, "aarch64_sve_fp8_cvtlt2", [VerifyRuntimeMode, SetsFPMR]>;
+
+ // BFloat16/Float16 convert, narrow and interleave to 8-bit floating-point
+ def SVFCVTN : SInst<"svcvtn_mf8[_{d}_x2]_fpm", "~2>", "bh", MergeNone, "aarch64_sve_fp8_cvtn", [VerifyRuntimeMode, SetsFPMR]>;
+
+ // Single-precision convert, narrow and interleave to 8-bit floating-point (top and bottom)
+ def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>;
+ def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>;
}
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvtn.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvtn.c
new file mode 100644
index 00000000000000..ed5b0ce02af4bd
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_cvtn.c
@@ -0,0 +1,101 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX
+
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -x c++ -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#ifdef __ARM_FEATURE_SME
+#include <arm_sme.h>
+#else
+#include <arm_sve.h>
+#endif
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+#ifdef __ARM_FEATURE_SME
+#define STREAMING __arm_streaming
+#else
+#define STREAMING
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svcvtn_f8_bf16(
+// CHECK-SAME: <vscale x 8 x bfloat> [[ZN_ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8bf16(<vscale x 8 x bfloat> [[ZN_ZM_COERCE0]], <vscale x 8 x bfloat> [[ZN_ZM_COERCE1]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z19test_svcvtn_f8_bf1614svbfloat16x2_tm(
+// CHECK-CXX-SAME: <vscale x 8 x bfloat> [[ZN_ZM_COERCE0:%.*]], <vscale x 8 x bfloat> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8bf16(<vscale x 8 x bfloat> [[ZN_ZM_COERCE0]], <vscale x 8 x bfloat> [[ZN_ZM_COERCE1]])
+// CHECK-CXX-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svcvtn_f8_bf16(svbfloat16x2_t zn_zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svcvtn_mf8,_bf16_x2,_fpm)(zn_zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svcvtn_f8_f16(
+// CHECK-SAME: <vscale x 8 x half> [[ZN_ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8f16(<vscale x 8 x half> [[ZN_ZM_COERCE0]], <vscale x 8 x half> [[ZN_ZM_COERCE1]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z18test_svcvtn_f8_f1613svfloat16x2_tm(
+// CHECK-CXX-SAME: <vscale x 8 x half> [[ZN_ZM_COERCE0:%.*]], <vscale x 8 x half> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8f16(<vscale x 8 x half> [[ZN_ZM_COERCE0]], <vscale x 8 x half> [[ZN_ZM_COERCE1]])
+// CHECK-CXX-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svcvtn_f8_f16(svfloat16x2_t zn_zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svcvtn_mf8,_f16_x2,_fpm)(zn_zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svcvtnb_f8_f32(
+// CHECK-SAME: <vscale x 4 x float> [[ZN_ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnb.nxv4f32(<vscale x 4 x float> [[ZN_ZM_COERCE0]], <vscale x 4 x float> [[ZN_ZM_COERCE1]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z19test_svcvtnb_f8_f3213svfloat32x2_tm(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZN_ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnb.nxv4f32(<vscale x 4 x float> [[ZN_ZM_COERCE0]], <vscale x 4 x float> [[ZN_ZM_COERCE1]])
+// CHECK-CXX-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svcvtnb_f8_f32(svfloat32x2_t zn_zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svcvtnb_mf8,_f32_x2,_fpm)(zn_zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 16 x i8> @test_svcvtnt_f8_f32(
+// CHECK-SAME: <vscale x 16 x i8> [[ZD:%.*]], <vscale x 4 x float> [[ZN_ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnt.nxv4f32(<vscale x 16 x i8> [[ZD]], <vscale x 4 x float> [[ZN_ZM_COERCE0]], <vscale x 4 x float> [[ZN_ZM_COERCE1]])
+// CHECK-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 16 x i8> @_Z19test_svcvtnt_f8_f32u13__SVMfloat8_t13svfloat32x2_tm(
+// CHECK-CXX-SAME: <vscale x 16 x i8> [[ZD:%.*]], <vscale x 4 x float> [[ZN_ZM_COERCE0:%.*]], <vscale x 4 x float> [[ZN_ZM_COERCE1:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnt.nxv4f32(<vscale x 16 x i8> [[ZD]], <vscale x 4 x float> [[ZN_ZM_COERCE0]], <vscale x 4 x float> [[ZN_ZM_COERCE1]])
+// CHECK-CXX-NEXT: ret <vscale x 16 x i8> [[TMP0]]
+//
+svmfloat8_t test_svcvtnt_f8_f32(svmfloat8_t zd, svfloat32x2_t zn_zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svcvtnt_mf8,_f32_x2,_fpm)(zd, zn_zm, fpm);
+}
diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c
index aafd42f798d935..803bdd5f40183a 100644
--- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c
+++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c
@@ -1,6 +1,6 @@
// REQUIRES: aarch64-registered-target
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -verify -emit-llvm %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -verify -emit-llvm %s
#include <arm_sve.h>
@@ -21,4 +21,13 @@ void test_features(svmfloat8_t zn, fpm_t fpm) {
// expected-error at -1 {{'svcvtlt1_f16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
svcvtlt2_f16_mf8_fpm(zn, fpm);
// expected-error at -1 {{'svcvtlt2_f16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
+
+ svcvtn_mf8_bf16_x2_fpm(svcreate2(svundef_bf16(), svundef_bf16()), fpm);
+ // expected-error at -1 {{'svcvtn_mf8_bf16_x2_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
+ svcvtn_mf8_f16_x2_fpm(svcreate2(svundef_f16(), svundef_f16()), fpm);
+ // expected-error at -1 {{'svcvtn_mf8_f16_x2_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
+ svcvtnb_mf8_f32_x2_fpm(svcreate2(svundef_f32(), svundef_f32()), fpm);
+ // expected-error at -1 {{'svcvtnb_mf8_f32_x2_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
+ svcvtnt_mf8_f32_x2_fpm(zn, svcreate2(svundef_f32(), svundef_f32()), fpm);
+ // expected-error at -1 {{'svcvtnt_mf8_f32_x2_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 13bc5e08d2756f..622dbd9e5c97db 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3880,4 +3880,17 @@ def int_aarch64_sve_fp8_cvt1 : SVE2_FP8_Cvt;
def int_aarch64_sve_fp8_cvt2 : SVE2_FP8_Cvt;
def int_aarch64_sve_fp8_cvtlt1 : SVE2_FP8_Cvt;
def int_aarch64_sve_fp8_cvtlt2 : SVE2_FP8_Cvt;
+
+class SVE2_FP8_Narrow_Cvt
+ : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
+ [llvm_anyvector_ty, LLVMMatchType<0>],
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
+
+def int_aarch64_sve_fp8_cvtn : SVE2_FP8_Narrow_Cvt;
+def int_aarch64_sve_fp8_cvtnb : SVE2_FP8_Narrow_Cvt;
+
+def int_aarch64_sve_fp8_cvtnt
+ : DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
+ [llvm_nxv16i8_ty, llvm_anyvector_ty, LLVMMatchType<0>],
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 5365a00f3f42ee..47d6418db549d7 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4379,10 +4379,11 @@ defm BF1CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b10, "bf1cvtlt", nxv8bf16, int_aar
defm BF2CVTLT_ZZ : sve2_fp8_cvt_single<0b1, 0b11, "bf2cvtlt", nxv8bf16, int_aarch64_sve_fp8_cvtlt2>;
// FP8 downconvert
-defm FCVTN_Z2Z_HtoB : sve2_fp8_down_cvt_single<0b00, "fcvtn", ZZ_h_mul_r>;
-defm FCVTNB_Z2Z_StoB : sve2_fp8_down_cvt_single<0b01, "fcvtnb", ZZ_s_mul_r>;
-defm BFCVTN_Z2Z_HtoB : sve2_fp8_down_cvt_single<0b10, "bfcvtn", ZZ_h_mul_r>;
-defm FCVTNT_Z2Z_StoB : sve2_fp8_down_cvt_single<0b11, "fcvtnt", ZZ_s_mul_r>;
+defm FCVTN_Z2Z_HtoB : sve2_fp8_down_cvt_single<0b00, "fcvtn", ZZ_h_mul_r, nxv8f16, int_aarch64_sve_fp8_cvtn>;
+defm FCVTNB_Z2Z_StoB : sve2_fp8_down_cvt_single<0b01, "fcvtnb", ZZ_s_mul_r, nxv4f32, int_aarch64_sve_fp8_cvtnb>;
+defm BFCVTN_Z2Z_HtoB : sve2_fp8_down_cvt_single<0b10, "bfcvtn", ZZ_h_mul_r, nxv8bf16, int_aarch64_sve_fp8_cvtn>;
+
+defm FCVTNT_Z2Z_StoB : sve2_fp8_down_cvt_single_top<0b11, "fcvtnt", ZZ_s_mul_r, nxv4f32, int_aarch64_sve_fp8_cvtnt>;
} // End HasSVE2orSME2, HasFP8
let Predicates = [HasSVE2orSME2, HasFAMINMAX] in {
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 58770bf6e274d3..95cb5ff8c4a03b 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -10758,10 +10758,45 @@ class sve2_fp8_down_cvt_single<bits<2> opc, string mnemonic,
let Inst{5} = 0b0;
let Inst{4-0} = Zd;
let Uses = [FPMR, FPCR];
+
+ let mayLoad = 1;
+ let mayStore = 0;
}
-multiclass sve2_fp8_down_cvt_single<bits<2> opc, string mnemonic, RegisterOperand src> {
+multiclass sve2_fp8_down_cvt_single<bits<2> opc, string mnemonic, RegisterOperand src,
+ ValueType ty, SDPatternOperator op> {
def NAME : sve2_fp8_down_cvt_single<opc, mnemonic, ZPR8, src>;
+
+ def : Pat<(nxv16i8 (op ty:$Zn1, ty:$Zn2)),
+ (!cast<Instruction>(NAME) (REG_SEQUENCE ZPR2Mul2, $Zn1, zsub0, $Zn2, zsub1))>;
+}
+
+class sve2_fp8_down_cvt_single_top<bits<2> opc, string mnemonic, RegisterOperand src_ty>
+ : I<(outs ZPR8:$Zd), (ins ZPR8:$_Zd, src_ty:$Zn), mnemonic, "\t$Zd, $Zn","", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<4> Zn;
+
+ let Inst{31-12} = 0b01100101000010100011;
+ let Inst{11-10} = opc;
+ let Inst{9-6} = Zn;
+ let Inst{5} = 0b0;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+ let DestructiveInstType = DestructiveOther;
+ let ElementSize = ZPR8.ElementSize;
+
+ let Uses = [FPMR, FPCR];
+ let mayLoad = 1;
+ let mayStore = 0;
+}
+
+multiclass sve2_fp8_down_cvt_single_top<bits<2> opc, string mnemonic, RegisterOperand src_ty,
+ ValueType ty, SDPatternOperator op> {
+ def NAME : sve2_fp8_down_cvt_single_top<opc, mnemonic, src_ty>;
+
+ def : Pat<(nxv16i8 (op nxv16i8:$Zd, ty:$Zn1, ty:$Zn2)),
+ (!cast<Instruction>(NAME) $Zd, (REG_SEQUENCE ZPR2Mul2, $Zn1, zsub0, $Zn2, zsub1))>;
}
// FP8 Widening Multiply-Add Long - Indexed Group
diff --git a/llvm/test/CodeGen/AArch64/fp8-sve-cvtn.ll b/llvm/test/CodeGen/AArch64/fp8-sve-cvtn.ll
new file mode 100644
index 00000000000000..7daeafc7916043
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp8-sve-cvtn.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+bf16,+sve2,+fp8 < %s | FileCheck %s
+; RUN: llc -mattr=+bf16,+sme2,+fp8 --force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define <vscale x 16 x i8> @cvtn_bf16(<vscale x 8 x bfloat> %s1, <vscale x 8 x bfloat> %s2) {
+; CHECK-LABEL: cvtn_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: bfcvtn z0.b, { z0.h, z1.h }
+; CHECK-NEXT: ret
+ %r = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8bf16(<vscale x 8 x bfloat> %s1, <vscale x 8 x bfloat> %s2)
+ ret <vscale x 16 x i8> %r
+}
+
+define <vscale x 16 x i8> @cvtn_f16(<vscale x 8 x half> %s1, <vscale x 8 x half> %s2) {
+; CHECK-LABEL: cvtn_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fcvtn z0.b, { z0.h, z1.h }
+; CHECK-NEXT: ret
+ %r = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtn.nxv8f16(<vscale x 8 x half> %s1, <vscale x 8 x half> %s2)
+ ret <vscale x 16 x i8> %r
+}
+
+define <vscale x 16 x i8> @cvtnb_f32(<vscale x 4 x float> %s1, <vscale x 4 x float> %s2) {
+; CHECK-LABEL: cvtnb_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fcvtnb z0.b, { z0.s, z1.s }
+; CHECK-NEXT: ret
+ %r = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnb.nxv4f32(<vscale x 4 x float> %s1, <vscale x 4 x float> %s2)
+ ret <vscale x 16 x i8> %r
+}
+
+define <vscale x 16 x i8> @cvtnt_f32(<vscale x 16 x i8> %d, <vscale x 4 x float> %s1, <vscale x 4 x float> %s2) {
+; CHECK-LABEL: cvtnt_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: fcvtnt z0.b, { z2.s, z3.s }
+; CHECK-NEXT: ret
+ %r = call <vscale x 16 x i8> @llvm.aarch64.sve.fp8.cvtnt.nxv4f32(<vscale x 16 x i8> %d, <vscale x 4 x float> %s1, <vscale x 4 x float> %s2)
+ ret <vscale x 16 x i8> %r
+}
>From 7d36ccc26c5f131b99464b2273b191e096acb3ca Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Mon, 25 Nov 2024 17:21:55 +0000
Subject: [PATCH 3/4] FP8 FDOT
---
clang/include/clang/Basic/arm_sve.td | 19 +++
clang/include/clang/Basic/arm_sve_sme_incl.td | 1 +
clang/lib/CodeGen/CGBuiltin.cpp | 11 +-
.../fp8-intrinsics/acle_sve2_fp8_fdot.c | 149 ++++++++++++++++++
.../aarch64-sve2-intrinsics/acle_sve2_fp8.c | 23 ++-
clang/utils/TableGen/SveEmitter.cpp | 11 +-
llvm/include/llvm/IR/IntrinsicsAArch64.td | 17 ++
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 9 +-
llvm/lib/Target/AArch64/SVEInstrFormats.td | 29 +++-
llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll | 41 +++++
10 files changed, 296 insertions(+), 14 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c
create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index efba2d4671d819..65f98b614ecf0a 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2464,3 +2464,22 @@ let SVETargetGuard = "sve2,fp8", SMETargetGuard = "sme2,fp8" in {
def SVFCVTNB : SInst<"svcvtnb_mf8[_f32_x2]_fpm", "~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnb", [VerifyRuntimeMode, SetsFPMR]>;
def SVFCVTNT : SInst<"svcvtnt_mf8[_f32_x2]_fpm", "~~2>", "f", MergeNone, "aarch64_sve_fp8_cvtnt", [VerifyRuntimeMode, SetsFPMR]>;
}
+
+let SVETargetGuard = "sve2,fp8dot2", SMETargetGuard ="sme,ssve-fp8dot2" in {
+ // 8-bit floating-point dot product to half-precision (vectors)
+ def SVFDOT_2WAY : SInst<"svdot[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+ def SVFDOT_N_2WAY : SInst<"svdot[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+
+ // 8-bit floating-point dot product to half-precision (indexed)
+ def SVFDOT_LANE_2WAY : SInst<"svdot_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>;
+}
+
+let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in {
+ // 8-bit floating-point dot product to single-precision (vectors)
+ def SVFDOT_4WAY : SInst<"svdot[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+ def SVFDOT_N_4WAY : SInst<"svdot[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fdot", [VerifyRuntimeMode, SetsFPMR]>;
+
+ // 8-bit floating-point dot product to single-precision (indexed)
+ def SVFDOT_LANE_4WAY : SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>;
+}
+
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td
index de10be7bdce0db..44201b15505599 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -105,6 +105,7 @@ include "arm_immcheck_incl.td"
// N: svfloat64_t
// $: svbfloat16_t
// ~: svmfloat8_t
+// !: mfloat8_t (splat to svmfloat8_t)
// J: Prefetch type (sv_prfop)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index cb9c23b8e0a0d0..9f9beae3059cc9 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10688,7 +10688,16 @@ Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) {
cast<llvm::VectorType>(Ty)->getElementCount(), Scalar);
}
-Value *CodeGenFunction::EmitSVEDupX(Value* Scalar) {
+Value *CodeGenFunction::EmitSVEDupX(Value *Scalar) {
+ if (auto *Ty = Scalar->getType(); Ty->isVectorTy()) {
+#ifndef NDEBUG
+ auto *VecTy = cast<llvm::VectorType>(Ty);
+ ElementCount EC = VecTy->getElementCount();
+ assert(EC.isScalar() && VecTy->getElementType() == Int8Ty &&
+ "Only <1 x i8> expected");
+#endif
+ Scalar = Builder.CreateExtractElement(Scalar, uint64_t(0));
+ }
return EmitSVEDupX(Scalar, getSVEVectorForElementType(Scalar->getType()));
}
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c
new file mode 100644
index 00000000000000..950a19115811ec
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c
@@ -0,0 +1,149 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX
+
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -x c++ -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -target-feature +ssve-fp8dot2 -target-feature +ssve-fp8dot4 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#ifdef __ARM_FEATURE_SME
+#include <arm_sme.h>
+#else
+#include <arm_sve.h>
+#endif
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+#ifdef __ARM_FEATURE_SME
+#define STREAMING __arm_streaming
+#else
+#define STREAMING
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svdot_f32_mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z18test_svdot_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_svdot_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svdot,_f32_mf8,_fpm)(zda, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svdot_n_f32_mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z20test_svdot_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+svfloat32_t test_svdot_n_f32_mf8(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svdot,_n_f32_mf8,_fpm)(zda, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svdot_f16_mf8(
+// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z18test_svdot_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svdot_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svdot,_f16_mf8,_fpm)(zda, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svdot_n_f16_mf8(
+// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z20test_svdot_n_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tu6__mfp8m(
+// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP1]]
+//
+svfloat16_t test_svdot_n_f16_mf8(svfloat16_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svdot,_n_f16_mf8,_fpm)(zda, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svdot_lane_f32_mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 3)
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z23test_svdot_lane_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 3)
+// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_svdot_lane_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svdot_lane,_f32_mf8,_fpm)(zda, zn, zm, 3, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svdot_lane_f16_mf8(
+// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.lane.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z23test_svdot_lane_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.lane.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svdot_lane_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svdot_lane,_f16_mf8,_fpm)(zda, zn, zm, 7, fpm);
+}
diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c
index 803bdd5f40183a..033b1caca3998e 100644
--- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c
+++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c
@@ -4,7 +4,7 @@
#include <arm_sve.h>
-void test_features(svmfloat8_t zn, fpm_t fpm) {
+void test_features(svmfloat8_t zn, svmfloat8_t zm, mfloat8_t x, fpm_t fpm) {
svcvt1_bf16_mf8_fpm(zn, fpm);
// expected-error at -1 {{'svcvt1_bf16_mf8_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
svcvt2_bf16_mf8_fpm(zn, fpm);
@@ -30,4 +30,25 @@ void test_features(svmfloat8_t zn, fpm_t fpm) {
// expected-error at -1 {{'svcvtnb_mf8_f32_x2_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
svcvtnt_mf8_f32_x2_fpm(zn, svcreate2(svundef_f32(), svundef_f32()), fpm);
// expected-error at -1 {{'svcvtnt_mf8_f32_x2_fpm' needs target feature (sve,sve2,fp8)|(sme,sme2,fp8)}}
+
+ svdot_f32_mf8_fpm(svundef_f32(), zn, zm, fpm);
+// expected-error at -1 {{'svdot_f32_mf8_fpm' needs target feature (sve,sve2,fp8dot4)|(sme,ssve-fp8dot4)}}
+ svdot_n_f32_mf8_fpm(svundef_f32(), zn, x, fpm);
+// expected-error at -1 {{'svdot_n_f32_mf8_fpm' needs target feature (sve,sve2,fp8dot4)|(sme,ssve-fp8dot4)}}
+ svdot_f16_mf8_fpm(svundef_f16(), zn, zm, fpm);
+// expected-error at -1 {{'svdot_f16_mf8_fpm' needs target feature (sve,sve2,fp8dot2)|(sme,ssve-fp8dot2)}}
+ svdot_n_f16_mf8_fpm(svundef_f16(), zn, x, fpm);
+// expected-error at -1 {{'svdot_n_f16_mf8_fpm' needs target feature (sve,sve2,fp8dot2)|(sme,ssve-fp8dot2)}}
+ svdot_lane_f32_mf8_fpm(svundef_f32(), zn, zm, 3, fpm);
+// expected-error at -1 {{'svdot_lane_f32_mf8_fpm' needs target feature (sve,sve2,fp8dot4)|(sme,ssve-fp8dot4)}}
+ svdot_lane_f16_mf8_fpm(svundef_f16(), zn, zm, 7, fpm);
+// expected-error at -1 {{'svdot_lane_f16_mf8_fpm' needs target feature (sve,sve2,fp8dot2)|(sme,ssve-fp8dot2)}}
}
+
+
+void test_imm_range(svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) {
+ svdot_lane_f32_mf8_fpm(svundef_f32(), zn, zm, -1, fpm);
+// expected-error at -1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ svdot_lane_f16_mf8_fpm(svundef_f16(), zn, zm, -1, fpm);
+// expected-error at -1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+}
\ No newline at end of file
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index e9fa01ea98dced..9a25acad623164 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -239,7 +239,7 @@ class Intrinsic {
/// Return true if the intrinsic takes a splat operand.
bool hasSplat() const {
// These prototype modifiers are described in arm_sve.td.
- return Proto.find_first_of("ajfrKLR@") != std::string::npos;
+ return Proto.find_first_of("ajfrKLR@!") != std::string::npos;
}
/// Return the parameter index of the splat operand.
@@ -248,7 +248,7 @@ class Intrinsic {
for (; I < Proto.size(); ++I, ++Param) {
if (Proto[I] == 'a' || Proto[I] == 'j' || Proto[I] == 'f' ||
Proto[I] == 'r' || Proto[I] == 'K' || Proto[I] == 'L' ||
- Proto[I] == 'R' || Proto[I] == '@')
+ Proto[I] == 'R' || Proto[I] == '@' || Proto[I] == '!')
break;
// Multivector modifier can be skipped
@@ -939,6 +939,13 @@ void SVEType::applyModifier(char Mod) {
MFloat = true;
ElementBitwidth = 8;
break;
+ case '!':
+ Float = false;
+ BFloat = false;
+ MFloat = true;
+ Bitwidth = ElementBitwidth = 8;
+ NumVectors = 0;
+ break;
case '.':
llvm_unreachable(". is never a type in itself");
break;
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 622dbd9e5c97db..65cac7de8e4d55 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3893,4 +3893,21 @@ def int_aarch64_sve_fp8_cvtnt
: DefaultAttrsIntrinsic<[llvm_nxv16i8_ty],
[llvm_nxv16i8_ty, llvm_anyvector_ty, LLVMMatchType<0>],
[IntrReadMem, IntrInaccessibleMemOnly]>;
+
+
+// Dot product
+class SVE2_FP8_FMLA_FDOT
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ llvm_nxv16i8_ty, llvm_nxv16i8_ty],
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
+
+class SVE2_FP8_FMLA_FDOT_Lane
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_i32_ty],
+ [IntrReadMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<3>>]>;
+
+def int_aarch64_sve_fp8_fdot : SVE2_FP8_FMLA_FDOT;
+def int_aarch64_sve_fp8_fdot_lane : SVE2_FP8_FMLA_FDOT_Lane;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 47d6418db549d7..003781038ab2fa 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4423,18 +4423,17 @@ let Predicates = [HasSVE2, HasF8F16MM] in {
let Predicates = [HasSSVE_FP8DOT2] in {
// FP8 Widening Dot-Product - Indexed Group
-defm FDOT_ZZZI_BtoH : sve2_fp8_dot_indexed_h<"fdot">;
+defm FDOT_ZZZI_BtoH : sve2_fp8_dot_indexed_h<"fdot", int_aarch64_sve_fp8_fdot_lane>;
// FP8 Widening Dot-Product - Group
-// TODO: Replace nxv16i8 by nxv16f8
-defm FDOT_ZZZ_BtoH : sve_fp8_dot<0b0, ZPR16, "fdot">;
+defm FDOT_ZZZ_BtoH : sve_fp8_dot<0b0, ZPR16, "fdot", nxv8f16, int_aarch64_sve_fp8_fdot>;
}
// TODO: Replace nxv16i8 by nxv16f8
let Predicates = [HasSSVE_FP8DOT4] in {
// FP8 Widening Dot-Product - Indexed Group
-defm FDOT_ZZZI_BtoS : sve2_fp8_dot_indexed_s<"fdot">;
+defm FDOT_ZZZI_BtoS : sve2_fp8_dot_indexed_s<"fdot", int_aarch64_sve_fp8_fdot_lane>;
// FP8 Widening Dot-Product - Group
-defm FDOT_ZZZ_BtoS : sve_fp8_dot<0b1, ZPR32, "fdot">;
+defm FDOT_ZZZ_BtoS : sve_fp8_dot<0b1, ZPR32, "fdot", nxv4f32, int_aarch64_sve_fp8_fdot>;
}
let Predicates = [HasSVE2orSME2, HasLUT] in {
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 95cb5ff8c4a03b..78c575b4d135bc 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -9231,10 +9231,16 @@ multiclass sve_float_dot<bit bf, bit o2, ZPRRegOp dst_ty, ZPRRegOp src_ty,
def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, InVT, InVT, !cast<Instruction>(NAME)>;
}
-multiclass sve_fp8_dot<bit bf, ZPRRegOp dst_ty, string asm> {
- def NAME : sve_float_dot<bf, 0b1, dst_ty, ZPR8, asm>{
+multiclass sve_fp8_dot<bit bf, ZPRRegOp dstrc, string asm, ValueType vt,
+ SDPatternOperator op> {
+ def NAME : sve_float_dot<bf, 0b1, dstrc, ZPR8, asm> {
let Uses = [FPMR, FPCR];
+
+ let mayLoad = 1;
+ let mayStore = 0;
}
+
+ def : SVE_3_Op_Pat<vt, op, vt, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
}
class sve_float_dot_indexed<bit bf, ZPRRegOp dst_ty, ZPRRegOp src1_ty,
@@ -10917,24 +10923,37 @@ class sve_fp8_dot_indexed<bits<4> opc, ZPRRegOp dst_ty, Operand iop_ty, string m
let DestructiveInstType = DestructiveOther;
let hasSideEffects = 0;
let mayRaiseFPException = 1;
+
+ let mayLoad = 1;
+ let mayStore = 0;
}
// FP8 Widening Dot-Product - Indexed Group
-multiclass sve2_fp8_dot_indexed_h<string asm>{
- def NAME : sve_fp8_dot_indexed<{0, ?, ?, ?}, ZPR16, VectorIndexH, asm> {
+multiclass sve2_fp8_dot_indexed_h<string asm, SDPatternOperator op> {
+ def NAME : sve_fp8_dot_indexed<{0, ?, ?, ?}, ZPR16, VectorIndexH32b, asm> {
bits<3> iop;
let Inst{20-19} = iop{2-1};
let Inst{11} = iop{0};
+
+ let mayLoad = 1;
+ let mayStore = 0;
}
+
+ def : SVE_4_Op_Pat<nxv8f16, op, nxv8f16, nxv16i8, nxv16i8, i32, !cast<Instruction>(NAME)>;
}
-multiclass sve2_fp8_dot_indexed_s<string asm>{
+multiclass sve2_fp8_dot_indexed_s<string asm, SDPatternOperator op> {
def NAME : sve_fp8_dot_indexed<{1, ?, ?, 0}, ZPR32, VectorIndexS32b, asm> {
bits<2> iop;
let Inst{20-19} = iop{1-0};
+
+ let mayLoad = 1;
+ let mayStore = 0;
}
+
+ def : SVE_4_Op_Pat<nxv4f32, op, nxv4f32, nxv16i8, nxv16i8, i32, !cast<Instruction>(NAME)>;
}
// FP8 Look up table
diff --git a/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll b/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll
new file mode 100644
index 00000000000000..0cead19a74bfd5
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp8-sve-fdot.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve2,+fp8,+fp8dot2,+fp8dot4 < %s | FileCheck %s
+; RUN: llc -mattr=+sme,+fp8,+ssve-fp8dot2,+ssve-fp8dot4 --force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define <vscale x 4 x float> @fdot_4way(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fdot_4way:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdot z0.s, z1.b, z2.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2)
+ ret <vscale x 4 x float> %r
+}
+
+define <vscale x 8 x half> @fdot_2way(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fdot_2way:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdot z0.h, z1.b, z2.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2)
+ ret <vscale x 8 x half> %r
+}
+
+define <vscale x 4 x float> @fdot_4way_lane(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fdot_4way_lane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdot z0.s, z1.b, z2.b[3]
+; CHECK-NEXT: ret
+ %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fdot.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3)
+ ret <vscale x 4 x float> %r
+}
+
+define <vscale x 8 x half> @fdot_2way_lane(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fdot_2way_lane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fdot z0.h, z1.b, z2.b[5]
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fdot.lane.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 5)
+ ret <vscale x 8 x half> %r
+}
>From 9fbb4e43c28a20cddbf5221d629939cc82d1624d Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 26 Nov 2024 18:01:03 +0000
Subject: [PATCH 4/4] FP8 FMLA
---
clang/include/clang/Basic/arm_sve.td | 31 ++
.../fp8-intrinsics/acle_sve2_fp8_fmla.c | 389 ++++++++++++++++++
.../aarch64-sve2-intrinsics/acle_sve2_fp8.c | 53 ++-
llvm/include/llvm/IR/IntrinsicsAArch64.td | 20 +-
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 24 +-
llvm/lib/Target/AArch64/SVEInstrFormats.td | 22 +-
llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll | 114 +++++
7 files changed, 636 insertions(+), 17 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c
create mode 100644 llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 65f98b614ecf0a..1a9089c5466747 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2483,3 +2483,34 @@ let SVETargetGuard = "sve2,fp8dot4", SMETargetGuard ="sme,ssve-fp8dot4" in {
def SVFDOT_LANE_4WAY : SInst<"svdot_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fdot_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_3>]>;
}
+let SVETargetGuard = "sve2,fp8fma", SMETargetGuard = "sme,ssve-fp8fma" in {
+ // 8-bit floating-point multiply-add long to half-precision (bottom)
+ def SVFMLALB : SInst<"svmlalb[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode, SetsFPMR]>;
+ def SVFMLALB_N : SInst<"svmlalb[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fmlalb", [VerifyRuntimeMode, SetsFPMR]>;
+
+ // 8-bit floating-point multiply-add long to ha_fpmlf-precision (bottom, indexed)
+ def SVFMLALB_LANE : SInst<"svmlalb_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fmlalb_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_15>]>;
+
+ // 8-bit floating-point multiply-add long to half-precision (top)
+ def SVFMLALT : SInst<"svmlalt[_f16_mf8]_fpm", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmlalt", [VerifyRuntimeMode, SetsFPMR]>;
+ def SVFMLALT_N : SInst<"svmlalt[_n_f16_mf8]_fpm", "dd~!>", "h", MergeNone, "aarch64_sve_fp8_fmlalt", [VerifyRuntimeMode, SetsFPMR]>;
+
+ // 8-bit floating-point multiply-add long to half-precision (top, indexed)
+ def SVFMLALT_LANE : SInst<"svmlalt_lane[_f16_mf8]_fpm", "dd~~i>", "h", MergeNone, "aarch64_sve_fp8_fmlalt_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_15>]>;
+
+ // 8-bit floating-point multiply-add long long to single-precision (all top/bottom variants)
+ def SVFMLALLBB : SInst<"svmlallbb[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmlallbb", [VerifyRuntimeMode, SetsFPMR]>;
+ def SVFMLALLBB_N : SInst<"svmlallbb[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlallbb", [VerifyRuntimeMode, SetsFPMR]>;
+ def SVFMLALLBT : SInst<"svmlallbt[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmlallbt", [VerifyRuntimeMode, SetsFPMR]>;
+ def SVFMLALLBT_N : SInst<"svmlallbt[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlallbt", [VerifyRuntimeMode, SetsFPMR]>;
+ def SVFMLALLTB : SInst<"svmlalltb[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmlalltb", [VerifyRuntimeMode, SetsFPMR]>;
+ def SVFMLALLTB_N : SInst<"svmlalltb[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlalltb", [VerifyRuntimeMode, SetsFPMR]>;
+ def SVFMLALLTT : SInst<"svmlalltt[_f32_mf8]_fpm", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmlalltt", [VerifyRuntimeMode, SetsFPMR]>;
+ def SVFMLALLTT_N : SInst<"svmlalltt[_n_f32_mf8]_fpm", "dd~!>", "f", MergeNone, "aarch64_sve_fp8_fmlalltt", [VerifyRuntimeMode, SetsFPMR]>;
+
+ // 8-bit floating-point multiply-add long long to single-precision (indexed, all top/bottom variants)
+ def SVFMLALLBB_LANE : SInst<"svmlallbb_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlallbb_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVFMLALLBT_LANE : SInst<"svmlallbt_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlallbt_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVFMLALLTB_LANE : SInst<"svmlalltb_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlalltb_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVFMLALLTT_LANE : SInst<"svmlalltt_lane[_f32_mf8]_fpm", "dd~~i>", "f", MergeNone, "aarch64_sve_fp8_fmlalltt_lane", [VerifyRuntimeMode, SetsFPMR], [ImmCheck<3, ImmCheck0_7>]>;
+}
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c
new file mode 100644
index 00000000000000..425e6a57ffe3ca
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c
@@ -0,0 +1,389 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -target-feature +ssve-fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX
+
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -x c++ -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +fp8 -target-feature +ssve-fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CHECK-CXX
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sve2 -target-feature +fp8 -target-feature +fp8fma -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -target-feature +ssve-fp8fma -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#ifdef __ARM_FEATURE_SME
+#include <arm_sme.h>
+#else
+#include <arm_sve.h>
+#endif
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+#ifdef __ARM_FEATURE_SME
+#define STREAMING __arm_streaming
+#else
+#define STREAMING
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svmlalb_f16_mf8(
+// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z20test_svmlalb_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svmlalb_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlalb,_f16_mf8,_fpm)(zda, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svmlalb_n_f16_mf8(
+// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z22test_svmlalb_n_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tu6__mfp8m(
+// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP1]]
+//
+svfloat16_t test_svmlalb_n_f16_mf8(svfloat16_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlalb,_n_f16_mf8,_fpm)(zda, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svmlalt_f16_mf8(
+// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z20test_svmlalt_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svmlalt_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlalt,_f16_mf8,_fpm)(zda, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svmlalt_n_f16_mf8(
+// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP1]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z22test_svmlalt_n_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tu6__mfp8m(
+// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP1]]
+//
+svfloat16_t test_svmlalt_n_f16_mf8(svfloat16_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlalt,_n_f16_mf8,_fpm)(zda, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svmlalb_lane_f16_mf8(
+// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.lane.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z25test_svmlalb_lane_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.lane.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svmlalb_lane_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlalb_lane,_f16_mf8,_fpm)(zda, zn, zm, 7, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svmlalt_lane_f16_mf8(
+// CHECK-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.lane.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 8 x half> @_Z25test_svmlalt_lane_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 8 x half> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.lane.nxv8f16(<vscale x 8 x half> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-CXX-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_svmlalt_lane_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlalt_lane,_f16_mf8,_fpm)(zda, zn, zm, 7, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlallbb_f32_mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z22test_svmlallbb_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_svmlallbb_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlallbb,_f32_mf8,_fpm)(zda, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlallbb_n_f32_mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z24test_svmlallbb_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+svfloat32_t test_svmlallbb_n_f32_mf8(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlallbb,_n_f32_mf8,_fpm)(zda, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlallbt_f32_mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZM]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z22test_svmlallbt_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZM]], <vscale x 16 x i8> [[ZM]])
+// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_svmlallbt_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlallbt,_f32_mf8,_fpm)(zda, zm, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlallbt_n_f32_mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z24test_svmlallbt_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+svfloat32_t test_svmlallbt_n_f32_mf8(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlallbt,_n_f32_mf8,_fpm)(zda, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlalltb_f32_mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z22test_svmlalltb_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_svmlalltb_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlalltb,_f32_mf8,_fpm)(zda, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlalltb_n_f32_mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z24test_svmlalltb_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+svfloat32_t test_svmlalltb_n_f32_mf8(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlalltb,_n_f32_mf8,_fpm)(zda, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlalltt_f32_mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z22test_svmlalltt_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_svmlalltt_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlalltt,_f32_mf8,_fpm)(zda, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlalltt_n_f32_mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z24test_svmlalltt_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[TMP0]], i64 0
+// CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[DOTSPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[DOTSPLAT]])
+// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP1]]
+//
+svfloat32_t test_svmlalltt_n_f32_mf8(svfloat32_t zda, svmfloat8_t zn, mfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlalltt,_n_f32_mf8,_fpm)(zda, zn, zm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlallbb_lane_f32_mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z27test_svmlallbb_lane_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_svmlallbb_lane_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlallbb_lane,_f32_mf8,_fpm)(zda, zn, zm, 7, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlallbt_lane_f32_mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z27test_svmlallbt_lane_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_svmlallbt_lane_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlallbt_lane,_f32_mf8,_fpm)(zda, zn, zm, 7, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlalltb_lane_f32_mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z27test_svmlalltb_lane_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_svmlalltb_lane_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlalltb_lane,_f32_mf8,_fpm)(zda, zn, zm, 7, fpm);
+}
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svmlalltt_lane_f32_mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <vscale x 4 x float> @_Z27test_svmlalltt_lane_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
+// CHECK-CXX-SAME: <vscale x 4 x float> [[ZDA:%.*]], <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.lane.nxv4f32(<vscale x 4 x float> [[ZDA]], <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]], i32 7)
+// CHECK-CXX-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_svmlalltt_lane_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) STREAMING {
+ return SVE_ACLE_FUNC(svmlalltt_lane,_f32_mf8,_fpm)(zda, zn, zm, 7, fpm);
+}
diff --git a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c
index 033b1caca3998e..2f54297c66499d 100644
--- a/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c
+++ b/clang/test/Sema/aarch64-sve2-intrinsics/acle_sve2_fp8.c
@@ -43,12 +43,61 @@ void test_features(svmfloat8_t zn, svmfloat8_t zm, mfloat8_t x, fpm_t fpm) {
// expected-error at -1 {{'svdot_lane_f32_mf8_fpm' needs target feature (sve,sve2,fp8dot4)|(sme,ssve-fp8dot4)}}
svdot_lane_f16_mf8_fpm(svundef_f16(), zn, zm, 7, fpm);
// expected-error at -1 {{'svdot_lane_f16_mf8_fpm' needs target feature (sve,sve2,fp8dot2)|(sme,ssve-fp8dot2)}}
-}
+ svmlalb_f16_mf8_fpm(svundef_f16(), zn, zm, fpm);
+ // expected-error at -1 {{'svmlalb_f16_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlalb_n_f16_mf8_fpm(svundef_f16(), zn, x, fpm);
+ // expected-error at -1 {{'svmlalb_n_f16_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlalt_f16_mf8_fpm(svundef_f16(), zn, zm, fpm);
+ // expected-error at -1 {{'svmlalt_f16_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlalt_n_f16_mf8_fpm(svundef_f16(), zn, x, fpm);
+ // expected-error at -1 {{'svmlalt_n_f16_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlalb_lane_f16_mf8_fpm(svundef_f16(), zn, zm, 7, fpm);
+ // expected-error at -1 {{'svmlalb_lane_f16_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlalt_lane_f16_mf8_fpm(svundef_f16(), zn, zm, 7, fpm);
+ // expected-error at -1 {{'svmlalt_lane_f16_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlallbb_f32_mf8_fpm(svundef_f32(), zn, zm, fpm);
+ // expected-error at -1 {{'svmlallbb_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlallbb_n_f32_mf8_fpm(svundef_f32(), zn, x, fpm);
+ // expected-error at -1 {{'svmlallbb_n_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlallbt_f32_mf8_fpm(svundef_f32(), zn, zm, fpm);
+ // expected-error at -1 {{'svmlallbt_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlallbt_n_f32_mf8_fpm(svundef_f32(), zn, x, fpm);
+ // expected-error at -1 {{'svmlallbt_n_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlalltb_f32_mf8_fpm(svundef_f32(), zn, zm, fpm);
+ // expected-error at -1 {{'svmlalltb_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlalltb_n_f32_mf8_fpm(svundef_f32(), zn, x, fpm);
+ // expected-error at -1 {{'svmlalltb_n_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlalltt_f32_mf8_fpm(svundef_f32(), zn, zm, fpm);
+ // expected-error at -1 {{'svmlalltt_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlalltt_n_f32_mf8_fpm(svundef_f32(), zn, x, fpm);
+ // expected-error at -1 {{'svmlalltt_n_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlallbb_lane_f32_mf8_fpm(svundef_f32(), zn, zm, 7, fpm);
+ // expected-error at -1 {{'svmlallbb_lane_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlallbt_lane_f32_mf8_fpm(svundef_f32(), zn, zm, 7, fpm);
+ // expected-error at -1 {{'svmlallbt_lane_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlalltb_lane_f32_mf8_fpm(svundef_f32(), zn, zm, 7, fpm);
+ // expected-error at -1 {{'svmlalltb_lane_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+ svmlalltt_lane_f32_mf8_fpm(svundef_f32(), zn, zm, 7, fpm);
+ // expected-error at -1 {{'svmlalltt_lane_f32_mf8_fpm' needs target feature (sve,sve2,fp8fma)|(sme,ssve-fp8fma)}}
+}
void test_imm_range(svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm) {
svdot_lane_f32_mf8_fpm(svundef_f32(), zn, zm, -1, fpm);
// expected-error at -1 {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
svdot_lane_f16_mf8_fpm(svundef_f16(), zn, zm, -1, fpm);
// expected-error at -1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
-}
\ No newline at end of file
+
+ svmlalb_lane_f16_mf8_fpm(svundef_f16(), zn, zm, -1, fpm);
+ // expected-error at -1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
+ svmlalt_lane_f16_mf8_fpm(svundef_f16(), zn, zm, -1, fpm);
+ // expected-error at -1 {{argument value 18446744073709551615 is outside the valid range [0, 15]}}
+ svmlallbb_lane_f32_mf8_fpm(svundef_f32(), zn, zm, -1, fpm);
+ // expected-error at -1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmlallbt_lane_f32_mf8_fpm(svundef_f32(), zn, zm, -1, fpm);
+ // expected-error at -1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmlalltb_lane_f32_mf8_fpm(svundef_f32(), zn, zm, -1, fpm);
+ // expected-error at -1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+ svmlalltt_lane_f32_mf8_fpm(svundef_f32(), zn, zm, -1, fpm);
+ // expected-error at -1 {{argument value 18446744073709551615 is outside the valid range [0, 7]}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 65cac7de8e4d55..482f3025ced9b8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3894,7 +3894,6 @@ def int_aarch64_sve_fp8_cvtnt
[llvm_nxv16i8_ty, llvm_anyvector_ty, LLVMMatchType<0>],
[IntrReadMem, IntrInaccessibleMemOnly]>;
-
// Dot product
class SVE2_FP8_FMLA_FDOT
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
@@ -3910,4 +3909,23 @@ class SVE2_FP8_FMLA_FDOT_Lane
def int_aarch64_sve_fp8_fdot : SVE2_FP8_FMLA_FDOT;
def int_aarch64_sve_fp8_fdot_lane : SVE2_FP8_FMLA_FDOT_Lane;
+
+// Fused multiply-add
+def int_aarch64_sve_fp8_fmlalb : SVE2_FP8_FMLA_FDOT;
+def int_aarch64_sve_fp8_fmlalb_lane : SVE2_FP8_FMLA_FDOT_Lane;
+
+def int_aarch64_sve_fp8_fmlalt : SVE2_FP8_FMLA_FDOT;
+def int_aarch64_sve_fp8_fmlalt_lane : SVE2_FP8_FMLA_FDOT_Lane;
+
+def int_aarch64_sve_fp8_fmlallbb : SVE2_FP8_FMLA_FDOT;
+def int_aarch64_sve_fp8_fmlallbb_lane : SVE2_FP8_FMLA_FDOT_Lane;
+
+def int_aarch64_sve_fp8_fmlallbt : SVE2_FP8_FMLA_FDOT;
+def int_aarch64_sve_fp8_fmlallbt_lane : SVE2_FP8_FMLA_FDOT_Lane;
+
+def int_aarch64_sve_fp8_fmlalltb : SVE2_FP8_FMLA_FDOT;
+def int_aarch64_sve_fp8_fmlalltb_lane : SVE2_FP8_FMLA_FDOT_Lane;
+
+def int_aarch64_sve_fp8_fmlalltt : SVE2_FP8_FMLA_FDOT;
+def int_aarch64_sve_fp8_fmlalltt_lane : SVE2_FP8_FMLA_FDOT_Lane;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 003781038ab2fa..ec322bf2e56966 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -4396,21 +4396,21 @@ defm FAMIN_ZPZZ : sve_fp_bin_pred_hfd<AArch64famin_p>;
let Predicates = [HasSSVE_FP8FMA] in {
// FP8 Widening Multiply-Add Long - Indexed Group
-def FMLALB_ZZZI : sve2_fp8_mla_long_by_indexed_elem<0b0, "fmlalb">;
-def FMLALT_ZZZI : sve2_fp8_mla_long_by_indexed_elem<0b1, "fmlalt">;
+defm FMLALB_ZZZI : sve2_fp8_mla_long_by_indexed_elem<0b0, "fmlalb", int_aarch64_sve_fp8_fmlalb_lane>;
+defm FMLALT_ZZZI : sve2_fp8_mla_long_by_indexed_elem<0b1, "fmlalt", int_aarch64_sve_fp8_fmlalt_lane>;
// FP8 Widening Multiply-Add Long Group
-def FMLALB_ZZZ : sve2_fp8_mla<0b100, ZPR16, "fmlalb">;
-def FMLALT_ZZZ : sve2_fp8_mla<0b101, ZPR16, "fmlalt">;
+defm FMLALB_ZZZ : sve2_fp8_mla<0b100, ZPR16, "fmlalb", nxv8f16, int_aarch64_sve_fp8_fmlalb>;
+defm FMLALT_ZZZ : sve2_fp8_mla<0b101, ZPR16, "fmlalt", nxv8f16, int_aarch64_sve_fp8_fmlalt>;
// FP8 Widening Multiply-Add Long Long - Indexed Group
-def FMLALLBB_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b00, "fmlallbb">;
-def FMLALLBT_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b01, "fmlallbt">;
-def FMLALLTB_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b10, "fmlalltb">;
-def FMLALLTT_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b11, "fmlalltt">;
+defm FMLALLBB_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b00, "fmlallbb", int_aarch64_sve_fp8_fmlallbb_lane>;
+defm FMLALLBT_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b01, "fmlallbt", int_aarch64_sve_fp8_fmlallbt_lane>;
+defm FMLALLTB_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b10, "fmlalltb", int_aarch64_sve_fp8_fmlalltb_lane>;
+defm FMLALLTT_ZZZI : sve2_fp8_mla_long_long_by_indexed_elem<0b11, "fmlalltt", int_aarch64_sve_fp8_fmlalltt_lane>;
// FP8 Widening Multiply-Add Long Long Group
-def FMLALLBB_ZZZ : sve2_fp8_mla<0b000, ZPR32, "fmlallbb">;
-def FMLALLBT_ZZZ : sve2_fp8_mla<0b001, ZPR32, "fmlallbt">;
-def FMLALLTB_ZZZ : sve2_fp8_mla<0b010, ZPR32, "fmlalltb">;
-def FMLALLTT_ZZZ : sve2_fp8_mla<0b011, ZPR32, "fmlalltt">;
+defm FMLALLBB_ZZZ : sve2_fp8_mla<0b000, ZPR32, "fmlallbb", nxv4f32, int_aarch64_sve_fp8_fmlallbb>;
+defm FMLALLBT_ZZZ : sve2_fp8_mla<0b001, ZPR32, "fmlallbt", nxv4f32, int_aarch64_sve_fp8_fmlallbt>;
+defm FMLALLTB_ZZZ : sve2_fp8_mla<0b010, ZPR32, "fmlalltb", nxv4f32, int_aarch64_sve_fp8_fmlalltb>;
+defm FMLALLTT_ZZZ : sve2_fp8_mla<0b011, ZPR32, "fmlalltt", nxv4f32, int_aarch64_sve_fp8_fmlalltt>;
} // End HasSSVE_FP8FMA
let Predicates = [HasSVE2, HasF8F32MM] in {
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 78c575b4d135bc..647a6c7088b0fd 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -10808,7 +10808,7 @@ multiclass sve2_fp8_down_cvt_single_top<bits<2> opc, string mnemonic, RegisterOp
// FP8 Widening Multiply-Add Long - Indexed Group
class sve2_fp8_mla_long_by_indexed_elem<bit T, string mnemonic>
: I<(outs ZPR16:$Zda),
- (ins ZPR16:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexB:$imm4),
+ (ins ZPR16:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexB32b:$imm4),
mnemonic, "\t$Zda, $Zn, $Zm$imm4",
"", []>, Sched<[]>{
bits<5> Zda;
@@ -10830,6 +10830,12 @@ class sve2_fp8_mla_long_by_indexed_elem<bit T, string mnemonic>
let Uses = [FPMR, FPCR];
}
+multiclass sve2_fp8_mla_long_by_indexed_elem<bit T, string mnemonic, SDPatternOperator op> {
+ def NAME : sve2_fp8_mla_long_by_indexed_elem<T, mnemonic>;
+
+ def : SVE_4_Op_Pat<nxv8f16, op, nxv8f16, nxv16i8, nxv16i8, i32, !cast<Instruction>(NAME)>;
+}
+
// FP8 Widening Multiply-Add (Long)/(Long Long) Group
class sve2_fp8_mla<bits<3>opc, ZPRRegOp dst_ty, string mnemonic>
: I<(outs dst_ty:$Zda),
@@ -10854,10 +10860,16 @@ class sve2_fp8_mla<bits<3>opc, ZPRRegOp dst_ty, string mnemonic>
let Uses = [FPMR, FPCR];
}
+multiclass sve2_fp8_mla<bits<3> opc, ZPRRegOp dst_ty, string mnemonic, ValueType vta, SDPatternOperator op> {
+ def NAME : sve2_fp8_mla<opc, dst_ty, mnemonic>;
+
+ def : SVE_3_Op_Pat<vta, op, vta, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
+}
+
// FP8 Widening Multiply-Add Long Long - Indexed Group
class sve2_fp8_mla_long_long_by_indexed_elem<bits<2> TT, string mnemonic>
: I<(outs ZPR32:$Zda),
- (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexB:$imm4),
+ (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexB32b:$imm4),
mnemonic, "\t$Zda, $Zn, $Zm$imm4",
"", []>, Sched<[]>{
bits<5> Zda;
@@ -10879,6 +10891,12 @@ class sve2_fp8_mla_long_long_by_indexed_elem<bits<2> TT, string mnemonic>
let Uses = [FPMR, FPCR];
}
+multiclass sve2_fp8_mla_long_long_by_indexed_elem<bits<2> TT, string mnemonic, SDPatternOperator op> {
+ def NAME : sve2_fp8_mla_long_long_by_indexed_elem<TT, mnemonic>;
+
+ def : SVE_4_Op_Pat<nxv4f32, op, nxv4f32, nxv16i8, nxv16i8, i32, !cast<Instruction>(NAME)>;
+}
+
// FP8 Matrix Multiply-accumulate Group
class sve2_fp8_mmla<bit opc, ZPRRegOp dst_ty, string mnemonic>
: I<(outs dst_ty:$Zda),
diff --git a/llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll b/llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll
new file mode 100644
index 00000000000000..b9ec7086d7f08d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp8-sve-fmla.ll
@@ -0,0 +1,114 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve2,+fp8,+fp8fma < %s | FileCheck %s
+; RUN: llc -mattr=+sme,+fp8,+ssve-fp8fma --force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define <vscale x 8 x half> @fmla_2way_bot(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fmla_2way_bot:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmlalb z0.h, z1.b, z2.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2)
+ ret <vscale x 8 x half> %r
+}
+
+define <vscale x 8 x half> @fmla_2way_top(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fmla_2way_top:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmlalt z0.h, z1.b, z2.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2)
+ ret <vscale x 8 x half> %r
+}
+
+define <vscale x 8 x half> @fdot_2way_bot_lane(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fdot_2way_bot_lane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmlalb z0.h, z1.b, z2.b[3]
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalb.lane.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3)
+ ret <vscale x 8 x half> %r
+}
+
+define <vscale x 8 x half> @fdot_2way_top_lane(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fdot_2way_top_lane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmlalt z0.h, z1.b, z2.b[3]
+; CHECK-NEXT: ret
+ %r = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmlalt.lane.nxv8f16(<vscale x 8 x half> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3)
+ ret <vscale x 8 x half> %r
+}
+
+define <vscale x 4 x float> @fmla_4way_bb(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fmla_4way_bb:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmlallbb z0.s, z1.b, z2.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2)
+ ret <vscale x 4 x float> %r
+}
+
+define <vscale x 4 x float> @fmla_4way_bt(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fmla_4way_bt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmlallbt z0.s, z1.b, z2.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2)
+ ret <vscale x 4 x float> %r
+}
+
+define <vscale x 4 x float> @fmla_4way_tb(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fmla_4way_tb:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmlalltb z0.s, z1.b, z2.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2)
+ ret <vscale x 4 x float> %r
+}
+
+define <vscale x 4 x float> @fmla_4way_tt(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fmla_4way_tt:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmlalltt z0.s, z1.b, z2.b
+; CHECK-NEXT: ret
+ %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2)
+ ret <vscale x 4 x float> %r
+}
+
+define <vscale x 4 x float> @fmla_4way_bb_lane(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fmla_4way_bb_lane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmlallbb z0.s, z1.b, z2.b[3]
+; CHECK-NEXT: ret
+ %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3)
+ ret <vscale x 4 x float> %r
+}
+
+define <vscale x 4 x float> @fmla_4way_bt_lane(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fmla_4way_bt_lane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmlallbt z0.s, z1.b, z2.b[3]
+; CHECK-NEXT: ret
+ %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlallbt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3)
+ ret <vscale x 4 x float> %r
+}
+
+define <vscale x 4 x float> @fmla_4way_tb_lane(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fmla_4way_tb_lane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmlalltb z0.s, z1.b, z2.b[3]
+; CHECK-NEXT: ret
+ %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltb.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3)
+ ret <vscale x 4 x float> %r
+}
+
+define <vscale x 4 x float> @fmla_4way_tt_lane(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2) {
+; CHECK-LABEL: fmla_4way_tt_lane:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fmlalltt z0.s, z1.b, z2.b[3]
+; CHECK-NEXT: ret
+ %r = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmlalltt.lane.nxv4f32(<vscale x 4 x float> %a, <vscale x 16 x i8> %s1, <vscale x 16 x i8> %s2, i32 3)
+ ret <vscale x 4 x float> %r
+}
+
More information about the cfe-commits
mailing list