[clang] [llvm] [Clang][llvm] Implement fp8 FMOP4A intrinsics (PR #130127)
Virginia Cangelosi via llvm-commits
llvm-commits at lists.llvm.org
Mon Apr 14 05:21:45 PDT 2025
https://github.com/virginia-cangelosi updated https://github.com/llvm/llvm-project/pull/130127
>From 3b94dd78f741574cef32798bfb863f39bc190eec Mon Sep 17 00:00:00 2001
From: Virginia Cangelosi <virginia.cangelosi at arm.com>
Date: Thu, 6 Mar 2025 13:38:19 +0000
Subject: [PATCH 1/3] [Clang][llvm] Implement fp8 FMOP4A intrinsics
---
clang/include/clang/Basic/arm_sme.td | 18 ++
.../sme2-intrinsics/acle_sme2_mop4_fp8.c | 160 ++++++++++++++++++
.../acle_sme2p2_fp8_imm.cpp | 31 ++++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 34 ++++
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 8 +-
llvm/lib/Target/AArch64/SMEInstrFormats.td | 55 ++++--
.../AArch64/sme2-intrinsics-mop4-fp8.ll | 98 +++++++++++
7 files changed, 389 insertions(+), 15 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_fp8.c
create mode 100644 clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_fp8_imm.cpp
create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4-fp8.ll
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 1bfcf4c31d552..62b95e19bf78a 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -321,6 +321,24 @@ let SMETargetGuard = "sme2,sme-mop4,sme-b16b16" in {
defm SVBMOP4S_H : MOP4<"s", "_za16", "b", "aarch64_sme_mop4s", [ImmCheck<0, ImmCheck0_1>]>;
}
+////////////////////////////////////////////////////////////////////////////////
+// SME2 - FP8 FMOP4A, FMOP4S
+
+multiclass MOP4_FP8<string za, string t, list<ImmCheck> checks> {
+ def _1x1 : Inst<"svmop4a" # "[_1x1]" # za # "[_{d}_{d}]", "vidd>", t, MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_1x1", [IsInOutZA, IsStreaming, IsOverloadNone], checks>;
+ def _1x2 : Inst<"svmop4a" # "[_1x2]" # za # "[_{d}_{d}]", "vid2>", t, MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_1x2", [IsInOutZA, IsStreaming, IsOverloadNone], checks>;
+ def _2x1 : Inst<"svmop4a" # "[_2x1]" # za # "[_{d}_{d}]", "vi2d>", t, MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_2x1", [IsInOutZA, IsStreaming, IsOverloadNone], checks>;
+ def _2x2 : Inst<"svmop4a" # "[_2x2]" # za # "[_{d}_{d}]", "vi22>", t, MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_2x2", [IsInOutZA, IsStreaming, IsOverloadNone], checks>;
+}
+
+let SMETargetGuard = "sme2,sme-mop4,sme-f8f32" in {
+ defm SVMOP4A_FP8_ZA32 : MOP4_FP8<"_za32", "m", [ImmCheck<0, ImmCheck0_3>]>;
+}
+
+let SMETargetGuard = "sme2,sme-mop4,sme-f8f16" in {
+ defm SVMOP4A_FP8_ZA16 : MOP4_FP8<"_za16", "m", [ImmCheck<0, ImmCheck0_1>]>;
+}
+
////////////////////////////////////////////////////////////////////////////////
// SME2 - SMOP4A, SMOP4S, UMOP4A, UMOP4S
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_fp8.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_fp8.c
new file mode 100644
index 0000000000000..24fa11538dd32
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_fp8.c
@@ -0,0 +1,160 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+
+#include <arm_sme.h>
+
+#ifdef SME_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: define dso_local void @test_svmop4a_1x1_za16_mf8_mf8_fpm(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.1x1(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_1x1_za16_mf8_mf8_fpmu13__SVMfloat8_tS_m(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.1x1(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za16_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmop4a_1x2_za16_mf8_mf8_fpm(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.1x2(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_1x2_za16_mf8_mf8_fpmu13__SVMfloat8_t13svmfloat8x2_tm(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.1x2(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za16_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmop4a_2x1_za16_mf8_mf8_fpm(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.2x1(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_2x1_za16_mf8_mf8_fpm13svmfloat8x2_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.2x1(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za16_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmop4a_2x2_za16_mf8_mf8_fpm(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.2x2(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_2x2_za16_mf8_mf8_fpm13svmfloat8x2_tS_m(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za16.2x2(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_2x2_za16_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x2_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmop4a_1x1_za32_mf8_mf8_fpm(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.1x1(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_1x1_za32_mf8_mf8_fpmu13__SVMfloat8_tS_m(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.1x1(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x1_za32_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x1_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmop4a_1x2_za32_mf8_mf8_fpm(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.1x2(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_1x2_za32_mf8_mf8_fpmu13__SVMfloat8_t13svmfloat8x2_tm(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.1x2(i32 1, <vscale x 16 x i8> [[ZN]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_1x2_za32_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_1x2_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmop4a_2x1_za32_mf8_mf8_fpm(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.2x1(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_2x1_za32_mf8_mf8_fpm13svmfloat8x2_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.2x1(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_2x1_za32_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x1_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svmop4a_2x2_za32_mf8_mf8_fpm(
+// CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.2x2(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z33test_svmop4a_2x2_za32_mf8_mf8_fpm13svmfloat8x2_tS_m(
+// CPP-CHECK-SAME: <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fmop4a.za32.2x2(i32 1, <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
+// CPP-CHECK-NEXT: ret void
+//
+void test_svmop4a_2x2_za32_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ SME_ACLE_FUNC(svmop4a_2x2_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+}
diff --git a/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_fp8_imm.cpp b/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_fp8_imm.cpp
new file mode 100644
index 0000000000000..e031cfe9b3cb4
--- /dev/null
+++ b/clang/test/Sema/aarch64-sme2p2-instrinsics/acle_sme2p2_fp8_imm.cpp
@@ -0,0 +1,31 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu \
+// RUN: -target-feature +sme -target-feature +sme2p2 -target-feature +sme-mop4 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -fsyntax-only -verify %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sme.h>
+
+void tests_mop4_imm_1x1(svmfloat8_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ svmop4a_1x1_za16_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+ svmop4a_1x1_za32_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ return;
+}
+
+void tests_mop4_imm_1x2(svmfloat8_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ svmop4a_1x2_za16_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+ svmop4a_1x2_za32_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ return;
+}
+
+void tests_mop4_imm_2x1(svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ svmop4a_2x1_za16_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+ svmop4a_2x1_za32_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ return;
+}
+
+void tests_mop4_imm_2x2(svmfloat8x2_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
+ svmop4a_2x2_za16_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 1]}}
+ svmop4a_2x2_za32_mf8_mf8_fpm(-1, zn, zm, fpmr); // expected-error {{argument value 18446744073709551615 is outside the valid range [0, 3]}}
+ return;
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 77ea0bcaa4b5f..7989b5c52b27b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3107,6 +3107,39 @@ let TargetPrefix = "aarch64" in {
}
}
+ class SME_FP8_OuterProduct_Intrinsic_Single_Single
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_nxv16i8_ty,
+ llvm_nxv16i8_ty],
+ [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+
+ class SME_FP8_OuterProduct_Intrinsic_Single_Multi
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_nxv16i8_ty,
+ llvm_nxv16i8_ty,
+ llvm_nxv16i8_ty],
+ [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+
+ class SME_FP8_OuterProduct_Intrinsic_Multi_Multi
+ : DefaultAttrsIntrinsic<[],
+ [llvm_i32_ty,
+ llvm_nxv16i8_ty,
+ llvm_nxv16i8_ty,
+ llvm_nxv16i8_ty,
+ llvm_nxv16i8_ty],
+ [ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+
+ def int_aarch64_sme_fp8_fmop4a_za16_1x1 : SME_FP8_OuterProduct_Intrinsic_Single_Single;
+ def int_aarch64_sme_fp8_fmop4a_za32_1x1 : SME_FP8_OuterProduct_Intrinsic_Single_Single;
+ def int_aarch64_sme_fp8_fmop4a_za16_1x2 : SME_FP8_OuterProduct_Intrinsic_Single_Multi;
+ def int_aarch64_sme_fp8_fmop4a_za32_1x2 : SME_FP8_OuterProduct_Intrinsic_Single_Multi;
+ def int_aarch64_sme_fp8_fmop4a_za16_2x1 : SME_FP8_OuterProduct_Intrinsic_Single_Multi;
+ def int_aarch64_sme_fp8_fmop4a_za32_2x1 : SME_FP8_OuterProduct_Intrinsic_Single_Multi;
+ def int_aarch64_sme_fp8_fmop4a_za16_2x2 : SME_FP8_OuterProduct_Intrinsic_Multi_Multi;
+ def int_aarch64_sme_fp8_fmop4a_za32_2x2 : SME_FP8_OuterProduct_Intrinsic_Multi_Multi;
+
class SME_AddVectorToTile_Intrinsic
: DefaultAttrsIntrinsic<[],
[llvm_i32_ty,
@@ -4096,6 +4129,7 @@ let TargetPrefix = "aarch64" in {
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
[IntrInaccessibleMemOnly, IntrHasSideEffects]>;
+
//
// CVT from FP8 to half-precision/BFloat16 multi-vector
//
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index f992f73171e0e..af93cdb14a620 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -1090,8 +1090,8 @@ let Predicates = [HasSME_TMOP, HasSMEF8F16], Uses = [FPMR, FPCR] in {
def FTMOPA_M2ZZZI_BtoH : sme_tmopa_16b<0b01001, ZZ_b_mul_r, ZPR8, "ftmopa">;
}
-let Predicates = [HasSME_MOP4, HasSMEF8F16], Uses = [FPMR, FPCR] in {
- defm FMOP4A : sme2_fmop4a_fp8_fp16_2way<"fmop4a">;
+let Predicates = [HasSME_MOP4, HasSMEF8F16] in {
+ defm FMOP4A : sme2_fmop4a_fp8_fp16_2way<"fmop4a", "int_aarch64_sme_fp8_fmop4a_za16">;
}
let Predicates = [HasSME_TMOP, HasSMEF16F16] in {
@@ -1108,10 +1108,8 @@ let Predicates = [HasSME2, HasSVEBFSCALE] in {
defm BFMUL : sme2_bfmul_multi<"bfmul">;
}
-let Uses = [FPMR, FPCR] in {
let Predicates = [HasSME_MOP4, HasSMEF8F32] in {
- defm FMOP4A : sme2_fmop4a_fp8_fp32_4way<"fmop4a">;
-}
+ defm FMOP4A : sme2_fmop4a_fp8_fp32_4way<"fmop4a", "int_aarch64_sme_fp8_fmop4a_za32">;
}
let Predicates = [HasSME_MOP4, HasSMEB16B16] in {
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index c008cda21cf05..e5b896ede84c1 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -5774,20 +5774,37 @@ class sme2_fp8_fp32_quarter_tile_outer_product<bit M, bit N, string mnemonic, Re
let Inst{1-0} = ZAda;
let Constraints = "$ZAda = $_ZAda";
+ let Uses = [FPMR, FPCR];
}
-multiclass sme2_fmop4a_fp8_fp32_4way<string mnemonic> {
+multiclass sme2_fmop4a_fp8_fp32_4way<string mnemonic, string op> {
// Single vectors
- def _MZZ_BtoS : sme2_fp8_fp32_quarter_tile_outer_product<0, 0, mnemonic, ZPR8Mul2_Lo, ZPR8Mul2_Hi>;
+ def _MZZ_BtoS : sme2_fp8_fp32_quarter_tile_outer_product<0, 0, mnemonic, ZPR8Mul2_Lo, ZPR8Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_BtoS, 1>;
+
+ def NAME # _MZZ_BtoS_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR8Mul2_Lo, ZPR8Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_BtoS, 0>;
+
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_BtoS, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv16i8>;
// Multiple and single vectors
- def _M2ZZ_BtoS : sme2_fp8_fp32_quarter_tile_outer_product<0, 1, mnemonic, ZZ_b_mul_r_Lo, ZPR8Mul2_Hi>;
+ def _M2ZZ_BtoS : sme2_fp8_fp32_quarter_tile_outer_product<0, 1, mnemonic, ZZ_b_mul_r_Lo, ZPR8Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_BtoS, 1>;
+
+ def NAME # _M2ZZ_BtoS_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _M2ZZ_BtoS, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_BtoS, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_3, nxv16i8>;
// Single and multiple vectors
- def _MZ2Z_BtoS : sme2_fp8_fp32_quarter_tile_outer_product<1, 0, mnemonic, ZPR8Mul2_Lo, ZZ_b_mul_r_Hi>;
+ def _MZ2Z_BtoS : sme2_fp8_fp32_quarter_tile_outer_product<1, 0, mnemonic, ZPR8Mul2_Lo, ZZ_b_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_BtoS, 1>;
+
+ def NAME # _MZ2Z_BtoS_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR8Mul2_Lo, ZZ_b_mul_r_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZ2Z_BtoS, 0>;
+
+ def : SME2_ZA_Tile_Vec_Single_Multi_Pat<NAME # _MZ2Z_BtoS, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv16i8>;
// Multiple vectors
- def _M2Z2Z_BtoS : sme2_fp8_fp32_quarter_tile_outer_product<1, 1, mnemonic, ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi>;
+ def _M2Z2Z_BtoS : sme2_fp8_fp32_quarter_tile_outer_product<1, 1, mnemonic, ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi>, SMEPseudo2Instr<NAME # _M2Z2Z_BtoS, 1>;
+
+ def NAME # _M2Z2Z_BtoS_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _M2Z2Z_BtoS, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Multi_Pat<NAME # _M2Z2Z_BtoS, !cast<SDPatternOperator>(op # "_2x2"), timm32_0_3, nxv16i8>;
}
class sme2_bf16_fp16_quarter_tile_outer_product<bit M, bit N, bit S, string mnemonic, RegisterOperand zn_ty, RegisterOperand zm_ty>
@@ -6021,20 +6038,38 @@ class sme2_fp8_fp16_quarter_tile_outer_product<bit M, bit N, string mnemonic, Re
let Inst{0} = ZAda;
let Constraints = "$ZAda = $_ZAda";
+ let Uses = [FPMR, FPCR];
}
-multiclass sme2_fmop4a_fp8_fp16_2way<string mnemonic> {
+multiclass sme2_fmop4a_fp8_fp16_2way<string mnemonic, string op> {
+
// Single vectors
- def _MZZ_BtoH : sme2_fp8_fp16_quarter_tile_outer_product<0b0, 0b0, mnemonic, ZPR8Mul2_Lo, ZPR8Mul2_Hi>;
+ def _MZZ_BtoH : sme2_fp8_fp16_quarter_tile_outer_product<0b0, 0b0, mnemonic, ZPR8Mul2_Lo, ZPR8Mul2_Hi>, SMEPseudo2Instr<NAME # _MZZ_BtoH, 1>;
+
+ def NAME # _MZZ_BtoH_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR8Mul2_Lo, ZPR8Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZZ_BtoH, 0>;
+
+ def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_BtoH, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_1, nxv16i8>;
// Multiple and single vectors
- def _M2ZZ_BtoH : sme2_fp8_fp16_quarter_tile_outer_product<0b0, 0b1, mnemonic, ZZ_b_mul_r_Lo, ZPR8Mul2_Hi>;
+ def _M2ZZ_BtoH : sme2_fp8_fp16_quarter_tile_outer_product<0b0, 0b1, mnemonic, ZZ_b_mul_r_Lo, ZPR8Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_BtoH, 1>;
+
+ def NAME # _M2ZZ_BtoH_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZZ_b_mul_r_Lo, ZPR8Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _M2ZZ_BtoH, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Single_Pat<NAME # _M2ZZ_BtoH, !cast<SDPatternOperator>(op # "_2x1"), timm32_0_1, nxv16i8>;
// Single and multiple vectors
- def _MZ2Z_BtoH : sme2_fp8_fp16_quarter_tile_outer_product<0b1, 0b0, mnemonic, ZPR8Mul2_Lo, ZZ_b_mul_r_Hi>;
+ def _MZ2Z_BtoH : sme2_fp8_fp16_quarter_tile_outer_product<0b1, 0b0, mnemonic, ZPR8Mul2_Lo, ZZ_b_mul_r_Hi>, SMEPseudo2Instr<NAME # _MZ2Z_BtoH, 1>;
+
+ def NAME # _MZ2Z_BtoH_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR8Mul2_Lo, ZZ_b_mul_r_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZ2Z_BtoH, 0>;
+
+ def : SME2_ZA_Tile_Vec_Single_Multi_Pat<NAME # _MZ2Z_BtoH, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_1, nxv16i8>;
// Multiple vectors
- def _M2Z2Z_BtoH : sme2_fp8_fp16_quarter_tile_outer_product<0b1, 0b1, mnemonic, ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi>;
+ def _M2Z2Z_BtoH : sme2_fp8_fp16_quarter_tile_outer_product<0b1, 0b1, mnemonic, ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi>, SMEPseudo2Instr<NAME # _M2Z2Z_BtoH, 1>;
+
+ def NAME # _M2Z2Z_BtoH_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _M2Z2Z_BtoH, 0>;
+
+ def : SME2_ZA_Tile_Vec_Multi_Multi_Pat<NAME # _M2Z2Z_BtoH, !cast<SDPatternOperator>(op # "_2x2"), timm32_0_1, nxv16i8>;
}
// FP8 SME FDOT instructions
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4-fp8.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4-fp8.ll
new file mode 100644
index 0000000000000..5a0cf8e57904b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-mop4-fp8.ll
@@ -0,0 +1,98 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -force-streaming -verify-machineinstrs < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define void @mop4a_za16_fp8_1x1(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za16_fp8_1x1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za0.h, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fp8.fmop4a.za16.1x1(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4a_za16_fp8_1x2(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
+; CHECK-LABEL: mop4a_za16_fp8_1x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za0.h, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fp8.fmop4a.za16.1x2(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ ret void
+}
+
+define void @mop4a_za16_fp8_2x1(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za16_fp8_2x1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmop4a za0.h, { z0.b, z1.b }, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fp8.fmop4a.za16.2x1(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4a_za16_fp8_2x2(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
+; CHECK-LABEL: mop4a_za16_fp8_2x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z3.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmop4a za0.h, { z0.b, z1.b }, { z24.b, z25.b }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fp8.fmop4a.za16.2x2(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ ret void
+}
+
+define void @mop4a_za32_fp8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_fp8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za0.s, z0.b, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fp8.fmop4a.za32.1x1(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4a_za32_fp8_1x2(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
+; CHECK-LABEL: mop4a_za32_fp8_1x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z2.d
+; CHECK-NEXT: mov z24.d, z1.d
+; CHECK-NEXT: fmop4a za0.s, z0.b, { z24.b, z25.b }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fp8.fmop4a.za32.1x2(i32 0, <vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ ret void
+}
+
+define void @mop4a_za32_fp8_2x1(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: mop4a_za32_fp8_2x1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmop4a za0.s, { z0.b, z1.b }, z24.b
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fp8.fmop4a.za32.2x1(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm)
+ ret void
+}
+
+define void @mop4a_za32_fp8_2x2(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2) #0 {
+; CHECK-LABEL: mop4a_za32_fp8_2x2:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov z25.d, z3.d
+; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: mov z24.d, z2.d
+; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: fmop4a za0.s, { z0.b, z1.b }, { z24.b, z25.b }
+; CHECK-NEXT: ret
+ call void @llvm.aarch64.sme.fp8.fmop4a.za32.2x2(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm1, <vscale x 16 x i8> %zm2)
+ ret void
+}
+
+attributes #0 = {nounwind "target-features" = "+sme-f8f16,+sme-f8f32,+sme2p1,+sme-mop4" }
>From 98ab691ddca939dadc4bff460cab6d1ff8f38fca Mon Sep 17 00:00:00 2001
From: Virginia Cangelosi <virginia.cangelosi at arm.com>
Date: Mon, 10 Mar 2025 09:44:16 +0000
Subject: [PATCH 2/3] Address review comments
---
clang/include/clang/Basic/arm_sme.td | 14 +++++++-------
llvm/include/llvm/IR/IntrinsicsAArch64.td | 15 +++++++--------
llvm/lib/Target/AArch64/SMEInstrFormats.td | 8 ++++----
3 files changed, 18 insertions(+), 19 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 62b95e19bf78a..bb501122d9526 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -324,19 +324,19 @@ let SMETargetGuard = "sme2,sme-mop4,sme-b16b16" in {
////////////////////////////////////////////////////////////////////////////////
// SME2 - FP8 FMOP4A, FMOP4S
-multiclass MOP4_FP8<string za, string t, list<ImmCheck> checks> {
- def _1x1 : Inst<"svmop4a" # "[_1x1]" # za # "[_{d}_{d}]", "vidd>", t, MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_1x1", [IsInOutZA, IsStreaming, IsOverloadNone], checks>;
- def _1x2 : Inst<"svmop4a" # "[_1x2]" # za # "[_{d}_{d}]", "vid2>", t, MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_1x2", [IsInOutZA, IsStreaming, IsOverloadNone], checks>;
- def _2x1 : Inst<"svmop4a" # "[_2x1]" # za # "[_{d}_{d}]", "vi2d>", t, MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_2x1", [IsInOutZA, IsStreaming, IsOverloadNone], checks>;
- def _2x2 : Inst<"svmop4a" # "[_2x2]" # za # "[_{d}_{d}]", "vi22>", t, MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_2x2", [IsInOutZA, IsStreaming, IsOverloadNone], checks>;
+multiclass MOP4_FP8<string za, list<ImmCheck> checks> {
+ def _1x1 : Inst<"svmop4a" # "[_1x1]" # za # "[_{d}_{d}]", "vidd>", "m", MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_1x1", [IsInOutZA, IsStreaming, IsOverloadNone], checks>;
+ def _1x2 : Inst<"svmop4a" # "[_1x2]" # za # "[_{d}_{d}]", "vid2>", "m", MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_1x2", [IsInOutZA, IsStreaming, IsOverloadNone], checks>;
+ def _2x1 : Inst<"svmop4a" # "[_2x1]" # za # "[_{d}_{d}]", "vi2d>", "m", MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_2x1", [IsInOutZA, IsStreaming, IsOverloadNone], checks>;
+ def _2x2 : Inst<"svmop4a" # "[_2x2]" # za # "[_{d}_{d}]", "vi22>", "m", MergeNone, "aarch64_sme_fp8_fmop4a" # za # "_2x2", [IsInOutZA, IsStreaming, IsOverloadNone], checks>;
}
let SMETargetGuard = "sme2,sme-mop4,sme-f8f32" in {
- defm SVMOP4A_FP8_ZA32 : MOP4_FP8<"_za32", "m", [ImmCheck<0, ImmCheck0_3>]>;
+ defm SVMOP4A_FP8_ZA32 : MOP4_FP8<"_za32", [ImmCheck<0, ImmCheck0_3>]>;
}
let SMETargetGuard = "sme2,sme-mop4,sme-f8f16" in {
- defm SVMOP4A_FP8_ZA16 : MOP4_FP8<"_za16", "m", [ImmCheck<0, ImmCheck0_1>]>;
+ defm SVMOP4A_FP8_ZA16 : MOP4_FP8<"_za16", [ImmCheck<0, ImmCheck0_1>]>;
}
////////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 7989b5c52b27b..eae23af98711a 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3131,14 +3131,13 @@ let TargetPrefix = "aarch64" in {
llvm_nxv16i8_ty],
[ImmArg<ArgIndex<0>>, IntrInaccessibleMemOnly, IntrHasSideEffects]>;
- def int_aarch64_sme_fp8_fmop4a_za16_1x1 : SME_FP8_OuterProduct_Intrinsic_Single_Single;
- def int_aarch64_sme_fp8_fmop4a_za32_1x1 : SME_FP8_OuterProduct_Intrinsic_Single_Single;
- def int_aarch64_sme_fp8_fmop4a_za16_1x2 : SME_FP8_OuterProduct_Intrinsic_Single_Multi;
- def int_aarch64_sme_fp8_fmop4a_za32_1x2 : SME_FP8_OuterProduct_Intrinsic_Single_Multi;
- def int_aarch64_sme_fp8_fmop4a_za16_2x1 : SME_FP8_OuterProduct_Intrinsic_Single_Multi;
- def int_aarch64_sme_fp8_fmop4a_za32_2x1 : SME_FP8_OuterProduct_Intrinsic_Single_Multi;
- def int_aarch64_sme_fp8_fmop4a_za16_2x2 : SME_FP8_OuterProduct_Intrinsic_Multi_Multi;
- def int_aarch64_sme_fp8_fmop4a_za32_2x2 : SME_FP8_OuterProduct_Intrinsic_Multi_Multi;
+ // 16 and 32 bit multi-vector floating point 8 Quarter Tile Quarter Product
+ foreach za = ["za16", "za32"] in {
+ def int_aarch64_sme_fp8_fmop4a_ # za # "_1x1" : SME_FP8_OuterProduct_Intrinsic_Single_Single;
+ def int_aarch64_sme_fp8_fmop4a_ # za # "_1x2" : SME_FP8_OuterProduct_Intrinsic_Single_Multi;
+ def int_aarch64_sme_fp8_fmop4a_ # za # "_2x1" : SME_FP8_OuterProduct_Intrinsic_Single_Multi;
+ def int_aarch64_sme_fp8_fmop4a_ # za # "_2x2" : SME_FP8_OuterProduct_Intrinsic_Multi_Multi;
+ }
class SME_AddVectorToTile_Intrinsic
: DefaultAttrsIntrinsic<[],
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index e5b896ede84c1..d701a12d31061 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -5783,7 +5783,7 @@ multiclass sme2_fmop4a_fp8_fp32_4way<string mnemonic, string op> {
def NAME # _MZZ_BtoS_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR8Mul2_Lo, ZPR8Mul2_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZZ_BtoS, 0>;
- def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_BtoS, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv16i8>;
+ def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_BtoS, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_3, nxv16i8>;
// Multiple and single vectors
def _M2ZZ_BtoS : sme2_fp8_fp32_quarter_tile_outer_product<0, 1, mnemonic, ZZ_b_mul_r_Lo, ZPR8Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_BtoS, 1>;
@@ -5797,7 +5797,7 @@ multiclass sme2_fmop4a_fp8_fp32_4way<string mnemonic, string op> {
def NAME # _MZ2Z_BtoS_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR8Mul2_Lo, ZZ_b_mul_r_Hi, SMEMatrixTileS>, SMEPseudo2Instr<NAME # _MZ2Z_BtoS, 0>;
- def : SME2_ZA_Tile_Vec_Single_Multi_Pat<NAME # _MZ2Z_BtoS, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv16i8>;
+ def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_BtoS, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_3, nxv16i8>;
// Multiple vectors
def _M2Z2Z_BtoS : sme2_fp8_fp32_quarter_tile_outer_product<1, 1, mnemonic, ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi>, SMEPseudo2Instr<NAME # _M2Z2Z_BtoS, 1>;
@@ -6048,7 +6048,7 @@ multiclass sme2_fmop4a_fp8_fp16_2way<string mnemonic, string op> {
def NAME # _MZZ_BtoH_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR8Mul2_Lo, ZPR8Mul2_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZZ_BtoH, 0>;
- def : SME2_ZA_Tile_TwoVec_Pat<NAME # _MZZ_BtoH, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_1, nxv16i8>;
+ def : SME2_ZA_Tile_Vec_Single_Single_Pat<NAME # _MZZ_BtoH, !cast<SDPatternOperator>(op # "_1x1"), timm32_0_1, nxv16i8>;
// Multiple and single vectors
def _M2ZZ_BtoH : sme2_fp8_fp16_quarter_tile_outer_product<0b0, 0b1, mnemonic, ZZ_b_mul_r_Lo, ZPR8Mul2_Hi>, SMEPseudo2Instr<NAME # _M2ZZ_BtoH, 1>;
@@ -6062,7 +6062,7 @@ multiclass sme2_fmop4a_fp8_fp16_2way<string mnemonic, string op> {
def NAME # _MZ2Z_BtoH_PSEUDO : sme2_quarter_tile_outer_product_pseudo<ZPR8Mul2_Lo, ZZ_b_mul_r_Hi, SMEMatrixTileH>, SMEPseudo2Instr<NAME # _MZ2Z_BtoH, 0>;
- def : SME2_ZA_Tile_Vec_Single_Multi_Pat<NAME # _MZ2Z_BtoH, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_1, nxv16i8>;
+ def : SME2_ZA_Tile_Vec_Multi_Pat<NAME # _MZ2Z_BtoH, !cast<SDPatternOperator>(op # "_1x2"), timm32_0_1, nxv16i8>;
// Multiple vectors
def _M2Z2Z_BtoH : sme2_fp8_fp16_quarter_tile_outer_product<0b1, 0b1, mnemonic, ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi>, SMEPseudo2Instr<NAME # _M2Z2Z_BtoH, 1>;
>From c8368de86b60c12cf0132b9b8ba5d427d75828a9 Mon Sep 17 00:00:00 2001
From: Virginia Cangelosi <virginia.cangelosi at arm.com>
Date: Mon, 14 Apr 2025 10:23:40 +0000
Subject: [PATCH 3/3] Fix tests
---
.../sme2-intrinsics/acle_sme2_mop4_fp8.c | 24 +++++++++----------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_fp8.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_fp8.c
index 24fa11538dd32..f89277bb049f8 100644
--- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_fp8.c
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_mop4_fp8.c
@@ -2,17 +2,17 @@
// REQUIRES: aarch64-registered-target
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -triple aarch64 -target-feature +sme-f8f32 -target-feature +sme-f8f16 -target-feature +sme-mop4 -target-feature +sme -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
#include <arm_sme.h>
#ifdef SME_OVERLOADED_FORMS
-#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#define SME_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
#else
-#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#define SME_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
#endif
// CHECK-LABEL: define dso_local void @test_svmop4a_1x1_za16_mf8_mf8_fpm(
@@ -30,7 +30,7 @@
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za16_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+ SME_ACLE_FUNC(svmop4a,_1x1,_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}
// CHECK-LABEL: define dso_local void @test_svmop4a_1x2_za16_mf8_mf8_fpm(
@@ -48,7 +48,7 @@ void test_svmop4a_1x1_za16_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za16_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+ SME_ACLE_FUNC(svmop4a,_1x2,_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}
// CHECK-LABEL: define dso_local void @test_svmop4a_2x1_za16_mf8_mf8_fpm(
@@ -66,7 +66,7 @@ void test_svmop4a_1x2_za16_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8x2_t zm, fpm_t f
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_2x1_za16_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_2x1_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+ SME_ACLE_FUNC(svmop4a,_2x1,_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}
// CHECK-LABEL: define dso_local void @test_svmop4a_2x2_za16_mf8_mf8_fpm(
@@ -84,7 +84,7 @@ void test_svmop4a_2x1_za16_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8_t zm, fpm_t f
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_2x2_za16_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_2x2_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+ SME_ACLE_FUNC(svmop4a,_2x2,_za16,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}
// CHECK-LABEL: define dso_local void @test_svmop4a_1x1_za32_mf8_mf8_fpm(
@@ -102,7 +102,7 @@ void test_svmop4a_2x2_za16_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8x2_t zm, fpm_t
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x1_za32_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x1_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+ SME_ACLE_FUNC(svmop4a,_1x1,_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}
// CHECK-LABEL: define dso_local void @test_svmop4a_1x2_za32_mf8_mf8_fpm(
@@ -120,7 +120,7 @@ void test_svmop4a_1x1_za32_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8_t zm, fpm_t fpm
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_1x2_za32_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_1x2_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+ SME_ACLE_FUNC(svmop4a,_1x2,_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}
// CHECK-LABEL: define dso_local void @test_svmop4a_2x1_za32_mf8_mf8_fpm(
@@ -138,7 +138,7 @@ void test_svmop4a_1x2_za32_mf8_mf8_fpm(svmfloat8_t zn, svmfloat8x2_t zm, fpm_t f
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_2x1_za32_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_2x1_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+ SME_ACLE_FUNC(svmop4a,_2x1,_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}
// CHECK-LABEL: define dso_local void @test_svmop4a_2x2_za32_mf8_mf8_fpm(
@@ -156,5 +156,5 @@ void test_svmop4a_2x1_za32_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8_t zm, fpm_t f
// CPP-CHECK-NEXT: ret void
//
void test_svmop4a_2x2_za32_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8x2_t zm, fpm_t fpmr) __arm_streaming __arm_inout("za") {
- SME_ACLE_FUNC(svmop4a_2x2_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
+ SME_ACLE_FUNC(svmop4a,_2x2,_za32,_mf8_mf8,_fpm)(1, zn, zm, fpmr);
}
More information about the llvm-commits
mailing list