[clang] [llvm] [Clang][AArch64] Implement widening FMMLA intrinsics (PR #165282)
Amina Chabane via llvm-commits
llvm-commits at lists.llvm.org
Thu Nov 20 06:04:39 PST 2025
https://github.com/Amichaxx updated https://github.com/llvm/llvm-project/pull/165282
>From 9004ff2f276a1c84043577a97721049343ae8bed Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Mon, 27 Oct 2025 16:49:03 +0000
Subject: [PATCH 1/6] Implement widening FMMLA intrinsics
- F16 to F32
- MF8 to F32
- MF8 to F16
---
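For reference, a minimal sketch (not part of the patch) of how the new ACLE
intrinsics are called at the C level, matching the prototypes added to
arm_sve.td and exercised by the tests below; the wrapper function names are
illustrative only:

  #include <arm_sve.h>

  // Widening f16 -> f32 matrix multiply-accumulate (requires +sve-f16f32mm).
  svfloat32_t widen_f16_to_f32(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
    return svmmla_f32_f16(acc, a, b);
  }

  // mf8 -> f32 (requires +sve2 and +f8f32mm); the FP8 behaviour is taken from
  // the FPMR value passed as the trailing fpm_t argument.
  svfloat32_t widen_mf8_to_f32(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b,
                               fpm_t fpm) {
    return svmmla_f32_mf8_fpm(acc, a, b, fpm);
  }

  // mf8 -> f16 (requires +sve2 and +f8f16mm).
  svfloat16_t widen_mf8_to_f16(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b,
                               fpm_t fpm) {
    return svmmla_f16_mf8_fpm(acc, a, b, fpm);
  }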
clang/include/clang/Basic/arm_sve.td | 12 ++++++
.../sve-intrinsics/acle_sve_fmmla-f32f16.c | 33 +++++++++++++++
.../sve2-intrinsics/acle_sve2_fmmla-f16mf8.c | 35 ++++++++++++++++
.../sve2-intrinsics/acle_sve2_fmmla-f32mf8.c | 36 ++++++++++++++++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 14 +++++++
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 6 ++-
llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll | 32 +++++++++++++++
.../test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll | 39 ++++++++++++++++++
.../test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll | 41 +++++++++++++++++++
9 files changed, 247 insertions(+), 1 deletion(-)
create mode 100644 clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c
create mode 100644 clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
create mode 100644 clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
create mode 100644 llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
create mode 100644 llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index d2b7b78b9970f..c63da3308d6a0 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1196,6 +1196,18 @@ def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla
let SVETargetGuard = "f64mm", SMETargetGuard = InvalidMode in {
def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla">;
+let SVETargetGuard = "sve-f16f32mm", SMETargetGuard = InvalidMode in {
+ def SVMLLA_F32_F16 : SInst<"svmmla[_f32_f16]", "MMdd", "h", MergeNone, "aarch64_sve_fmmla_f16f32", [IsOverloadNone]>;
+}
+
+let SVETargetGuard = "sve2,f8f32mm", SMETargetGuard = InvalidMode in {
+ def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "MM~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f32", [IsOverloadNone]>;
+}
+
+let SVETargetGuard = "sve2,f8f16mm", SMETargetGuard = InvalidMode in {
+ def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "OO~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f16", [IsOverloadNone]>;
+}
+
def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">;
def SVTRN2Q : SInst<"svtrn2q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn2q">;
def SVUZP1Q : SInst<"svuzp1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_uzp1q">;
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c
new file mode 100644
index 0000000000000..bebaa059e5c84
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c
@@ -0,0 +1,33 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve-f16f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32f16(
+// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32f16u13__SVFloat32_tu13__SVFloat16_tS0_(
+// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_f32f16(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
+ return SVE_ACLE_FUNC(svmmla, _f32_f16, , )(acc, a, b);
+}
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
new file mode 100644
index 0000000000000..a19ad0576bb4b
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
@@ -0,0 +1,35 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f16mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_f16mf8(
+// CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z11test_f16mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
+// CPP-CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
+//
+svfloat16_t test_f16mf8(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
+ return SVE_ACLE_FUNC(svmmla, _f16_mf8, _fpm, )(acc, a, b, fpmr);
+}
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
new file mode 100644
index 0000000000000..526f2b1f45927
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
@@ -0,0 +1,36 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sve -target-feature +sve2 -target-feature +f8f32mm -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#endif
+
+// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32mf8(
+// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
+// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
+//
+svfloat32_t test_f32mf8(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
+ return SVE_ACLE_FUNC(svmmla, _f32_mf8, _fpm, )(acc, a, b, fpmr);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b81edc385cd43..832f97fc95959 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2807,6 +2807,20 @@ def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic;
//
def int_aarch64_sve_fmmla : AdvSIMD_3VectorArg_Intrinsic;
+def int_aarch64_sve_fmmla_f16f32
+ : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
+ [ llvm_nxv4f32_ty, llvm_nxv8f16_ty, llvm_nxv8f16_ty ],
+ [IntrNoMem]>;
+
+def int_aarch64_sve_fmmla_mf8f32
+ : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
+ [ llvm_nxv4f32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ],
+ [IntrNoMem]>;
+
+def int_aarch64_sve_fmmla_mf8f16
+ : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty],
+ [ llvm_nxv8f16_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ],
+ [IntrNoMem]>;
//
// SVE ACLE: 7.2. BFloat16 extensions
//
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 3b268dcbca600..c756873d0bf7e 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3684,7 +3684,7 @@ let Predicates = [HasSVE, HasMatMulFP32] in {
} // End HasSVE, HasMatMulFP32
let Predicates = [HasSVE_F16F32MM] in {
- def FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16>;
+ defm FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16, int_aarch64_sve_fmmla_f16f32, nxv4f32, nxv8f16>;
} // End HasSVE_F16F32MM
let Predicates = [HasSVE, HasMatMulFP64] in {
@@ -4745,10 +4745,14 @@ defm FMLALLTT_ZZZ : sve2_fp8_mla<0b011, ZPR32, "fmlalltt", nxv4f32, int_aarch64_
let Predicates = [HasSVE2, HasF8F32MM] in {
def FMMLA_ZZZ_BtoS : sve2_fp8_mmla<0b0, ZPR32, "fmmla">;
+ def : Pat<(nxv4f32 (int_aarch64_sve_fmmla_mf8f32 nxv4f32:$acc, nxv16i8:$zn, nxv16i8:$zm)),
+ (FMMLA_ZZZ_BtoS $acc, $zn, $zm)>;
}
let Predicates = [HasSVE2, HasF8F16MM] in {
def FMMLA_ZZZ_BtoH : sve2_fp8_mmla<0b1, ZPR16, "fmmla">;
+ def : Pat<(nxv8f16 (int_aarch64_sve_fmmla_mf8f16 nxv8f16:$acc, nxv16i8:$zn, nxv16i8:$zm)),
+ (FMMLA_ZZZ_BtoH $acc, $zn, $zm)>;
}
let Predicates = [HasSSVE_FP8DOT2] in {
diff --git a/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll
new file mode 100644
index 0000000000000..ea636d65a479c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve-f16f32mm < %s | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 4 x float> @_Z1tu13__SVFloat32_tu13__SVFloat16_tS0_(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
+; CHECK-LABEL: _Z1tu13__SVFloat32_tu13__SVFloat16_tS0_:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: str z0, [sp, #2, mul vl]
+; CHECK-NEXT: fmmla z0.s, z1.h, z2.h
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z2, [sp]
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %acc.addr = alloca <vscale x 4 x float>, align 16
+ %a.addr = alloca <vscale x 8 x half>, align 16
+ %b.addr = alloca <vscale x 8 x half>, align 16
+ store <vscale x 4 x float> %acc, ptr %acc.addr, align 16
+ store <vscale x 8 x half> %a, ptr %a.addr, align 16
+ store <vscale x 8 x half> %b, ptr %b.addr, align 16
+ %0 = load <vscale x 4 x float>, ptr %acc.addr, align 16
+ %1 = load <vscale x 8 x half>, ptr %a.addr, align 16
+ %2 = load <vscale x 8 x half>, ptr %b.addr, align 16
+ %3 = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2)
+ ret <vscale x 4 x float> %3
+}
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>)
diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
new file mode 100644
index 0000000000000..0fdd6bf2508e3
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f16mm < %s | FileCheck %s --check-prefixes=CHECK
+
+define <vscale x 8 x half> @_Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m(<vscale x 8 x half> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i64 %fpmr) {
+; CHECK-LABEL: _Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl x8, sp, #3
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp]
+; CHECK-NEXT: str x0, [x8, #8]
+; CHECK-NEXT: msr FPMR, x0
+; CHECK-NEXT: fmmla z0.h, z1.b, z2.b
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %acc.addr = alloca <vscale x 8 x half>, align 16
+ %a.addr = alloca <vscale x 16 x i8>, align 16
+ %b.addr = alloca <vscale x 16 x i8>, align 16
+ %fpmr.addr = alloca i64, align 8
+ store <vscale x 8 x half> %acc, ptr %acc.addr, align 16
+ store <vscale x 16 x i8> %a, ptr %a.addr, align 16
+ store <vscale x 16 x i8> %b, ptr %b.addr, align 16
+ store i64 %fpmr, ptr %fpmr.addr, align 8
+ %0 = load <vscale x 8 x half>, ptr %acc.addr, align 16
+ %1 = load <vscale x 16 x i8>, ptr %a.addr, align 16
+ %2 = load <vscale x 16 x i8>, ptr %b.addr, align 16
+ %3 = load i64, ptr %fpmr.addr, align 8
+ call void @llvm.aarch64.set.fpmr(i64 %3)
+ %4 = call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2)
+ ret <vscale x 8 x half> %4
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half>, <vscale x 16 x i8>, <vscale x 16 x i8>)
diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll
new file mode 100644
index 0000000000000..007a164ac77da
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f32mm < %s | FileCheck %s --check-prefixes=CHECK
+
+define dso_local <vscale x 4 x float> @_Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m(<vscale x 4 x float> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i64 noundef %fpmr) #0 {
+; CHECK-LABEL: _Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-3
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: addvl x8, sp, #3
+; CHECK-NEXT: str z1, [sp, #1, mul vl]
+; CHECK-NEXT: str z0, [sp, #2, mul vl]
+; CHECK-NEXT: str z2, [sp]
+; CHECK-NEXT: str x0, [x8, #8]
+; CHECK-NEXT: msr FPMR, x0
+; CHECK-NEXT: fmmla z0.s, z1.b, z2.b
+; CHECK-NEXT: addvl sp, sp, #3
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %acc.addr = alloca <vscale x 4 x float>, align 16
+ %a.addr = alloca <vscale x 16 x i8>, align 16
+ %b.addr = alloca <vscale x 16 x i8>, align 16
+ %fpmr.addr = alloca i64, align 8
+ store <vscale x 4 x float> %acc, ptr %acc.addr, align 16
+ store <vscale x 16 x i8> %a, ptr %a.addr, align 16
+ store <vscale x 16 x i8> %b, ptr %b.addr, align 16
+ store i64 %fpmr, ptr %fpmr.addr, align 8
+ %0 = load <vscale x 4 x float>, ptr %acc.addr, align 16
+ %1 = load <vscale x 16 x i8>, ptr %a.addr, align 16
+ %2 = load <vscale x 16 x i8>, ptr %b.addr, align 16
+ %3 = load i64, ptr %fpmr.addr, align 8
+ call void @llvm.aarch64.set.fpmr(i64 %3)
+ %4 = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2)
+ ret <vscale x 4 x float> %4
+}
+
+declare void @llvm.aarch64.set.fpmr(i64)
+
+declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i8>)
>From 4b3703398a6d3ebac4eb1a9e6b46022f61f06da1 Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Wed, 12 Nov 2025 14:38:05 +0000
Subject: [PATCH 2/6] Implement overloading for the fmmla intrinsics
- Implemented overloading for the fmmla intrinsics, replacing the fixed-type
  intrinsics
- Prototype cleanups
- Updated .ll tests to remove unnecessary IR
- Removed unused arguments in clang test macros
- Removed redundant check lines in .ll tests
---
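A brief sketch (not part of the patch, assuming the usual ACLE overloading
rules) of the type-generic C spellings these definitions also provide;
whichever spelling is used, the calls now lower to the single overloaded LLVM
intrinsic whose name carries the vector types, e.g.
@llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16 as checked in the updated tests:

  #include <arm_sve.h>

  // Type-generic spellings; the variant is selected from the argument types.
  // Wrapper names are illustrative only.
  svfloat32_t mmla_generic(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
    return svmmla(acc, a, b);          // svmmla[_f32_f16]
  }

  svfloat16_t mmla_generic_fp8(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b,
                               fpm_t fpm) {
    return svmmla_fpm(acc, a, b, fpm); // svmmla[_f16_mf8] with the _fpm suffix
  }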
---
clang/include/clang/Basic/arm_sve.td | 10 ++---
.../sve-intrinsics/acle_sve_fmmla-f32f16.c | 12 +++---
.../sve-intrinsics/acle_sve_matmul_fp32.c | 4 +-
.../sve-intrinsics/acle_sve_matmul_fp64.c | 4 +-
.../sve2-intrinsics/acle_sve2_fmmla-f16mf8.c | 10 ++---
.../sve2-intrinsics/acle_sve2_fmmla-f32mf8.c | 10 ++---
llvm/include/llvm/IR/IntrinsicsAArch64.td | 18 ++-------
.../lib/Target/AArch64/AArch64SVEInstrInfo.td | 10 ++---
llvm/lib/Target/AArch64/SVEInstrFormats.td | 6 +++
llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll | 30 +++-----------
.../test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll | 37 +++---------------
.../test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll | 39 +++----------------
12 files changed, 53 insertions(+), 137 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index c63da3308d6a0..3901c88323ff4 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1190,22 +1190,22 @@ def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarc
}
let SVETargetGuard = "f32mm", SMETargetGuard = InvalidMode in {
-def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla">;
+def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>;
}
let SVETargetGuard = "f64mm", SMETargetGuard = InvalidMode in {
-def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla">;
+def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>;
let SVETargetGuard = "sve-f16f32mm", SMETargetGuard = InvalidMode in {
- def SVMLLA_F32_F16 : SInst<"svmmla[_f32_f16]", "MMdd", "h", MergeNone, "aarch64_sve_fmmla_f16f32", [IsOverloadNone]>;
+ def SVMLLA_F32_F16 : SInst<"svmmla[_f32_f16]", "ddhh", "f", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>;
}
let SVETargetGuard = "sve2,f8f32mm", SMETargetGuard = InvalidMode in {
- def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "MM~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f32", [IsOverloadNone]>;
+ def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>;
}
let SVETargetGuard = "sve2,f8f16mm", SMETargetGuard = InvalidMode in {
- def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "OO~~>", "m", MergeNone, "aarch64_sve_fmmla_mf8f16", [IsOverloadNone]>;
+ def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "dd~~>", "h", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>;
}
def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">;
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c
index bebaa059e5c84..ef74024f7b091 100644
--- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_fmmla-f32f16.c
@@ -10,24 +10,24 @@
#include <arm_sve.h>
#ifdef SVE_OVERLOADED_FORMS
-// A simple used,unused... macro, long enough to represent any SVE builtin.
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#define SVE_ACLE_FUNC(A1, A3) A1##A3
#else
-#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#define SVE_ACLE_FUNC(A1, A2) A1##A2
#endif
+
// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32f16(
// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32f16u13__SVFloat32_tu13__SVFloat16_tS0_(
// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 8 x half> [[A:%.*]], <vscale x 8 x half> [[B:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16(<vscale x 4 x float> [[ACC]], <vscale x 8 x half> [[A]], <vscale x 8 x half> [[B]])
// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
//
svfloat32_t test_f32f16(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
- return SVE_ACLE_FUNC(svmmla, _f32_f16, , )(acc, a, b);
+ return SVE_ACLE_FUNC(svmmla, _f32_f16)(acc, a, b);
}
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp32.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp32.c
index 10442f4e31153..7d1efb7b6d954 100644
--- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp32.c
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp32.c
@@ -17,12 +17,12 @@
// CHECK-LABEL: @test_svmmla_f32(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
//
// CPP-CHECK-LABEL: @_Z15test_svmmla_f32u13__SVFloat32_tS_S_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv4f32(<vscale x 4 x float> [[X:%.*]], <vscale x 4 x float> [[Y:%.*]], <vscale x 4 x float> [[Z:%.*]])
// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
//
svfloat32_t test_svmmla_f32(svfloat32_t x, svfloat32_t y, svfloat32_t z) {
diff --git a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp64.c b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp64.c
index 8586a65fa240f..da211c4fba324 100644
--- a/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp64.c
+++ b/clang/test/CodeGen/AArch64/sve-intrinsics/acle_sve_matmul_fp64.c
@@ -17,12 +17,12 @@
// CHECK-LABEL: @test_svmmla_f64(
// CHECK-NEXT: entry:
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
// CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
//
// CPP-CHECK-LABEL: @_Z15test_svmmla_f64u13__SVFloat64_tS_S_(
// CPP-CHECK-NEXT: entry:
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.fmmla.nxv2f64.nxv2f64(<vscale x 2 x double> [[X:%.*]], <vscale x 2 x double> [[Y:%.*]], <vscale x 2 x double> [[Z:%.*]])
// CPP-CHECK-NEXT: ret <vscale x 2 x double> [[TMP0]]
//
svfloat64_t test_svmmla_f64(svfloat64_t x, svfloat64_t y, svfloat64_t z) {
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
index a19ad0576bb4b..81f5968cd5d66 100644
--- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
@@ -11,25 +11,25 @@
#ifdef SVE_OVERLOADED_FORMS
// A simple used,unused... macro, long enough to represent any SVE builtin.
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3) A1##A3
#else
-#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#define SVE_ACLE_FUNC(A1, A2, A3) A1##A2##A3
#endif
// CHECK-LABEL: define dso_local <vscale x 8 x half> @test_f16mf8(
// CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.nxv8f16.nxv16i8(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z11test_f16mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
// CPP-CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.nxv8f16.nxv16i8(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
//
svfloat16_t test_f16mf8(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
- return SVE_ACLE_FUNC(svmmla, _f16_mf8, _fpm, )(acc, a, b, fpmr);
+ return SVE_ACLE_FUNC(svmmla, _f16_mf8, _fpm)(acc, a, b, fpmr);
}
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
index 526f2b1f45927..8af71a6a0500f 100644
--- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
@@ -12,25 +12,25 @@
#ifdef SVE_OVERLOADED_FORMS
// A simple used,unused... macro, long enough to represent any SVE builtin.
-#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
+#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3) A1##A3
#else
-#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
+#define SVE_ACLE_FUNC(A1, A2, A3) A1##A2##A3
#endif
// CHECK-LABEL: define dso_local <vscale x 4 x float> @test_f32mf8(
// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv16i8(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv16i8(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
//
svfloat32_t test_f32mf8(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
- return SVE_ACLE_FUNC(svmmla, _f32_mf8, _fpm, )(acc, a, b, fpmr);
+ return SVE_ACLE_FUNC(svmmla, _f32_mf8, _fpm)(acc, a, b, fpmr);
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 832f97fc95959..c1c202c9bd64e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2805,22 +2805,12 @@ def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic;
//
// SVE ACLE: 7.4/5. FP64/FP32 matrix multiply extensions
//
-def int_aarch64_sve_fmmla : AdvSIMD_3VectorArg_Intrinsic;
-def int_aarch64_sve_fmmla_f16f32
- : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
- [ llvm_nxv4f32_ty, llvm_nxv8f16_ty, llvm_nxv8f16_ty ],
- [IntrNoMem]>;
-
-def int_aarch64_sve_fmmla_mf8f32
- : DefaultAttrsIntrinsic<[llvm_nxv4f32_ty],
- [ llvm_nxv4f32_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ],
- [IntrNoMem]>;
+def int_aarch64_sve_fmmla
+ : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+ [ LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1> ],
+ [ IntrNoMem ]>;
-def int_aarch64_sve_fmmla_mf8f16
- : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty],
- [ llvm_nxv8f16_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty ],
- [IntrNoMem]>;
//
// SVE ACLE: 7.2. BFloat16 extensions
//
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index c756873d0bf7e..37823f8795f6b 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3684,7 +3684,7 @@ let Predicates = [HasSVE, HasMatMulFP32] in {
} // End HasSVE, HasMatMulFP32
let Predicates = [HasSVE_F16F32MM] in {
- defm FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16, int_aarch64_sve_fmmla_f16f32, nxv4f32, nxv8f16>;
+ defm FMLLA_ZZZ_HtoS : sve_fp_matrix_mla<0b001, "fmmla", ZPR32, ZPR16, int_aarch64_sve_fmmla, nxv4f32, nxv8f16>;
} // End HasSVE_F16F32MM
let Predicates = [HasSVE, HasMatMulFP64] in {
@@ -4744,15 +4744,11 @@ defm FMLALLTT_ZZZ : sve2_fp8_mla<0b011, ZPR32, "fmlalltt", nxv4f32, int_aarch64_
} // End HasSSVE_FP8FMA
let Predicates = [HasSVE2, HasF8F32MM] in {
- def FMMLA_ZZZ_BtoS : sve2_fp8_mmla<0b0, ZPR32, "fmmla">;
- def : Pat<(nxv4f32 (int_aarch64_sve_fmmla_mf8f32 nxv4f32:$acc, nxv16i8:$zn, nxv16i8:$zm)),
- (FMMLA_ZZZ_BtoS $acc, $zn, $zm)>;
+ defm FMMLA_ZZZ_BtoS : sve2_fp8_fmmla<0b0, ZPR32, "fmmla", nxv4f32>;
}
let Predicates = [HasSVE2, HasF8F16MM] in {
- def FMMLA_ZZZ_BtoH : sve2_fp8_mmla<0b1, ZPR16, "fmmla">;
- def : Pat<(nxv8f16 (int_aarch64_sve_fmmla_mf8f16 nxv8f16:$acc, nxv16i8:$zn, nxv16i8:$zm)),
- (FMMLA_ZZZ_BtoH $acc, $zn, $zm)>;
+ defm FMMLA_ZZZ_BtoH : sve2_fp8_fmmla<0b1, ZPR16, "fmmla", nxv8f16>;
}
let Predicates = [HasSSVE_FP8DOT2] in {
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 1664f4ad0c8fa..0694b623bfa67 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -11143,6 +11143,12 @@ class sve2_fp8_mmla<bit opc, ZPRRegOp dst_ty, string mnemonic>
let Uses = [FPMR, FPCR];
}
+multiclass sve2_fp8_fmmla<bits<1> opc, ZPRRegOp zprty, string mnemonic, ValueType ResVT> {
+ def NAME : sve2_fp8_mmla<opc, zprty, mnemonic>;
+ def : Pat<(ResVT (int_aarch64_sve_fmmla ResVT:$acc, nxv16i8:$zn, nxv16i8:$zm)),
+ (!cast<Instruction>(NAME) $acc, $zn, $zm)>;
+}
+
class sve_fp8_dot_indexed<bits<4> opc, ZPRRegOp dst_ty, Operand iop_ty, string mnemonic>
: I<(outs dst_ty:$Zda), (ins dst_ty:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, iop_ty:$iop),
mnemonic, "\t$Zda, $Zn, $Zm$iop", "", []>, Sched<[]> {
diff --git a/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll
index ea636d65a479c..aa856a557d1ec 100644
--- a/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fmmla-f32f16.ll
@@ -1,32 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve-f16f32mm < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve-f16f32mm < %s | FileCheck %s
-define <vscale x 4 x float> @_Z1tu13__SVFloat32_tu13__SVFloat16_tS0_(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b) {
-; CHECK-LABEL: _Z1tu13__SVFloat32_tu13__SVFloat16_tS0_:
+define <vscale x 4 x float> @fmmla_f32f16(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b) #0 {
+; CHECK-LABEL: fmmla_f32f16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-3
-; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: str z0, [sp, #2, mul vl]
; CHECK-NEXT: fmmla z0.s, z1.h, z2.h
-; CHECK-NEXT: str z1, [sp, #1, mul vl]
-; CHECK-NEXT: str z2, [sp]
-; CHECK-NEXT: addvl sp, sp, #3
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
- %acc.addr = alloca <vscale x 4 x float>, align 16
- %a.addr = alloca <vscale x 8 x half>, align 16
- %b.addr = alloca <vscale x 8 x half>, align 16
- store <vscale x 4 x float> %acc, ptr %acc.addr, align 16
- store <vscale x 8 x half> %a, ptr %a.addr, align 16
- store <vscale x 8 x half> %b, ptr %b.addr, align 16
- %0 = load <vscale x 4 x float>, ptr %acc.addr, align 16
- %1 = load <vscale x 8 x half>, ptr %a.addr, align 16
- %2 = load <vscale x 8 x half>, ptr %b.addr, align 16
- %3 = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float> %0, <vscale x 8 x half> %1, <vscale x 8 x half> %2)
- ret <vscale x 4 x float> %3
+ %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16(<vscale x 4 x float> %acc, <vscale x 8 x half> %a, <vscale x 8 x half> %b)
+ ret <vscale x 4 x float> %out
}
-
-declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.f16f32(<vscale x 4 x float>, <vscale x 8 x half>, <vscale x 8 x half>)
diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
index 0fdd6bf2508e3..99e23e7ab9fd5 100644
--- a/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
@@ -1,39 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f16mm < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f16mm < %s | FileCheck %s
-define <vscale x 8 x half> @_Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m(<vscale x 8 x half> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i64 %fpmr) {
-; CHECK-LABEL: _Z5test2u13__SVFloat16_tu13__SVMfloat8_tS0_m:
+define <vscale x 8 x half> @fmmla_f16mf8(<vscale x 8 x half> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i64 %fpmr) {
+; CHECK-LABEL: fmmla_f16mf8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-3
-; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: addvl x8, sp, #3
-; CHECK-NEXT: str z1, [sp, #1, mul vl]
-; CHECK-NEXT: str z0, [sp, #2, mul vl]
-; CHECK-NEXT: str z2, [sp]
-; CHECK-NEXT: str x0, [x8, #8]
-; CHECK-NEXT: msr FPMR, x0
; CHECK-NEXT: fmmla z0.h, z1.b, z2.b
-; CHECK-NEXT: addvl sp, sp, #3
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
- %acc.addr = alloca <vscale x 8 x half>, align 16
- %a.addr = alloca <vscale x 16 x i8>, align 16
- %b.addr = alloca <vscale x 16 x i8>, align 16
- %fpmr.addr = alloca i64, align 8
- store <vscale x 8 x half> %acc, ptr %acc.addr, align 16
- store <vscale x 16 x i8> %a, ptr %a.addr, align 16
- store <vscale x 16 x i8> %b, ptr %b.addr, align 16
- store i64 %fpmr, ptr %fpmr.addr, align 8
- %0 = load <vscale x 8 x half>, ptr %acc.addr, align 16
- %1 = load <vscale x 16 x i8>, ptr %a.addr, align 16
- %2 = load <vscale x 16 x i8>, ptr %b.addr, align 16
- %3 = load i64, ptr %fpmr.addr, align 8
- call void @llvm.aarch64.set.fpmr(i64 %3)
- %4 = call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2)
- ret <vscale x 8 x half> %4
+ %out = call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+ ret <vscale x 8 x half> %out
}
-
-declare <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half>, <vscale x 16 x i8>, <vscale x 16 x i8>)
diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll
index 007a164ac77da..503baf484ecd6 100644
--- a/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll
@@ -1,41 +1,12 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f32mm < %s | FileCheck %s --check-prefixes=CHECK
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+sve2,+f8f32mm < %s | FileCheck %s
-define dso_local <vscale x 4 x float> @_Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m(<vscale x 4 x float> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i64 noundef %fpmr) #0 {
-; CHECK-LABEL: _Z5t_varu13__SVFloat32_tu13__SVMfloat8_tS0_m:
+define dso_local <vscale x 4 x float> @fmmla_f32mf8(<vscale x 4 x float> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b, i64 noundef %fpmr) #0 {
+; CHECK-LABEL: fmmla_f32mf8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-3
-; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 16 + 24 * VG
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: addvl x8, sp, #3
-; CHECK-NEXT: str z1, [sp, #1, mul vl]
-; CHECK-NEXT: str z0, [sp, #2, mul vl]
-; CHECK-NEXT: str z2, [sp]
-; CHECK-NEXT: str x0, [x8, #8]
-; CHECK-NEXT: msr FPMR, x0
; CHECK-NEXT: fmmla z0.s, z1.b, z2.b
-; CHECK-NEXT: addvl sp, sp, #3
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT: ret
entry:
- %acc.addr = alloca <vscale x 4 x float>, align 16
- %a.addr = alloca <vscale x 16 x i8>, align 16
- %b.addr = alloca <vscale x 16 x i8>, align 16
- %fpmr.addr = alloca i64, align 8
- store <vscale x 4 x float> %acc, ptr %acc.addr, align 16
- store <vscale x 16 x i8> %a, ptr %a.addr, align 16
- store <vscale x 16 x i8> %b, ptr %b.addr, align 16
- store i64 %fpmr, ptr %fpmr.addr, align 8
- %0 = load <vscale x 4 x float>, ptr %acc.addr, align 16
- %1 = load <vscale x 16 x i8>, ptr %a.addr, align 16
- %2 = load <vscale x 16 x i8>, ptr %b.addr, align 16
- %3 = load i64, ptr %fpmr.addr, align 8
- call void @llvm.aarch64.set.fpmr(i64 %3)
- %4 = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> %0, <vscale x 16 x i8> %1, <vscale x 16 x i8> %2)
- ret <vscale x 4 x float> %4
+ %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+ ret <vscale x 4 x float> %out
}
-
-declare void @llvm.aarch64.set.fpmr(i64)
-
-declare <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i8>)
>From 308d3229a14cd8361f75f4effac5122323f006b3 Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Thu, 13 Nov 2025 11:28:35 +0000
Subject: [PATCH 3/6] Added a new fp8 intrinsic in IntrinsicsAArch64.td and
 updated the tests accordingly; formatting cleanups
---
---
clang/include/clang/Basic/arm_sve.td | 4 ++--
.../AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c | 4 ++--
.../AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c | 4 ++--
llvm/include/llvm/IR/IntrinsicsAArch64.td | 11 ++++++++---
llvm/lib/Target/AArch64/SVEInstrFormats.td | 2 +-
llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll | 2 +-
llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll | 2 +-
7 files changed, 17 insertions(+), 12 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 3901c88323ff4..676366a9dfa1f 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1201,11 +1201,11 @@ let SVETargetGuard = "sve-f16f32mm", SMETargetGuard = InvalidMode in {
}
let SVETargetGuard = "sve2,f8f32mm", SMETargetGuard = InvalidMode in {
- def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>;
+ def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmmla", [IsOverloadCvt]>;
}
let SVETargetGuard = "sve2,f8f16mm", SMETargetGuard = InvalidMode in {
- def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "dd~~>", "h", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>;
+ def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmmla", [IsOverloadCvt]>;
}
def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">;
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
index 81f5968cd5d66..3baa4598cfc2b 100644
--- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
@@ -20,14 +20,14 @@
// CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.nxv8f16.nxv16i8(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmmla.nxv8f16.nxv16i8(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z11test_f16mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
// CPP-CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.nxv8f16.nxv16i8(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmmla.nxv8f16.nxv16i8(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
//
svfloat16_t test_f16mf8(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
index 8af71a6a0500f..5e9469c705c43 100644
--- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
@@ -21,14 +21,14 @@
// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv16i8(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmmla.nxv4f32.nxv16i8(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.nxv4f32.nxv16i8(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmmla.nxv4f32.nxv16i8(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
//
svfloat32_t test_f32mf8(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index c1c202c9bd64e..372116475987c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2807,9 +2807,14 @@ def int_aarch64_sve_sudot_lane : AdvSIMD_SVE_DOT_Indexed_Intrinsic;
//
def int_aarch64_sve_fmmla
- : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
- [ LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1> ],
- [ IntrNoMem ]>;
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
+ [IntrNoMem]>;
+
+def int_aarch64_sve_fp8_fmmla
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
//
// SVE ACLE: 7.2. BFloat16 extensions
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 0694b623bfa67..088b114a8fb98 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -11145,7 +11145,7 @@ class sve2_fp8_mmla<bit opc, ZPRRegOp dst_ty, string mnemonic>
multiclass sve2_fp8_fmmla<bits<1> opc, ZPRRegOp zprty, string mnemonic, ValueType ResVT> {
def NAME : sve2_fp8_mmla<opc, zprty, mnemonic>;
- def : Pat<(ResVT (int_aarch64_sve_fmmla ResVT:$acc, nxv16i8:$zn, nxv16i8:$zm)),
+ def : Pat<(ResVT (int_aarch64_sve_fp8_fmmla ResVT:$acc, nxv16i8:$zn, nxv16i8:$zm)),
(!cast<Instruction>(NAME) $acc, $zn, $zm)>;
}
diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
index 99e23e7ab9fd5..a7cfce3f2706f 100644
--- a/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f16mf8.ll
@@ -7,6 +7,6 @@ define <vscale x 8 x half> @fmmla_f16mf8(<vscale x 8 x half> %acc, <vscale x 16
; CHECK-NEXT: fmmla z0.h, z1.b, z2.b
; CHECK-NEXT: ret
entry:
- %out = call <vscale x 8 x half> @llvm.aarch64.sve.fmmla.mf8f16(<vscale x 8 x half> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+ %out = call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmmla.nxv8f16.nxv16i8(<vscale x 8 x half> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
ret <vscale x 8 x half> %out
}
diff --git a/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll
index 503baf484ecd6..32b36f12d6268 100644
--- a/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll
+++ b/llvm/test/CodeGen/AArch64/sve2-fmmla-f32mf8.ll
@@ -7,6 +7,6 @@ define dso_local <vscale x 4 x float> @fmmla_f32mf8(<vscale x 4 x float> %acc, <
; CHECK-NEXT: fmmla z0.s, z1.b, z2.b
; CHECK-NEXT: ret
entry:
- %out = call <vscale x 4 x float> @llvm.aarch64.sve.fmmla.mf8f32(<vscale x 4 x float> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+ %out = call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmmla.nxv4f32.nxv16i8(<vscale x 4 x float> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
ret <vscale x 4 x float> %out
}
>From eb6c3222e6a42614f17aca7067056dcba32301a3 Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Mon, 17 Nov 2025 11:28:38 +0000
Subject: [PATCH 4/6] Changed the source type to fp8; renamed the IsOverloadCvt
 flag to IsOverloadFirstandLast for clarity
---
---
clang/include/clang/Basic/TargetBuiltins.h | 2 +-
clang/include/clang/Basic/arm_sve.td | 34 +++++++++----------
clang/include/clang/Basic/arm_sve_sme_incl.td | 2 +-
clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 2 +-
llvm/include/llvm/IR/IntrinsicsAArch64.td | 2 +-
5 files changed, 21 insertions(+), 21 deletions(-)
diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
index 9bd514349d31d..ab380619dce20 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -397,7 +397,7 @@ namespace clang {
}
bool isOverloadDefault() const { return !(Flags & OverloadKindMask); }
bool isOverloadWhileRW() const { return Flags & IsOverloadWhileRW; }
- bool isOverloadCvt() const { return Flags & IsOverloadCvt; }
+ bool isOverloadFirstandLast() const { return Flags & IsOverloadFirstandLast; }
bool isPrefetch() const { return Flags & IsPrefetch; }
bool isReverseCompare() const { return Flags & ReverseCompare; }
bool isAppendSVALL() const { return Flags & IsAppendSVALL; }
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 676366a9dfa1f..84c7018bcb226 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -883,56 +883,56 @@ multiclass SInstCvtMX<string name, string m_types, string xz_types,
}
// svcvt_s##_f16
-defm SVFCVTZS_S16_F16 : SInstCvtMXZ<"svcvt_s16[_f16]", "ddPO", "dPO", "s", "aarch64_sve_fcvtzs", [IsOverloadCvt]>;
+defm SVFCVTZS_S16_F16 : SInstCvtMXZ<"svcvt_s16[_f16]", "ddPO", "dPO", "s", "aarch64_sve_fcvtzs", [IsOverloadFirstandLast]>;
defm SVFCVTZS_S32_F16 : SInstCvtMXZ<"svcvt_s32[_f16]", "ddPO", "dPO", "i", "aarch64_sve_fcvtzs_i32f16">;
defm SVFCVTZS_S64_F16 : SInstCvtMXZ<"svcvt_s64[_f16]", "ddPO", "dPO", "l", "aarch64_sve_fcvtzs_i64f16">;
// svcvt_s##_f32
-defm SVFCVTZS_S32_F32 : SInstCvtMXZ<"svcvt_s32[_f32]", "ddPM", "dPM", "i", "aarch64_sve_fcvtzs", [IsOverloadCvt]>;
+defm SVFCVTZS_S32_F32 : SInstCvtMXZ<"svcvt_s32[_f32]", "ddPM", "dPM", "i", "aarch64_sve_fcvtzs", [IsOverloadFirstandLast]>;
defm SVFCVTZS_S64_F32 : SInstCvtMXZ<"svcvt_s64[_f32]", "ddPM", "dPM", "l", "aarch64_sve_fcvtzs_i64f32">;
// svcvt_s##_f64
defm SVFCVTZS_S32_F64 : SInstCvtMXZ<"svcvt_s32[_f64]", "ttPd", "tPd", "d", "aarch64_sve_fcvtzs_i32f64">;
-defm SVFCVTZS_S64_F64 : SInstCvtMXZ<"svcvt_s64[_f64]", "ddPN", "dPN", "l", "aarch64_sve_fcvtzs", [IsOverloadCvt]>;
+defm SVFCVTZS_S64_F64 : SInstCvtMXZ<"svcvt_s64[_f64]", "ddPN", "dPN", "l", "aarch64_sve_fcvtzs", [IsOverloadFirstandLast]>;
// svcvt_u##_f16
-defm SVFCVTZU_U16_F16 : SInstCvtMXZ<"svcvt_u16[_f16]", "ddPO", "dPO", "Us", "aarch64_sve_fcvtzu", [IsOverloadCvt]>;
+defm SVFCVTZU_U16_F16 : SInstCvtMXZ<"svcvt_u16[_f16]", "ddPO", "dPO", "Us", "aarch64_sve_fcvtzu", [IsOverloadFirstandLast]>;
defm SVFCVTZU_U32_F16 : SInstCvtMXZ<"svcvt_u32[_f16]", "ddPO", "dPO", "Ui", "aarch64_sve_fcvtzu_i32f16">;
defm SVFCVTZU_U64_F16 : SInstCvtMXZ<"svcvt_u64[_f16]", "ddPO", "dPO", "Ul", "aarch64_sve_fcvtzu_i64f16">;
// svcvt_u##_f32
-defm SVFCVTZU_U32_F32 : SInstCvtMXZ<"svcvt_u32[_f32]", "ddPM", "dPM", "Ui", "aarch64_sve_fcvtzu", [IsOverloadCvt]>;
+defm SVFCVTZU_U32_F32 : SInstCvtMXZ<"svcvt_u32[_f32]", "ddPM", "dPM", "Ui", "aarch64_sve_fcvtzu", [IsOverloadFirstandLast]>;
defm SVFCVTZU_U64_F32 : SInstCvtMXZ<"svcvt_u64[_f32]", "ddPM", "dPM", "Ul", "aarch64_sve_fcvtzu_i64f32">;
// svcvt_u##_f64
defm SVFCVTZU_U32_F64 : SInstCvtMXZ<"svcvt_u32[_f64]", "zzPd", "zPd", "d", "aarch64_sve_fcvtzu_i32f64">;
-defm SVFCVTZU_U64_F64 : SInstCvtMXZ<"svcvt_u64[_f64]", "ddPN", "dPN", "Ul", "aarch64_sve_fcvtzu", [IsOverloadCvt]>;
+defm SVFCVTZU_U64_F64 : SInstCvtMXZ<"svcvt_u64[_f64]", "ddPN", "dPN", "Ul", "aarch64_sve_fcvtzu", [IsOverloadFirstandLast]>;
// svcvt_f16_s##
-defm SVFCVTZS_F16_S16 : SInstCvtMXZ<"svcvt_f16[_s16]", "OOPd", "OPd", "s", "aarch64_sve_scvtf", [IsOverloadCvt]>;
+defm SVFCVTZS_F16_S16 : SInstCvtMXZ<"svcvt_f16[_s16]", "OOPd", "OPd", "s", "aarch64_sve_scvtf", [IsOverloadFirstandLast]>;
defm SVFCVTZS_F16_S32 : SInstCvtMXZ<"svcvt_f16[_s32]", "OOPd", "OPd", "i", "aarch64_sve_scvtf_f16i32">;
defm SVFCVTZS_F16_S64 : SInstCvtMXZ<"svcvt_f16[_s64]", "OOPd", "OPd", "l", "aarch64_sve_scvtf_f16i64">;
// svcvt_f32_s##
-defm SVFCVTZS_F32_S32 : SInstCvtMXZ<"svcvt_f32[_s32]", "MMPd", "MPd", "i", "aarch64_sve_scvtf", [IsOverloadCvt]>;
+defm SVFCVTZS_F32_S32 : SInstCvtMXZ<"svcvt_f32[_s32]", "MMPd", "MPd", "i", "aarch64_sve_scvtf", [IsOverloadFirstandLast]>;
defm SVFCVTZS_F32_S64 : SInstCvtMXZ<"svcvt_f32[_s64]", "MMPd", "MPd", "l", "aarch64_sve_scvtf_f32i64">;
// svcvt_f64_s##
defm SVFCVTZS_F64_S32 : SInstCvtMXZ<"svcvt_f64[_s32]", "ddPt", "dPt", "d", "aarch64_sve_scvtf_f64i32">;
-defm SVFCVTZS_F64_S64 : SInstCvtMXZ<"svcvt_f64[_s64]", "NNPd", "NPd", "l", "aarch64_sve_scvtf", [IsOverloadCvt]>;
+defm SVFCVTZS_F64_S64 : SInstCvtMXZ<"svcvt_f64[_s64]", "NNPd", "NPd", "l", "aarch64_sve_scvtf", [IsOverloadFirstandLast]>;
// svcvt_f16_u##
-defm SVFCVTZU_F16_U16 : SInstCvtMXZ<"svcvt_f16[_u16]", "OOPd", "OPd", "Us", "aarch64_sve_ucvtf", [IsOverloadCvt]>;
+defm SVFCVTZU_F16_U16 : SInstCvtMXZ<"svcvt_f16[_u16]", "OOPd", "OPd", "Us", "aarch64_sve_ucvtf", [IsOverloadFirstandLast]>;
defm SVFCVTZU_F16_U32 : SInstCvtMXZ<"svcvt_f16[_u32]", "OOPd", "OPd", "Ui", "aarch64_sve_ucvtf_f16i32">;
defm SVFCVTZU_F16_U64 : SInstCvtMXZ<"svcvt_f16[_u64]", "OOPd", "OPd", "Ul", "aarch64_sve_ucvtf_f16i64">;
// svcvt_f32_u##
-defm SVFCVTZU_F32_U32 : SInstCvtMXZ<"svcvt_f32[_u32]", "MMPd", "MPd", "Ui", "aarch64_sve_ucvtf", [IsOverloadCvt]>;
+defm SVFCVTZU_F32_U32 : SInstCvtMXZ<"svcvt_f32[_u32]", "MMPd", "MPd", "Ui", "aarch64_sve_ucvtf", [IsOverloadFirstandLast]>;
defm SVFCVTZU_F32_U64 : SInstCvtMXZ<"svcvt_f32[_u64]", "MMPd", "MPd", "Ul", "aarch64_sve_ucvtf_f32i64">;
// svcvt_f64_u##
defm SVFCVTZU_F64_U32 : SInstCvtMXZ<"svcvt_f64[_u32]", "ddPz", "dPz", "d", "aarch64_sve_ucvtf_f64i32">;
-defm SVFCVTZU_F64_U64 : SInstCvtMXZ<"svcvt_f64[_u64]", "NNPd", "NPd", "Ul", "aarch64_sve_ucvtf", [IsOverloadCvt]>;
+defm SVFCVTZU_F64_U64 : SInstCvtMXZ<"svcvt_f64[_u64]", "NNPd", "NPd", "Ul", "aarch64_sve_ucvtf", [IsOverloadFirstandLast]>;
// svcvt_f16_f##
defm SVFCVT_F16_F32 : SInstCvtMXZ<"svcvt_f16[_f32]", "OOPd", "OPd", "f", "aarch64_sve_fcvt_f16f32">;
@@ -1190,22 +1190,22 @@ def SVSUDOT_LANE_S : SInst<"svsudot_lane[_s32]", "ddqbi", "i", MergeNone, "aarc
}
let SVETargetGuard = "f32mm", SMETargetGuard = InvalidMode in {
-def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>;
+def SVMLLA_F32 : SInst<"svmmla[_f32]", "dddd","f", MergeNone, "aarch64_sve_fmmla", [IsOverloadFirstandLast]>;
}
let SVETargetGuard = "f64mm", SMETargetGuard = InvalidMode in {
-def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>;
+def SVMLLA_F64 : SInst<"svmmla[_f64]", "dddd", "d", MergeNone, "aarch64_sve_fmmla", [IsOverloadFirstandLast]>;
let SVETargetGuard = "sve-f16f32mm", SMETargetGuard = InvalidMode in {
- def SVMLLA_F32_F16 : SInst<"svmmla[_f32_f16]", "ddhh", "f", MergeNone, "aarch64_sve_fmmla", [IsOverloadCvt]>;
+ def SVMLLA_F32_F16 : SInst<"svmmla[_f32_f16]", "ddhh", "f", MergeNone, "aarch64_sve_fmmla", [IsOverloadFirstandLast]>;
}
let SVETargetGuard = "sve2,f8f32mm", SMETargetGuard = InvalidMode in {
- def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmmla", [IsOverloadCvt]>;
+ def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmmla", [IsOverloadFirstandLast]>;
}
let SVETargetGuard = "sve2,f8f16mm", SMETargetGuard = InvalidMode in {
- def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmmla", [IsOverloadCvt]>;
+ def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmmla", [IsOverloadFirstandLast]>;
}
def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">;
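For reference, a minimal C usage sketch of the new widening F16->F32 form defined above (names are taken from the SVMLLA_F32_F16 def; the exact mangled IR name is an assumption based on the IsOverloadFirstandLast comment, i.e. {result type, last operand type}):

    #include <arm_sve.h>

    // Widening F16 -> F32 matrix multiply-accumulate, guarded by +sve,+sve-f16f32mm.
    svfloat32_t widen_mmla(svfloat32_t acc, svfloat16_t a, svfloat16_t b) {
      // Expected to lower to something like
      // @llvm.aarch64.sve.fmmla.nxv4f32.nxv8f16(acc, a, b).
      return svmmla_f32_f16(acc, a, b);
    }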
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td
index 13e7cf45471c2..7e60e87b12a4d 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -214,7 +214,7 @@ def IsZExtReturn : FlagType<0x00080000>; // Return value is s
def IsOverloadNone : FlagType<0x00100000>; // Intrinsic does not take any overloaded types.
def IsOverloadWhileOrMultiVecCvt : FlagType<0x00200000>; // Use {default type, typeof(operand1)} as overloaded types.
def IsOverloadWhileRW : FlagType<0x00400000>; // Use {pred(default type), typeof(operand0)} as overloaded types.
-def IsOverloadCvt : FlagType<0x00800000>; // Use {typeof(operand0), typeof(last operand)} as overloaded types.
+def IsOverloadFirstandLast : FlagType<0x00800000>; // Use {typeof(operand0), typeof(last operand)} as overloaded types.
def OverloadKindMask : FlagType<0x00E00000>; // When the masked values are all '0', the default type is used as overload type.
def IsByteIndexed : FlagType<0x01000000>;
def IsAppendSVALL : FlagType<0x02000000>; // Appends SV_ALL as the last operand.
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 60f9b86333670..51b9257b8e7a0 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -4427,7 +4427,7 @@ CodeGenFunction::getSVEOverloadTypes(const SVETypeFlags &TypeFlags,
if (TypeFlags.isOverloadWhileRW())
return {getSVEPredType(TypeFlags), Ops[0]->getType()};
- if (TypeFlags.isOverloadCvt())
+ if (TypeFlags.isOverloadFirstandLast())
return {Ops[0]->getType(), Ops.back()->getType()};
if (TypeFlags.isReductionQV() && !ResultType->isScalableTy() &&
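As a worked example of what the renamed flag selects (a sketch only; the prototype is the standard ACLE svcvt one, and the mangled intrinsic name is inferred from the flag's "{typeof(operand0), typeof(last operand)}" comment):

    #include <arm_sve.h>

    // IsOverloadFirstandLast picks the overload types from the first and last
    // operands of the builtin call: here {nxv8i16, nxv8f16}.
    svint16_t narrow_convert(svint16_t inactive, svbool_t pg, svfloat16_t op) {
      // Expected to lower to @llvm.aarch64.sve.fcvtzs.nxv8i16.nxv8f16.
      return svcvt_s16_f16_m(inactive, pg, op);
    }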
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 372116475987c..26e5a410566d0 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2813,7 +2813,7 @@ def int_aarch64_sve_fmmla
def int_aarch64_sve_fp8_fmmla
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>],
+ [LLVMMatchType<0>, llvm_anyvector_ty, llvm_nxv16i8_ty],
[IntrReadMem, IntrInaccessibleMemOnly]>;
//
>From 51320924962893757751d4911d9abe2400060cdd Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Mon, 17 Nov 2025 16:07:22 +0000
Subject: [PATCH 5/6] clang format
---
clang/include/clang/Basic/TargetBuiltins.h | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
index ab380619dce20..d5e8299f8b080 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -397,7 +397,9 @@ namespace clang {
}
bool isOverloadDefault() const { return !(Flags & OverloadKindMask); }
bool isOverloadWhileRW() const { return Flags & IsOverloadWhileRW; }
- bool isOverloadFirstandLast() const { return Flags & IsOverloadFirstandLast; }
+ bool isOverloadFirstandLast() const {
+ return Flags & IsOverloadFirstandLast;
+ }
bool isPrefetch() const { return Flags & IsPrefetch; }
bool isReverseCompare() const { return Flags & ReverseCompare; }
bool isAppendSVALL() const { return Flags & IsAppendSVALL; }
>From 5052f47040a943f1d0fc62c9a56cdbc6e326793d Mon Sep 17 00:00:00 2001
From: Amichaxx <amina.chabane at arm.com>
Date: Thu, 20 Nov 2025 11:38:01 +0000
Subject: [PATCH 6/6] fp8_fmmla changes
---
clang/include/clang/Basic/arm_sve.td | 4 ++--
.../CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c | 4 ++--
.../CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c | 4 ++--
llvm/include/llvm/IR/IntrinsicsAArch64.td | 2 +-
4 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index 84c7018bcb226..8e2ec5c974be3 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -1201,11 +1201,11 @@ let SVETargetGuard = "sve-f16f32mm", SMETargetGuard = InvalidMode in {
}
let SVETargetGuard = "sve2,f8f32mm", SMETargetGuard = InvalidMode in {
- def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmmla", [IsOverloadFirstandLast]>;
+ def SVMLLA_F32_MF8 : SInst<"svmmla[_f32_mf8]", "dd~~>", "f", MergeNone, "aarch64_sve_fp8_fmmla">;
}
let SVETargetGuard = "sve2,f8f16mm", SMETargetGuard = InvalidMode in {
- def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmmla", [IsOverloadFirstandLast]>;
+ def SVMLLA_F16_MF8 : SInst<"svmmla[_f16_mf8]", "dd~~>", "h", MergeNone, "aarch64_sve_fp8_fmmla">;
}
def SVTRN1Q : SInst<"svtrn1q[_{d}]", "ddd", "csilUcUsUiUlhfdb", MergeNone, "aarch64_sve_trn1q">;
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
index 3baa4598cfc2b..5054821d7168e 100644
--- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f16mf8.c
@@ -20,14 +20,14 @@
// CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmmla.nxv8f16.nxv16i8(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmmla.nxv8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z11test_f16mf8u13__SVFloat16_tu13__SVMfloat8_tS0_m(
// CPP-CHECK-SAME: <vscale x 8 x half> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmmla.nxv8f16.nxv16i8(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.fp8.fmmla.nxv8f16(<vscale x 8 x half> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CPP-CHECK-NEXT: ret <vscale x 8 x half> [[TMP0]]
//
svfloat16_t test_f16mf8(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
diff --git a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
index 5e9469c705c43..e1f52e1b423c5 100644
--- a/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
+++ b/clang/test/CodeGen/AArch64/sve2-intrinsics/acle_sve2_fmmla-f32mf8.c
@@ -21,14 +21,14 @@
// CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: [[ENTRY:.*:]]
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmmla.nxv4f32.nxv16i8(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmmla.nxv4f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
//
// CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z11test_f32mf8u13__SVFloat32_tu13__SVMfloat8_tS0_m(
// CPP-CHECK-SAME: <vscale x 4 x float> [[ACC:%.*]], <vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
// CPP-CHECK-NEXT: [[ENTRY:.*:]]
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmmla.nxv4f32.nxv16i8(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.fp8.fmmla.nxv4f32(<vscale x 4 x float> [[ACC]], <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
// CPP-CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
//
svfloat32_t test_f32mf8(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 26e5a410566d0..9806b2f8811ed 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2813,7 +2813,7 @@ def int_aarch64_sve_fmmla
def int_aarch64_sve_fp8_fmmla
: DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, llvm_anyvector_ty, llvm_nxv16i8_ty],
+ [LLVMMatchType<0>, llvm_nxv16i8_ty, llvm_nxv16i8_ty],
[IntrReadMem, IntrInaccessibleMemOnly]>;
//
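With both fp8 source operands now fixed to nxv16i8, only the result type is overloaded, which matches the shorter mangled names in the updated CHECK lines. A minimal C sketch of the corresponding source-level calls (the _fpm suffix on the ACLE names is an assumption following the usual convention for fpm_t-taking intrinsics; the test signatures are taken from the files above):

    #include <arm_sve.h>

    // FP8 -> F16 widening FMMLA; FPMR is set from the fpm_t argument before the call.
    svfloat16_t mmla_f16(svfloat16_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
      // Expected lowering: @llvm.aarch64.set.fpmr(fpmr) followed by
      // @llvm.aarch64.sve.fp8.fmmla.nxv8f16(acc, a, b).
      return svmmla_f16_mf8_fpm(acc, a, b, fpmr);
    }

    // FP8 -> F32 widening FMMLA, lowering to @llvm.aarch64.sve.fp8.fmmla.nxv4f32.
    svfloat32_t mmla_f32(svfloat32_t acc, svmfloat8_t a, svmfloat8_t b, fpm_t fpmr) {
      return svmmla_f32_mf8_fpm(acc, a, b, fpmr);
    }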