[clang] [llvm] [AArch64][SME] Split FP8 FTMOPA intrinsics (PR #203310)
via cfe-commits
cfe-commits at lists.llvm.org
Thu Jun 11 23:31:54 PDT 2026
https://github.com/CarolineConcatto updated https://github.com/llvm/llvm-project/pull/203310
From 001a36e5548eac09bd160014ed03a5ab9f5ae29d Mon Sep 17 00:00:00 2001
From: CarolineConcatto <caroline.concatto at arm.com>
Date: Thu, 11 Jun 2026 12:56:31 +0000
Subject: [PATCH] [AArch64][SME] Split FP8 FTMOPA intrinsics
Introduce separate FP8 FTMOPA intrinsics for ZA16 and ZA32:
llvm.aarch64.sme.fp8.ftmopa.za16
llvm.aarch64.sme.fp8.ftmopa.za32
The FP8 FTMOPA forms need to model their FPMR dependency, so they
should not share the same intrinsic definitions as the non-FP8 FTMOPA
forms.
Update the Clang SME builtin definitions and AArch64 instruction
patterns to use the new intrinsics, and add AutoUpgrade support for the
previous FP8-shaped llvm.aarch64.sme.ftmopa.* spellings so existing IR and
bitcode continue to work.
This was split out from #154144 because the intrinsic upgrade needs to be
handled separately to avoid breaking existing bitcode.
---
clang/include/clang/Basic/arm_sme.td | 4 +-
.../AArch64/sme2-intrinsics/acle_sme2_tmop.c | 8 ++--
llvm/include/llvm/IR/IntrinsicsAArch64.td | 14 +++++++
llvm/lib/IR/AutoUpgrade.cpp | 15 +++++++
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 4 +-
.../upgrade-sme2-fp8-intrinsics-tmop.ll | 42 +++++++++++++++++++
.../CodeGen/AArch64/sme2-intrinsics-tmop.ll | 4 +-
7 files changed, 81 insertions(+), 10 deletions(-)
create mode 100644 llvm/test/Bitcode/upgrade-sme2-fp8-intrinsics-tmop.ll
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 032c588966032..5137e968bec55 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -915,11 +915,11 @@ let SMETargetGuard = "sme2,sme-tmop,sme-b16b16" in {
}
let SMETargetGuard = "sme2,sme-tmop,sme-f8f16" in {
- def SVTMOPA_ZA16_FPM : Inst<"svtmopa_lane_za16[_{d}_{d}]", "vi2.dd[i>", "m", MergeNone, "aarch64_sme_ftmopa_za16", [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_1>, ImmCheck<4, ImmCheck0_3>]>;
+ def SVTMOPA_ZA16_FPM : Inst<"svtmopa_lane_za16[_{d}_{d}]", "vi2.dd[i>", "m", MergeNone, "aarch64_sme_fp8_ftmopa_za16", [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<0, ImmCheck0_1>, ImmCheck<4, ImmCheck0_3>]>;
}
let SMETargetGuard = "sme2,sme-tmop,sme-f8f32" in {
- def SVTMOPA_ZA32_FPM : Inst<"svtmopa_lane_za32[_{d}_{d}]", "vi2.dd[i>", "m", MergeNone, "aarch64_sme_ftmopa_za32", [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>, ImmCheck<4, ImmCheck0_3>]>;
+ def SVTMOPA_ZA32_FPM : Inst<"svtmopa_lane_za32[_{d}_{d}]", "vi2.dd[i>", "m", MergeNone, "aarch64_sme_fp8_ftmopa_za32", [IsStreaming, IsInOutZA, IsOverloadNone], [ImmCheck<0, ImmCheck0_3>, ImmCheck<4, ImmCheck0_3>]>;
}
multiclass ZAReadz<string n_suffix, string vg_num, string t, string i_prefix, list<ImmCheck> ch> {
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_tmop.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_tmop.c
index 55d0074663bc9..d68a465e092c6 100644
--- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_tmop.c
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_tmop.c
@@ -172,13 +172,13 @@ void test_svtmopa_lane_za16_bf16_bf16(svbfloat16x2_t zn, svbfloat16_t zm, svuint
// CHECK-LABEL: @test_svtmopa_lane_za16_mf8_mf8_fpm(
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za16.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.ftmopa.za16(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z34test_svtmopa_lane_za16_mf8_mf8_fpm13svmfloat8x2_tu13__SVMfloat8_tu11__SVUint8_tm(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za16.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.ftmopa.za16(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
// CPP-CHECK-NEXT: ret void
//
void test_svtmopa_lane_za16_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8_t zm, svuint8_t zk, fpm_t fpmr) __arm_streaming __arm_inout("za") {
@@ -188,13 +188,13 @@ void test_svtmopa_lane_za16_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8_t zm, svuint
// CHECK-LABEL: @test_svtmopa_lane_za32_mf8_mf8_fpm(
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
-// CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za32.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.ftmopa.za32(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
// CHECK-NEXT: ret void
//
// CPP-CHECK-LABEL: @_Z34test_svtmopa_lane_za32_mf8_mf8_fpm13svmfloat8x2_tu13__SVMfloat8_tu11__SVUint8_tm(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
-// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ftmopa.za32.nxv16i8(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.ftmopa.za32(i32 1, <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], <vscale x 16 x i8> [[ZK:%.*]], i32 3)
// CPP-CHECK-NEXT: ret void
//
void test_svtmopa_lane_za32_mf8_mf8_fpm(svmfloat8x2_t zn, svmfloat8_t zm, svuint8_t zk, fpm_t fpmr) __arm_streaming __arm_inout("za") {
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index ba0d7c02bf427..6cb96a635f87c 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3142,6 +3142,20 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_sme_sutmopa_za32 : SME_OuterProduct_TMOP_Intrinsic;
def int_aarch64_sme_ustmopa_za32 : SME_OuterProduct_TMOP_Intrinsic;
+ class SME_FP8_OuterProduct_TMOP_Intrinsic // FP8 FTMOPA signature: every operand type is concrete, so the intrinsic name carries no overload suffix.
+ : DefaultAttrsIntrinsic<[], // No return value.
+ [llvm_i32_ty, // Arg 0: immediate (see ImmArg below); presumably the ZA tile number - TODO confirm.
+ llvm_nxv16i8_ty, // Arg 1: <vscale x 16 x i8> vector (passed as %zn1 in the tests of this patch).
+ llvm_nxv16i8_ty, // Arg 2: <vscale x 16 x i8> vector (passed as %zn2 in the tests of this patch).
+ llvm_nxv16i8_ty, // Arg 3: <vscale x 16 x i8> vector (passed as %zm in the tests of this patch).
+ llvm_nxv16i8_ty, // Arg 4: <vscale x 16 x i8> vector (passed as %zk in the tests of this patch).
+ llvm_i32_ty], // Arg 5: immediate (see ImmArg below); presumably the index applied to %zk - TODO confirm.
+ [ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<5>>, // Arguments 0 and 5 must be compile-time constants.
+ IntrInaccessibleMemOnly]>; // NOTE(review): presumably stands in for the ZA/FPMR state the instruction touches - confirm.
+
+ def int_aarch64_sme_fp8_ftmopa_za16 : SME_FP8_OuterProduct_TMOP_Intrinsic;
+ def int_aarch64_sme_fp8_ftmopa_za32 : SME_FP8_OuterProduct_TMOP_Intrinsic;
+
// 16 and 32 bit multi-vector floating point 8 Quarter Tile Quarter Product
foreach za = ["za16", "za32"] in {
def int_aarch64_sme_fp8_fmop4a_ # za # "_1x1" : SME_OuterProduct_QuarterTile_Single_Single;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 0770f0f0ff060..74b01200c064b 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -970,6 +970,21 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
}
} else {
// 'aarch64.*'.
+ if (Name.consume_front("sme.ftmopa.")) {
+ // The FP8 FTMOPA intrinsics were split out from the non-FP8 FTMOPA
+ // intrinsics to model their FPMR dependency.
+ Intrinsic::ID ID =
+ StringSwitch<Intrinsic::ID>(Name)
+ .Case("za16.nxv16i8", Intrinsic::aarch64_sme_fp8_ftmopa_za16)
+ .Case("za32.nxv16i8", Intrinsic::aarch64_sme_fp8_ftmopa_za32)
+ .Default(Intrinsic::not_intrinsic);
+ if (ID != Intrinsic::not_intrinsic) {
+ NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID);
+ return true;
+ }
+ return false; // No other 'aarch64.sme.ftmopa.*'.
+ }
+
if (Neon) {
// 'aarch64.neon.*'.
Intrinsic::ID ID = StringSwitch<Intrinsic::ID>(Name)
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 022fed6473486..5a7e4f22a7c0c 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -224,11 +224,11 @@ let Predicates = [HasSME_TMOP, HasSMEB16B16] in {
}
let Predicates = [HasSME_TMOP, HasSMEF8F16] in {
- defm FTMOPA_M2ZZZI_BtoH : sme_tmopa_16b<0b01001, ZZ_b_mul_r, ZPR8, nxv16i8, "ftmopa", int_aarch64_sme_ftmopa_za16, [FPMR, FPCR]>;
+ defm FTMOPA_M2ZZZI_BtoH : sme_tmopa_16b<0b01001, ZZ_b_mul_r, ZPR8, nxv16i8, "ftmopa", int_aarch64_sme_fp8_ftmopa_za16, [FPMR, FPCR]>;
}
let Predicates = [HasSME_TMOP, HasSMEF8F32] in {
- defm FTMOPA_M2ZZZI_BtoS : sme_tmopa_32b<0b01000, ZZ_b_mul_r, ZPR8, nxv16i8, "ftmopa", int_aarch64_sme_ftmopa_za32, [FPMR, FPCR]>;
+ defm FTMOPA_M2ZZZI_BtoS : sme_tmopa_32b<0b01000, ZZ_b_mul_r, ZPR8, nxv16i8, "ftmopa", int_aarch64_sme_fp8_ftmopa_za32, [FPMR, FPCR]>;
}
let Predicates = [HasSME] in {
diff --git a/llvm/test/Bitcode/upgrade-sme2-fp8-intrinsics-tmop.ll b/llvm/test/Bitcode/upgrade-sme2-fp8-intrinsics-tmop.ll
new file mode 100644
index 0000000000000..de9a6b69bd219
--- /dev/null
+++ b/llvm/test/Bitcode/upgrade-sme2-fp8-intrinsics-tmop.ll
@@ -0,0 +1,42 @@
+; RUN: opt -S < %s | FileCheck %s
+; RUN: llvm-as %s -o - | llvm-dis | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define void @ftmopa_za16_nxv16i8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk) #0 {
+; CHECK-LABEL: @ftmopa_za16_nxv16i8
+; CHECK: call void @llvm.aarch64.sme.fp8.ftmopa.za16(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk, i32 0)
+ call void @llvm.aarch64.sme.ftmopa.za16.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk, i32 0) ; Old FP8-shaped za16 spelling (with .nxv16i8 suffix); expected to be auto-upgraded.
+ ret void
+}
+
+define void @ftmopa_za32_nxv16i8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk) #0 {
+; CHECK-LABEL: @ftmopa_za32_nxv16i8
+; CHECK: call void @llvm.aarch64.sme.fp8.ftmopa.za32(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk, i32 0)
+ call void @llvm.aarch64.sme.ftmopa.za32.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk, i32 0) ; Old FP8-shaped za32 spelling (with .nxv16i8 suffix); expected to be auto-upgraded.
+ ret void
+}
+
+
+define void @ftmopa_za16(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk) #0 {
+; CHECK-LABEL: @ftmopa_za16(
+; CHECK: call void @llvm.aarch64.sme.fp8.ftmopa.za16(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk, i32 0)
+ call void @llvm.aarch64.sme.ftmopa.za16(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk, i32 0) ; Old za16 spelling written without the overload suffix; the test expects it to end up as the FP8 intrinsic too.
+ ret void
+}
+
+define void @ftmopa_za32(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk) #0 {
+; CHECK-LABEL: @ftmopa_za32(
+; CHECK: call void @llvm.aarch64.sme.fp8.ftmopa.za32(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk, i32 0)
+ call void @llvm.aarch64.sme.ftmopa.za32(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk, i32 0) ; Old za32 spelling written without the overload suffix; the test expects it to end up as the FP8 intrinsic too.
+ ret void
+}
+
+define void @ftmopa_za32_wrong(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk) #0 {
+; CHECK-LABEL: @ftmopa_za32_wrong
+; CHECK: call void @llvm.aarch64.sme.fp8.ftmopa.za32(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk, i32 0)
+ call void @llvm.aarch64.sme.ftmopa.za32.nxv8i16(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk, i32 0) ; The .nxv8i16 suffix deliberately disagrees with the nxv16i8 operands; the test still expects the FP8 intrinsic.
+ ret void
+}
+
+attributes #0 = {nounwind "target-features" = "+sme2,+sme-tmop,+sme-f8f16,+sme-f8f32" }
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-tmop.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-tmop.ll
index e918137bee27d..4b8615cc0ca00 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-tmop.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-tmop.ll
@@ -119,7 +119,7 @@ define void @ftmopa_za16_f8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <v
; CHECK-NEXT: mov z28.d, z3.d
; CHECK-NEXT: ftmopa za0.h, { z0.b, z1.b }, z2.b, z28[0]
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.ftmopa.za16.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk, i32 0)
+ call void @llvm.aarch64.sme.fp8.ftmopa.za16(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk, i32 0)
ret void
}
@@ -129,7 +129,7 @@ define void @ftmopa_za32_f8(<vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <v
; CHECK-NEXT: mov z28.d, z3.d
; CHECK-NEXT: ftmopa za0.s, { z0.b, z1.b }, z2.b, z28[0]
; CHECK-NEXT: ret
- call void @llvm.aarch64.sme.ftmopa.za32.nxv16i8(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk, i32 0)
+ call void @llvm.aarch64.sme.fp8.ftmopa.za32(i32 0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm, <vscale x 16 x i8> %zk, i32 0)
ret void
}
More information about the cfe-commits
mailing list