[clang] [llvm] [AArch64][llvm] Add support for Neon vmmlaq_{f16,f32}_mf8_fpm intrinsics (PR #165431)
Jonathan Thackray via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 29 07:17:33 PDT 2025
https://github.com/jthackray updated https://github.com/llvm/llvm-project/pull/165431
>From ad6aadb9fac080c164a71b8552c79719e3965c4d Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Mon, 27 Oct 2025 16:23:50 +0000
Subject: [PATCH 1/2] [AArch64][llvm] Add support for vmmlaq_[f16,f32]_mf8
intrinsics
Add support for the following new intrinsics:
```
float16x8_t vmmlaq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, fpm_t);
float32x4_t vmmlaq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t);
```
---
clang/include/clang/Basic/arm_neon.td | 8 ++++++
clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 8 ++++++
.../CodeGen/AArch64/v8.6a-neon-intrinsics.c | 27 ++++++++++++++++++-
llvm/include/llvm/IR/IntrinsicsAArch64.td | 6 +++++
4 files changed, 48 insertions(+), 1 deletion(-)
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ef196103035e8..8e2174c880ed8 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -1896,6 +1896,14 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "f
def VFMLSL_LANEQ_HIGH : SOpInst<"vfmlsl_laneq_high", "(F>)(F>)F(FQ)I", "hQh", OP_FMLSL_LN_Hi>;
}
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f16mm,neon" in {
+ def VMMLA_F16_MF8 : VInst<"vmmla_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "f8f32mm,neon" in {
+ def VMMLA_F32_MF8 : VInst<"vmmla_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+}
+
let TargetGuard = "i8mm,neon" in {
def VMMLA : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 60f9b86333670..4075f56e6a032 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -7793,6 +7793,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
}
+ case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
+ return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
+ {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
+ "fmmla");
+ case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
+ return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
+ {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+ "fmmla");
case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
ExtractLow = true;
[[fallthrough]];
diff --git a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
index 6fffcb6c6b391..0d592af59f85c 100644
--- a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
@@ -1,5 +1,5 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm -target-feature +f8f16mm -target-feature +f8f32mm -target-feature +fp8 \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg,sroa \
// RUN: | FileCheck %s
@@ -32,6 +32,31 @@ uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
return vmmlaq_u32(r, a, b);
}
+// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
+// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT: ret <8 x half> [[FMMLA1_I]]
+//
+float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+ return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
+// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT: [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
+// CHECK-NEXT: ret <4 x float> [[FMMLA_I]]
+//
+float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+ return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
+}
+
// CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32(
// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b81edc385cd43..78a60e839775e 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -217,6 +217,11 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
: DefaultAttrsIntrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
[IntrNoMem]>;
+
+ class AdvSIMD_MatMul_fpm_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty],
+ [IntrNoMem]>;
}
// Arithmetic ops
@@ -499,6 +504,7 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
+ def int_aarch64_neon_fmmla : AdvSIMD_MatMul_fpm_Intrinsic;
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfmmla
>From 525dd5fbadc20a2102e1d70f0ba877f3bae0b429 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Wed, 29 Oct 2025 14:15:29 +0000
Subject: [PATCH 2/2] fixup! [AArch64][llvm] Add support for
vmmlaq_[f16,f32]_mf8 intrinsics
Fix CR comments; don't create a new intrinsic, and split test files
---
clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 4 +-
.../CodeGen/AArch64/v8.6a-neon-intrinsics.c | 27 +------------
.../CodeGen/AArch64/v9.6a-neon-intrinsics.c | 39 +++++++++++++++++++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 7 +---
4 files changed, 43 insertions(+), 34 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/v9.6a-neon-intrinsics.c
diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
index 4075f56e6a032..6a931992f0cb3 100644
--- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp
@@ -7795,11 +7795,11 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
}
case NEON::BI__builtin_neon_vmmlaq_f16_mf8_fpm:
return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
- {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
+ {llvm::FixedVectorType::get(HalfTy, 8), llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
"fmmla");
case NEON::BI__builtin_neon_vmmlaq_f32_mf8_fpm:
return EmitFP8NeonCall(Intrinsic::aarch64_neon_fmmla,
- {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+ {llvm::FixedVectorType::get(FloatTy, 4), llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
"fmmla");
case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
ExtractLow = true;
diff --git a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
index 0d592af59f85c..6fffcb6c6b391 100644
--- a/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
+++ b/clang/test/CodeGen/AArch64/v8.6a-neon-intrinsics.c
@@ -1,5 +1,5 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm -target-feature +f8f16mm -target-feature +f8f32mm -target-feature +fp8 \
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
// RUN: -disable-O0-optnone -emit-llvm -o - %s \
// RUN: | opt -S -passes=mem2reg,sroa \
// RUN: | FileCheck %s
@@ -32,31 +32,6 @@ uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
return vmmlaq_u32(r, a, b);
}
-// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
-// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
-// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
-// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
-// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
-// CHECK-NEXT: [[FMMLA1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16(<8 x half> [[FMMLA_I]], <16 x i8> [[P1]], <16 x i8> [[P2]])
-// CHECK-NEXT: ret <8 x half> [[FMMLA1_I]]
-//
-float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
- return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
-}
-
-// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
-// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
-// CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
-// CHECK-NEXT: [[FMMLA_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32(<4 x float> [[P0]], <16 x i8> [[P1]], <16 x i8> [[P2]])
-// CHECK-NEXT: ret <4 x float> [[FMMLA_I]]
-//
-float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
- return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
-}
-
// CHECK-LABEL: define dso_local <4 x i32> @test_vusmmlaq_s32(
// CHECK-SAME: <4 x i32> noundef [[R:%.*]], <16 x i8> noundef [[A:%.*]], <16 x i8> noundef [[B:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: [[ENTRY:.*:]]
diff --git a/clang/test/CodeGen/AArch64/v9.6a-neon-intrinsics.c b/clang/test/CodeGen/AArch64/v9.6a-neon-intrinsics.c
new file mode 100644
index 0000000000000..b88a22701495f
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/v9.6a-neon-intrinsics.c
@@ -0,0 +1,39 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +v9.6a -target-feature +f8f16mm -target-feature +f8f32mm -target-feature +fp8 \
+// RUN: -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -passes=mem2reg,sroa \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vmmlaq_f16_mf8(
+// CHECK-SAME: <8 x half> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[P0]] to <8 x i16>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i16> [[TMP0]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[TMP1]] to <8 x half>
+// CHECK-NEXT: [[FMMLA1_I:%.*]] = bitcast <16 x i8> [[P1]] to <8 x half>
+// CHECK-NEXT: [[FMMLA2_I:%.*]] = bitcast <16 x i8> [[P2]] to <8 x half>
+// CHECK-NEXT: [[FMMLA3_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fmmla.v8f16.v8f16(<8 x half> [[FMMLA_I]], <8 x half> [[FMMLA1_I]], <8 x half> [[FMMLA2_I]])
+// CHECK-NEXT: ret <8 x half> [[FMMLA3_I]]
+//
+float16x8_t test_vmmlaq_f16_mf8(float16x8_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+ return vmmlaq_f16_mf8_fpm(p0, p1, p2, p3);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmmlaq_f32_mf8(
+// CHECK-SAME: <4 x float> noundef [[P0:%.*]], <16 x i8> [[P1:%.*]], <16 x i8> [[P2:%.*]], i64 noundef [[P3:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[P3]])
+// CHECK-NEXT: [[FMMLA_I:%.*]] = bitcast <16 x i8> [[P1]] to <4 x float>
+// CHECK-NEXT: [[FMMLA1_I:%.*]] = bitcast <16 x i8> [[P2]] to <4 x float>
+// CHECK-NEXT: [[FMMLA2_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fmmla.v4f32.v4f32(<4 x float> [[P0]], <4 x float> [[FMMLA_I]], <4 x float> [[FMMLA1_I]])
+// CHECK-NEXT: ret <4 x float> [[FMMLA2_I]]
+//
+float32x4_t test_vmmlaq_f32_mf8(float32x4_t p0, mfloat8x16_t p1, mfloat8x16_t p2, fpm_t p3) {
+ return vmmlaq_f32_mf8_fpm(p0, p1, p2, p3);
+}
+
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 78a60e839775e..4cab6e05ba79f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -217,11 +217,6 @@ let TargetPrefix = "aarch64" in { // All intrinsics start with "llvm.aarch64.".
: DefaultAttrsIntrinsic<[llvm_v4f32_ty],
[llvm_v4f32_ty, llvm_v8bf16_ty, llvm_v8bf16_ty],
[IntrNoMem]>;
-
- class AdvSIMD_MatMul_fpm_Intrinsic
- : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, llvm_v16i8_ty, llvm_v16i8_ty],
- [IntrNoMem]>;
}
// Arithmetic ops
@@ -504,7 +499,7 @@ let TargetPrefix = "aarch64" in {
def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic;
- def int_aarch64_neon_fmmla : AdvSIMD_MatMul_fpm_Intrinsic;
+ def int_aarch64_neon_fmmla : AdvSIMD_MatMul_Intrinsic;
def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfdot : AdvSIMD_Dot_Intrinsic;
def int_aarch64_neon_bfmmla
More information about the llvm-commits
mailing list