[clang] [llvm] [AArch64] Add intrinsics for SME FP8 FDOT single and multi instructions (PR #119845)

Fri Dec 13 01:44:25 PST 2024

llvmbot wrote:




@llvm/pr-subscribers-clang

Author: Jonathan Thackray (jthackray)

<details>
<summary>Changes</summary>

Add support for the following SME 8 bit floating-point dot-product intrinsics:

* svdot_single_za16_mf8_vg1x2_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm);
* svdot_single_za16_mf8_vg1x4_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm);
* svdot_single_za32_mf8_vg1x2_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm);
* svdot_single_za32_mf8_vg1x4_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm);
* svdot_za16_mf8_vg1x2_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm);
* svdot_za16_mf8_vg1x4_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm);
* svdot_za32_mf8_vg1x2_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm);
* svdot_za32_mf8_vg1x4_fpm(uint32_t slice, svmfloat8x2_t f8x2, svmfloat8_t f8, fpm_t fpm);

---

Patch is 38.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/119845.diff


7 Files Affected:

- (modified) clang/include/clang/Basic/arm_sme.td (+12) 
- (modified) clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c (+167-7) 
- (modified) clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_fp8_fdot.c (+16) 
- (modified) llvm/include/llvm/IR/IntrinsicsAArch64.td (+36) 
- (modified) llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td (+8-9) 
- (modified) llvm/lib/Target/AArch64/SMEInstrFormats.td (+64) 
- (modified) llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fdot.ll (+112) 


``````````diff

diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 0fae70866cd55e..e7625d8d3e0b8a 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -748,11 +748,23 @@ let SMETargetGuard = "sme2" in {
 let SMETargetGuard = "sme-f8f32" in {
   def SVDOT_LANE_FP8_ZA32_VG1x2 : Inst<"svdot_lane_za32[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>;
   def SVDOT_LANE_FP8_ZA32_VG1x4 : Inst<"svdot_lane_za32[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_3>]>;
+
+  def SVDOT_SINGLE_FP8_ZA32_VG1x2 : Inst<"svdot[_single]_za32[_mf8]_vg1x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+  def SVDOT_SINGLE_FP8_ZA32_VG1x4 : Inst<"svdot[_single]_za32[_mf8]_vg1x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+
+  def SVDOT_MULTI_FP8_ZA32_VG1x2 : Inst<"svdot_za32[_mf8]_vg1x2_fpm", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za32_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+  def SVDOT_MULTI_FP8_ZA32_VG1x4 : Inst<"svdot_za32[_mf8]_vg1x4_fpm", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za32_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
 }
 
 let SMETargetGuard = "sme-f8f16" in {
   def SVDOT_LANE_FP8_ZA16_VG1x2 : Inst<"svdot_lane_za16[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
   def SVDOT_LANE_FP8_ZA16_VG1x4 : Inst<"svdot_lane_za16[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
+
+  def SVDOT_SINGLE_FP8_ZA16_VG1x2 : Inst<"svdot[_single]_za16[_mf8]_vg1x2_fpm", "vm2d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+  def SVDOT_SINGLE_FP8_ZA16_VG1x4 : Inst<"svdot[_single]_za16[_mf8]_vg1x4_fpm", "vm4d>", "m", MergeNone, "aarch64_sme_fp8_fdot_single_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+
+  def SVDOT_MULTI_FP8_ZA16_VG1x2 : Inst<"svdot_za16[_mf8]_vg1x2_fpm", "vm22>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
+  def SVDOT_MULTI_FP8_ZA16_VG1x4 : Inst<"svdot_za16[_mf8]_vg1x4_fpm", "vm44>", "m", MergeNone, "aarch64_sme_fp8_fdot_multi_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], []>;
 }
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
index 74d18c32d5b3ab..a151d162e01085 100644
--- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
@@ -1,18 +1,18 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 // REQUIRES: aarch64-registered-target
-#include <arm_sme.h>
 
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature +sme-f8f32 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
 #include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
-#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED,A5) A1##A3##A5
 #else
-#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#define SVE_ACLE_FUNC(A1,A2,A3,A4,A5) A1##A2##A3##A4##A5
 #endif
 
 // CHECK-LABEL: define dso_local void @test_svdot_lane_za32_f8_vg1x2(
@@ -32,7 +32,7 @@
 void test_svdot_lane_za32_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn,
                                    svmfloat8_t zm, fpm_t fpmr)
      __arm_streaming __arm_inout("za") {
-  SVE_ACLE_FUNC(svdot_lane_za32,_mf8,_vg1x2_fpm)(slice, zn, zm, 3, fpmr);
+  SVE_ACLE_FUNC(svdot_lane_za32,_mf8,_vg1x2_fpm,,)(slice, zn, zm, 3, fpmr);
 }
 
 // CHECK-LABEL: define dso_local void @test_svdot_lane_za32_f8_vg1x4(
@@ -52,7 +52,7 @@ void test_svdot_lane_za32_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn,
 void test_svdot_lane_za32_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn,
                                    svmfloat8_t zm, fpm_t fpmr)
      __arm_streaming __arm_inout("za") {
-  SVE_ACLE_FUNC(svdot_lane_za32,_mf8,_vg1x4_fpm)(slice, zn, zm, 3, fpmr);
+  SVE_ACLE_FUNC(svdot_lane_za32,_mf8,_vg1x4_fpm,,)(slice, zn, zm, 3, fpmr);
 }
 
 // CHECK-LABEL: define dso_local void @test_svdot_lane_za16_f8_vg1x2(
@@ -72,7 +72,7 @@ void test_svdot_lane_za32_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn,
 void test_svdot_lane_za16_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn,
                                    svmfloat8_t zm, fpm_t fpmr)
      __arm_streaming __arm_inout("za") {
-  SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x2_fpm)(slice, zn, zm, 3, fpmr);
+  SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x2_fpm,,)(slice, zn, zm, 3, fpmr);
 }
 
 // CHECK-LABEL: define dso_local void @test_svdot_lane_za16_f8_vg1x4(
@@ -92,5 +92,165 @@ void test_svdot_lane_za16_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn,
 void test_svdot_lane_za16_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn,
                                    svmfloat8_t zm, fpm_t fpmr)
      __arm_streaming __arm_inout("za") {
-  SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x4_fpm)(slice, zn, zm, 3, fpmr);
+  SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x4_fpm,,)(slice, zn, zm, 3, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svdot_single_za32_f8_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fdot.single.za32.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z31test_svdot_single_za32_f8_vg1x2j13svmfloat8x2_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fdot.single.za32.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svdot_single_za32_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn,
+                                     svmfloat8_t zm, fpm_t fpmr)
+     __arm_streaming __arm_inout("za") {
+  SVE_ACLE_FUNC(svdot,_single,_za32,_mf8,_vg1x2_fpm)(slice, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svdot_single_za32_f8_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fdot.single.za32.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z31test_svdot_single_za32_f8_vg1x4j13svmfloat8x4_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fdot.single.za32.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svdot_single_za32_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn,
+                                     svmfloat8_t zm, fpm_t fpmr)
+     __arm_streaming __arm_inout("za") {
+  SVE_ACLE_FUNC(svdot,_single,_za32,_mf8,_vg1x4_fpm)(slice, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svdot_multi_za32_f8_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fdot.multi.za32.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z30test_svdot_multi_za32_f8_vg1x2j13svmfloat8x2_tS_m(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fdot.multi.za32.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svdot_multi_za32_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn,
+                                    svmfloat8x2_t zm, fpm_t fpmr)
+     __arm_streaming __arm_inout("za") {
+  SVE_ACLE_FUNC(svdot,,_za32,_mf8,_vg1x2_fpm) (slice, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svdot_multi_za32_f8_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE2:%.*]], <vscale x 16 x i8> [[ZM_COERCE3:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fdot.multi.za32.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE2]], <vscale x 16 x i8> [[ZM_COERCE3]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z30test_svdot_multi_za32_f8_vg1x4j13svmfloat8x4_tS_m(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE2:%.*]], <vscale x 16 x i8> [[ZM_COERCE3:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fdot.multi.za32.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE2]], <vscale x 16 x i8> [[ZM_COERCE3]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svdot_multi_za32_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn,
+                                    svmfloat8x4_t zm, fpm_t fpmr)
+     __arm_streaming __arm_inout("za") {
+  SVE_ACLE_FUNC(svdot,,_za32,_mf8,_vg1x4_fpm)(slice, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svdot_single_za16_f8_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fdot.single.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z31test_svdot_single_za16_f8_vg1x2j13svmfloat8x2_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fdot.single.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svdot_single_za16_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn,
+                                     svmfloat8_t zm, fpm_t fpmr)
+     __arm_streaming __arm_inout("za") {
+  SVE_ACLE_FUNC(svdot,_single,_za16,_mf8,_vg1x2_fpm)(slice, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svdot_single_za16_f8_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fdot.single.za16.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z31test_svdot_single_za16_f8_vg1x4j13svmfloat8x4_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fdot.single.za16.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svdot_single_za16_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn,
+                                     svmfloat8_t zm, fpm_t fpmr)
+     __arm_streaming __arm_inout("za") {
+  SVE_ACLE_FUNC(svdot,_single,_za16,_mf8,_vg1x4_fpm)(slice, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svdot_multi_za16_f8_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fdot.multi.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z30test_svdot_multi_za16_f8_vg1x2j13svmfloat8x2_tS_m(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT:  [[ENTRY:.*:]]
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fdot.multi.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]])
+// CPP-CHECK-NEXT:    ret void
+//
+void test_svdot_multi_za16_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn,
+                                    svmfloat8x2_t zm, fpm_t fpmr)
+     __arm_streaming __arm_inout("za") {
+  SVE_ACLE_FUNC(svdot,,_za16,_mf8,_vg1x2_fpm) (slice, zn, zm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local void @test_svdot_multi_za16_f8_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE2:%.*]], <vscale x 16 x i8> [[ZM_COERCE3:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    tail call void @llvm.aarch64.sme.fp8.fdot.multi.za16.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM_COERCE0]], <vscale x 16 x i8> [[ZM_COERCE1]], <vscale x 16 x i8> [[ZM_COERCE2]], <vscale x 16 x i8> [[ZM_COERCE3]])
+// CHECK-NEXT:    ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z30test_svdot_multi_za16_f8_vg1x4j13svmfloat8x4_tS_m(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM_COERCE0:%.*]], <vscale x 16 x i8> [[ZM_COERCE1:%.*]], <vscale x 16 x i8> [[ZM_COERCE2:%....
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/119845