[clang] [llvm] [AArch64] Add intrinsics for SME FP8 FDOT LANE instructions (PR #118492)
Jonathan Thackray via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 5 04:05:23 PST 2024
https://github.com/jthackray updated https://github.com/llvm/llvm-project/pull/118492
>From 609cf3fbdb28c155f4b8c787c1e2cb791c8a292f Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Fri, 29 Nov 2024 11:27:03 +0000
Subject: [PATCH 1/3] [AArch64] Add intrinsics for SME FP8 FDOT LANE
instructions
Co-authored-by: Momchil Velikov <momchil.velikov at arm.com>
Co-authored-by: Marian Lukac <marian.lukac at arm.com>
Co-authored-by: Caroline Concatto <caroline.concatto at arm.com>
---
clang/include/clang/Basic/arm_sme.td | 5 +
clang/include/clang/Basic/arm_sve_sme_incl.td | 1 +
clang/lib/CodeGen/CGBuiltin.cpp | 6 +
.../sme2-intrinsics/acle_sme2_fp8_fdot.c | 57 +++++++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 22 +++
.../lib/Target/AArch64/AArch64SMEInstrInfo.td | 4 +-
llvm/lib/Target/AArch64/SMEInstrFormats.td | 141 ++++++++++++++++++
.../AArch64/sme2-intrinsics-fp8-fdot.ll | 32 ++++
8 files changed, 266 insertions(+), 2 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
create mode 100644 llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fdot.ll
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 0f689e82bdb742..87ed68c03430cd 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -740,6 +740,11 @@ let SMETargetGuard = "sme2" in {
def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>;
}
+// FDOT
+let SMETargetGuard = "sme2,sme-f8f16" in {
+ def SVDOT_LANE_FP8_ZA16_VG1x2 : Inst<"svdot_lane_za16[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
+ def SVDOT_LANE_FP8_ZA16_VG1x4 : Inst<"svdot_lane_za16[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
+}
////////////////////////////////////////////////////////////////////////////////
// SME2p1 - FMOPA, FMOPS (non-widening)
let SMETargetGuard = "sme-b16b16" in {
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td
index de10be7bdce0db..e7cc40db7dca6c 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -52,6 +52,7 @@ include "arm_immcheck_incl.td"
// h: half-float
// d: double
// b: bfloat
+// m: mfloat8
// Typespec modifiers
// ------------------
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 7588f8427cdd38..7de5e8bcd439d7 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10183,6 +10183,8 @@ CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) {
case SVETypeFlags::EltTyInt64:
return llvm::ScalableVectorType::get(Builder.getInt64Ty(), 2);
+ case SVETypeFlags::EltTyMFloat8:
+ return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16);
case SVETypeFlags::EltTyFloat16:
return llvm::ScalableVectorType::get(Builder.getHalfTy(), 8);
case SVETypeFlags::EltTyBFloat16:
@@ -11234,6 +11236,10 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
BuiltinID == SME::BI__builtin_sme_svstr_za)
return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);
+ // Emit set FPMR for intrinsics that require it
+ if (TypeFlags.setsFPMR())
+ Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr),
+ Ops.pop_back_val());
// Handle builtins which require their multi-vector operands to be swapped
swapCommutativeSMEOperands(BuiltinID, Ops);
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
new file mode 100644
index 00000000000000..999b1940df80c4
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
@@ -0,0 +1,57 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// REQUIRES: aarch64-registered-target
+#include <arm_sme.h>
+
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+#include <arm_sme.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: define dso_local void @test_svdot_lane_za16_f8_vg1x2(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z29test_svdot_lane_za16_f8_vg1x2j13svmfloat8x2_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x2(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZM]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za16_f8_vg1x2(uint32_t slice, svmfloat8x2_t zn,
+ svmfloat8_t zm, fpm_t fpmr)
+ __arm_streaming __arm_inout("za") {
+ SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x2_fpm)(slice, zn, zm, 3, fpmr);
+}
+
+
+// CHECK-LABEL: define dso_local void @test_svdot_lane_za16_f8_vg1x4(
+// CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 3)
+// CHECK-NEXT: ret void
+//
+// CPP-CHECK-LABEL: define dso_local void @_Z29test_svdot_lane_za16_f8_vg1x4j13svmfloat8x4_tu13__SVMfloat8_tm(
+// CPP-CHECK-SAME: i32 noundef [[SLICE:%.*]], <vscale x 16 x i8> [[ZN_COERCE0:%.*]], <vscale x 16 x i8> [[ZN_COERCE1:%.*]], <vscale x 16 x i8> [[ZN_COERCE2:%.*]], <vscale x 16 x i8> [[ZN_COERCE3:%.*]], <vscale x 16 x i8> [[ZM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CPP-CHECK-NEXT: [[ENTRY:.*:]]
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x4(i32 [[SLICE]], <vscale x 16 x i8> [[ZN_COERCE0]], <vscale x 16 x i8> [[ZN_COERCE1]], <vscale x 16 x i8> [[ZN_COERCE2]], <vscale x 16 x i8> [[ZN_COERCE3]], <vscale x 16 x i8> [[ZM]], i32 3)
+// CPP-CHECK-NEXT: ret void
+//
+void test_svdot_lane_za16_f8_vg1x4(uint32_t slice, svmfloat8x4_t zn,
+ svmfloat8_t zm, fpm_t fpmr)
+ __arm_streaming __arm_inout("za") {
+ SVE_ACLE_FUNC(svdot_lane_za16,_mf8,_vg1x4_fpm)(slice, zn, zm, 3, fpmr);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index a91616b9556828..c4ce8c1917c9b6 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3864,3 +3864,25 @@ def int_aarch64_sve_famin_u : AdvSIMD_Pred2VectorArg_Intrinsic;
// Neon absolute maximum and minimum
def int_aarch64_neon_famax : AdvSIMD_2VectorArg_Intrinsic;
def int_aarch64_neon_famin : AdvSIMD_2VectorArg_Intrinsic;
+
+// SME2 FP8 FDOT (indexed/lane) intrinsics
+let TargetPrefix = "aarch64" in {
+
+
+class SME2_FP8_FDOT_LANE_VG1x2 :
+ DefaultAttrsIntrinsic<[], [llvm_i32_ty,
+ llvm_nxv16i8_ty, llvm_nxv16i8_ty,
+ llvm_nxv16i8_ty,
+ llvm_i32_ty],
+ [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<4>>]>;
+
+class SME2_FP8_FDOT_LANE_VG1x4 :
+ DefaultAttrsIntrinsic<[], [llvm_i32_ty,
+ llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty, llvm_nxv16i8_ty,
+ llvm_nxv16i8_ty,
+ llvm_i32_ty],
+ [IntrInaccessibleMemOnly, IntrHasSideEffects, ImmArg<ArgIndex<6>>]>;
+
+ def int_aarch64_sme_fp8_fdot_lane_za16_vg1x2 : SME2_FP8_FDOT_LANE_VG1x2;
+ def int_aarch64_sme_fp8_fdot_lane_za16_vg1x4 : SME2_FP8_FDOT_LANE_VG1x4;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index 37ac915d1d8808..f02a4b7bdbfaa8 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -986,8 +986,8 @@ def LUTI4_S_4ZZT2Z : sme2_luti4_vector_vg4_strided<0b00, 0b00, "luti4">;
let Predicates = [HasSMEF8F16] in {
defm FVDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fvdot", 0b11, 0b110, ZZ_b_mul_r, ZPR4b8>;
-defm FDOT_VG2_M2ZZI_BtoH : sme2p1_multi_vec_array_vg2_index_f8f16<"fdot", 0b11, 0b010, ZZ_b_mul_r, ZPR4b8>;
-defm FDOT_VG4_M4ZZI_BtoH : sme2p1_multi_vec_array_vg4_index_f8f16<"fdot", 0b100, ZZZZ_b_mul_r, ZPR4b8>;
+defm FDOT_VG2_M2ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x2<"fdot", 0b11, 0b010, ZZ_b_mul_r, int_aarch64_sme_fp8_fdot_lane_za16_vg1x2>;
+defm FDOT_VG4_M4ZZI_BtoH : sme2_fp8_fdot_index_za16_vg1x4<"fdot", 0b100, ZZZZ_b_mul_r, int_aarch64_sme_fp8_fdot_lane_za16_vg1x4>;
defm FDOT_VG2_M2ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0010001, MatrixOp16, ZZ_b, ZPR4b8>;
defm FDOT_VG4_M4ZZ_BtoH : sme2_dot_mla_add_sub_array_vg24_single<"fdot", 0b0110001, MatrixOp16, ZZZZ_b, ZPR4b8>;
// TODO: Replace nxv16i8 by nxv16f8
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 776472e72af05a..4b68df1b5ff29b 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -219,6 +219,37 @@ class SME2_Tile_Movaz_Pat<string name, SDPatternOperator intrinsic, ValueType ou
: Pat<(out_vt (intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)))),
(!cast<Instruction>(name # _PSEUDO) $tile, $base, $offset)>;
+
+// FP8 SME FDOT instructions
+
+// Selection DAG patterns - map to first level of pseudo-instructions (xxx_PSEUDO)
+class SME2_FP8_FMLA_FDOT_Index_VG1x2_Pat<string name, SDPatternOperator intrinsic,
+ ComplexPattern tileslice, Operand offset_ty, Operand imm_ty,
+ ValueType vt = nxv16i8>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset)),
+ vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i)),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+ (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
+ ZPR4b8:$Zm, imm_ty:$i)>;
+
+class SME2_FP8_FMLA_FDOT_Index_VG1x4_Pat<string name, SDPatternOperator intrinsic,
+ ComplexPattern tileslice, Operand offset_ty, Operand imm_ty,
+ ValueType vt = nxv16i8>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset)),
+ vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4,
+ vt:$Zm, (i32 imm_ty:$i)),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+ (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
+ ZPR4b8:$Zm, imm_ty:$i)>;
+
+class sme2_fp8_fmla_fdot_index_pseudo<string name, Operand offset_ty, RegisterOperand src1_ty, RegisterOperand src2_ty, Operand imm_ty>
+ : SMEPseudo2Instr<name, 0>,
+ Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs, src1_ty:$Zn, src2_ty:$Zm, imm_ty:$i), []> {
+ let mayLoad = 1;
+ let SMEMatrixType = SMEMatrixArray;
+ let usesCustomInserter = 1;
+}
+
//===----------------------------------------------------------------------===//
// SME pattern match helpers.
//===----------------------------------------------------------------------===//
@@ -5737,3 +5768,113 @@ multiclass sme2_fmop4a_fp8_fp16_2way<string mnemonic> {
// Multiple vectors
def _M2Z2Z_BtoH : sme2_fp8_fp16_quarter_tile_outer_product<0b1, 0b1, mnemonic, ZZ_b_mul_r_Lo, ZZ_b_mul_r_Hi>;
}
+
+// FP8 SME FDOT instructions
+
+// Selection DAG patterns - map to first level of pseudo-instructions (xxx_PSEUDO)
+
+class SME2_FP8_FDOT_Index_VG1x2_Pat<string name, SDPatternOperator intrinsic,
+ ComplexPattern tileslice, Operand offset_ty, Operand imm_ty,
+ ValueType vt = nxv16i8>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset)),
+ vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i), i64:$fpmr),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+ (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
+ ZPR4b8:$Zm, imm_ty:$i, GPR64:$fpmr)>;
+
+class SME2_FP8_FDOT_Index_VG1x4_Pat<string name, SDPatternOperator intrinsic,
+ ComplexPattern tileslice, Operand offset_ty, Operand imm_ty,
+ ValueType vt = nxv16i8>
+ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset)),
+ vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4,
+ vt:$Zm, (i32 imm_ty:$i), i64:$fpmr),
+ (!cast<Instruction>(name # _PSEUDO) $base, $offset,
+ (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
+ ZPR4b8:$Zm, imm_ty:$i, GPR64:$fpmr)>;
+
+// First level pseudo-instructions (xxx_PSEUDO) - transformed to second level pseudo-instructions (xxx_FPMR_PSEUDO)
+// during instruction selection.
+class sme2_fp8_fdot_index_pseudo<string name, Operand offset_ty, RegisterOperand src1_ty, RegisterOperand src2_ty, Operand imm_ty>
+ : SMEPseudo2Instr<name, 0>,
+ Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs, src1_ty:$Zn, src2_ty:$Zm, imm_ty:$i, GPR64:$fpmr), []> {
+ let SMEMatrixType = SMEMatrixArray;
+ let usesCustomInserter = 1;
+}
+
+class sme2_fp8_fdot_pseudo<string name, Operand offset_ty, RegisterOperand src1_ty, RegisterOperand src2_ty>
+ : SMEPseudo2Instr<name, 0>,
+ Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs, src1_ty:$Zn, src2_ty:$Zm, GPR64:$fpmr), []> {
+ let SMEMatrixType = SMEMatrixArray;
+ let usesCustomInserter = 1;
+}
+
+// Second level pseudo-instruction - expanded to real instruction by the AArch64 pseudo instruction expansion pass
+class sme2_fp8_fdot_index_fpmr_pseudo<string name, MatrixOperand matrix_ty, Operand offset_ty,
+ RegisterOperand src1_ty, RegisterOperand src2_ty,
+ Operand imm_ty>
+ : Pseudo<(outs matrix_ty:$ZAda),
+ (ins matrix_ty:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs,
+ src1_ty:$Zn, src2_ty:$Zm, imm_ty:$i, GPR64:$fpmr), []>,
+ SMEPseudo2Instr<name, 1> {
+ let hasNoSchedulingInfo = 1;
+ let Constraints = "$ZAda = $_ZAda";
+}
+
+class sme2_fp8_fdot_fpmr_pseudo<string name, MatrixOperand matrix_ty, Operand offset_ty,
+ RegisterOperand src1_ty, RegisterOperand src2_ty>
+ : Pseudo<(outs matrix_ty:$ZAda),
+ (ins matrix_ty:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs,
+ src1_ty:$Zn, src2_ty:$Zm, GPR64:$fpmr), []>,
+ SMEPseudo2Instr<name, 1> {
+ let hasNoSchedulingInfo = 1;
+ let Constraints = "$ZAda = $_ZAda";
+}
+
+// FDOT instructions
+multiclass sme2_fp8_fdot_index_za16_vg1x2<string mnemonic, bits<2> sz, bits<3> op,
+ RegisterOperand multi_vector_ty, SDPatternOperator intrinsic> {
+ def NAME : sme2_multi_vec_array_vg2_index<sz, {op{2},?,?,op{1-0},?}, MatrixOp16,
+ multi_vector_ty, ZPR4b8,
+ VectorIndexH32b_timm, mnemonic>,
+ SMEPseudo2Instr<NAME, 1>{
+ let Uses=[FPMR, FPCR];
+ let mayLoad = 1;
+
+ bits<3> i;
+ let Inst{11-10} = i{2-1};
+ let Inst{3} = i{0};
+ }
+
+ def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
+ (!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv, sme_elm_idx0_7:$imm3,
+ multi_vector_ty:$Zn, ZPR4b8:$Zm, VectorIndexH32b_timm:$i), 0>;
+
+
+ def _PSEUDO : sme2_fp8_fmla_fdot_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, ZPR4b8, VectorIndexH32b_timm>;
+
+ def : SME2_FP8_FMLA_FDOT_Index_VG1x2_Pat<NAME, intrinsic, tileslice16, sme_elm_idx0_7, VectorIndexH32b_timm>;
+}
+
+multiclass sme2_fp8_fdot_index_za16_vg1x4<string mnemonic, bits<3> op,
+ RegisterOperand multi_vector_ty, SDPatternOperator intrinsic> {
+ def NAME : sme2_multi_vec_array_vg4_index<0b0,{0b1,?,?,op,?}, MatrixOp16,
+ multi_vector_ty, ZPR4b8,
+ VectorIndexH32b_timm, mnemonic>,
+ SMEPseudo2Instr<NAME, 1> {
+ let Uses=[FPMR, FPCR];
+ let mayLoad = 1;
+
+ bits<3> i;
+ let Inst{11-10} = i{2-1};
+ let Inst{3} = i{0};
+ }
+
+ def : InstAlias<mnemonic # "\t$ZAda[$Rv, $imm3], $Zn, $Zm$i",
+ (!cast<Instruction>(NAME) MatrixOp16:$ZAda, MatrixIndexGPR32Op8_11:$Rv,
+ sme_elm_idx0_7:$imm3, multi_vector_ty:$Zn, ZPR4b8:$Zm, VectorIndexH32b_timm:$i), 0>;
+
+
+ def _PSEUDO : sme2_fp8_fmla_fdot_index_pseudo<NAME, sme_elm_idx0_7, multi_vector_ty, ZPR4b8, VectorIndexH32b_timm>;
+
+ def : SME2_FP8_FMLA_FDOT_Index_VG1x4_Pat<NAME, intrinsic, tileslice16, sme_elm_idx0_7, VectorIndexH32b_timm>;
+}
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fdot.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fdot.ll
new file mode 100644
index 00000000000000..138d9876fabda8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-fp8-fdot.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --filter-out "^[ \t]*//.*$" --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+sme-f8f16,+sme-f8f32 -verify-machineinstrs -force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-linux"
+
+define void @test_fdot16_1x2_indexed(i32 %slice.0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zm) #0 {
+; CHECK-LABEL: test_fdot16_1x2_indexed:
+; CHECK: mov w8, w0
+; CHECK: fdot za.h[w8, 7, vgx2], { z0.b, z1.b }, z2.b[1]
+; CHECK: ret
+ %slice = add i32 %slice.0, 7
+ call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x2(i32 %slice,
+ <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2,
+ <vscale x 16 x i8> %zm, i32 1)
+ ret void
+}
+
+define void @test_fdot16_1x4_indexed(i32 %slice.0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
+; CHECK-LABEL: test_fdot16_1x4_indexed:
+; CHECK: mov w8, w0
+; CHECK: fdot za.h[w8, 7, vgx4], { z0.b - z3.b }, z4.b[1]
+; CHECK: ret
+ <vscale x 16 x i8> %zm) #0 {
+ %slice = add i32 %slice.0, 7
+ call void @llvm.aarch64.sme.fp8.fdot.lane.za16.vg1x4(i32 %slice,
+ <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, <vscale x 16 x i8> %zn4,
+ <vscale x 16 x i8> %zm, i32 1)
+ ret void
+}
+
+
+attributes #0 = { "target-features" = "+sme,+sme-f8f32,+sme-f8f16" }
>From 605ccdf31aba832b2056a1b89a073dfbdcd54af6 Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Tue, 3 Dec 2024 14:29:37 +0000
Subject: [PATCH 2/3] fixup! [AArch64] Add intrinsics for SME FP8 FDOT LANE
instructions
---
clang/lib/CodeGen/CGBuiltin.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 7de5e8bcd439d7..11cb8da78879e3 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10183,7 +10183,7 @@ CodeGenFunction::getSVEType(const SVETypeFlags &TypeFlags) {
case SVETypeFlags::EltTyInt64:
return llvm::ScalableVectorType::get(Builder.getInt64Ty(), 2);
- case SVETypeFlags::EltTyMFloat8:
+ case SVETypeFlags::EltTyMFloat8:
return llvm::ScalableVectorType::get(Builder.getInt8Ty(), 16);
case SVETypeFlags::EltTyFloat16:
return llvm::ScalableVectorType::get(Builder.getHalfTy(), 8);
>From 863a80380023f3b375da7d358780d2919a8d58dc Mon Sep 17 00:00:00 2001
From: Jonathan Thackray <jonathan.thackray at arm.com>
Date: Wed, 4 Dec 2024 21:40:35 +0000
Subject: [PATCH 3/3] fixup! [AArch64] Add intrinsics for SME FP8 FDOT LANE
instructions
---
clang/include/clang/Basic/arm_sme.td | 3 +-
.../sme2-intrinsics/acle_sme2_fp8_fdot.c | 10 ++--
llvm/lib/Target/AArch64/SMEInstrFormats.td | 60 -------------------
3 files changed, 7 insertions(+), 66 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 87ed68c03430cd..003da2eea5e1c9 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -741,10 +741,11 @@ let SMETargetGuard = "sme2" in {
}
// FDOT
-let SMETargetGuard = "sme2,sme-f8f16" in {
+let SMETargetGuard = "sme-f8f16" in {
def SVDOT_LANE_FP8_ZA16_VG1x2 : Inst<"svdot_lane_za16[_mf8]_vg1x2_fpm", "vm2di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x2", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
def SVDOT_LANE_FP8_ZA16_VG1x4 : Inst<"svdot_lane_za16[_mf8]_vg1x4_fpm", "vm4di>", "m", MergeNone, "aarch64_sme_fp8_fdot_lane_za16_vg1x4", [IsStreaming, IsInOutZA, SetsFPMR, IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
}
+
////////////////////////////////////////////////////////////////////////////////
// SME2p1 - FMOPA, FMOPS (non-widening)
let SMETargetGuard = "sme-b16b16" in {
diff --git a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
index 999b1940df80c4..c4e4a57dd2caa3 100644
--- a/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
+++ b/clang/test/CodeGen/AArch64/sme2-intrinsics/acle_sme2_fp8_fdot.c
@@ -2,11 +2,11 @@
// REQUIRES: aarch64-registered-target
#include <arm_sme.h>
-// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -triple aarch64 -target-feature +bf16 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64 -target-feature +sme -target-feature +sme2 -target-feature +sme-f8f16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
#include <arm_sme.h>
#ifdef SVE_OVERLOADED_FORMS
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 4b68df1b5ff29b..82d84f326ca608 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -5771,66 +5771,6 @@ multiclass sme2_fmop4a_fp8_fp16_2way<string mnemonic> {
// FP8 SME FDOT instructions
-// Selection DAG patterns - map to first level of pseudo-instructions (xxx_PSEUDO)
-
-class SME2_FP8_FDOT_Index_VG1x2_Pat<string name, SDPatternOperator intrinsic,
- ComplexPattern tileslice, Operand offset_ty, Operand imm_ty,
- ValueType vt = nxv16i8>
- : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset)),
- vt:$Zn1, vt:$Zn2, vt:$Zm, (i32 imm_ty:$i), i64:$fpmr),
- (!cast<Instruction>(name # _PSEUDO) $base, $offset,
- (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1),
- ZPR4b8:$Zm, imm_ty:$i, GPR64:$fpmr)>;
-
-class SME2_FP8_FDOT_Index_VG1x4_Pat<string name, SDPatternOperator intrinsic,
- ComplexPattern tileslice, Operand offset_ty, Operand imm_ty,
- ValueType vt = nxv16i8>
- : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, offset_ty:$offset)),
- vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4,
- vt:$Zm, (i32 imm_ty:$i), i64:$fpmr),
- (!cast<Instruction>(name # _PSEUDO) $base, $offset,
- (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3),
- ZPR4b8:$Zm, imm_ty:$i, GPR64:$fpmr)>;
-
-// First level pseudo-instructions (xxx_PSEUDO) - transformed to second level pseudo-instructions (xxx_FPMR_PSEUDO)
-// during instruction selection.
-class sme2_fp8_fdot_index_pseudo<string name, Operand offset_ty, RegisterOperand src1_ty, RegisterOperand src2_ty, Operand imm_ty>
- : SMEPseudo2Instr<name, 0>,
- Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs, src1_ty:$Zn, src2_ty:$Zm, imm_ty:$i, GPR64:$fpmr), []> {
- let SMEMatrixType = SMEMatrixArray;
- let usesCustomInserter = 1;
-}
-
-class sme2_fp8_fdot_pseudo<string name, Operand offset_ty, RegisterOperand src1_ty, RegisterOperand src2_ty>
- : SMEPseudo2Instr<name, 0>,
- Pseudo<(outs), (ins MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs, src1_ty:$Zn, src2_ty:$Zm, GPR64:$fpmr), []> {
- let SMEMatrixType = SMEMatrixArray;
- let usesCustomInserter = 1;
-}
-
-// Second level pseudo-instruction - expanded to real instruction by the AArch64 pseudo instruction expansion pass
-class sme2_fp8_fdot_index_fpmr_pseudo<string name, MatrixOperand matrix_ty, Operand offset_ty,
- RegisterOperand src1_ty, RegisterOperand src2_ty,
- Operand imm_ty>
- : Pseudo<(outs matrix_ty:$ZAda),
- (ins matrix_ty:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs,
- src1_ty:$Zn, src2_ty:$Zm, imm_ty:$i, GPR64:$fpmr), []>,
- SMEPseudo2Instr<name, 1> {
- let hasNoSchedulingInfo = 1;
- let Constraints = "$ZAda = $_ZAda";
-}
-
-class sme2_fp8_fdot_fpmr_pseudo<string name, MatrixOperand matrix_ty, Operand offset_ty,
- RegisterOperand src1_ty, RegisterOperand src2_ty>
- : Pseudo<(outs matrix_ty:$ZAda),
- (ins matrix_ty:$_ZAda, MatrixIndexGPR32Op8_11:$Rv, offset_ty:$offs,
- src1_ty:$Zn, src2_ty:$Zm, GPR64:$fpmr), []>,
- SMEPseudo2Instr<name, 1> {
- let hasNoSchedulingInfo = 1;
- let Constraints = "$ZAda = $_ZAda";
-}
-
-// FDOT instructions
multiclass sme2_fp8_fdot_index_za16_vg1x2<string mnemonic, bits<2> sz, bits<3> op,
RegisterOperand multi_vector_ty, SDPatternOperator intrinsic> {
def NAME : sme2_multi_vec_array_vg2_index<sz, {op{2},?,?,op{1-0},?}, MatrixOp16,
More information about the llvm-commits
mailing list