[clang] [llvm] [AArch64] Implement intrinsics for F1CVTL/F2CVTL and BF1CVTL/BF2CVTL (PR #116959)
via cfe-commits
cfe-commits at lists.llvm.org
Sun Nov 24 02:20:49 PST 2024
https://github.com/SpencerAbson updated https://github.com/llvm/llvm-project/pull/116959
>From 296492155525985942e1a0fc56b6f0db34e8a7a4 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Wed, 20 Nov 2024 10:57:49 +0000
Subject: [PATCH 1/5] [AArch64] Add intrinsics for F1CVTL/F2CVTL and
BF1CVTL/BF2CVTL
---
clang/include/clang/Basic/TargetBuiltins.h | 1 +
clang/include/clang/Basic/arm_sme.td | 7 ++
clang/include/clang/Basic/arm_sve_sme_incl.td | 2 +
clang/lib/CodeGen/CGBuiltin.cpp | 4 +
.../fp8-intrinsics/acle_sme2_fp8_cvt.c | 81 +++++++++++++++++++
clang/utils/TableGen/SveEmitter.cpp | 6 ++
llvm/include/llvm/IR/IntrinsicsAArch64.td | 17 ++++
.../Target/AArch64/AArch64ISelDAGToDAG.cpp | 34 ++++++++
llvm/lib/Target/AArch64/SMEInstrFormats.td | 2 +-
.../AArch64/sme2-fp8-intrinsics-cvt.ll | 48 +++++++++++
10 files changed, 201 insertions(+), 1 deletion(-)
create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c
create mode 100644 llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll
diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
index 89ebf5758a5b55..a14fd2c4b224d8 100644
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -336,6 +336,7 @@ namespace clang {
bool isTupleSet() const { return Flags & IsTupleSet; }
bool isReadZA() const { return Flags & IsReadZA; }
bool isWriteZA() const { return Flags & IsWriteZA; }
+ bool setsFPMR() const { return Flags & SetsFPMR; }
bool isReductionQV() const { return Flags & IsReductionQV; }
uint64_t getBits() const { return Flags; }
bool isFlagSet(uint64_t Flag) const { return Flags & Flag; }
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 0f689e82bdb742..8e7e4395411c6c 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -824,4 +824,11 @@ let SMETargetGuard = "sme-lutv2" in {
def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
}
+let SMETargetGuard = "sme2,fp8" in {
+ // Convert from half-precision/BFloat16 to deinterleaved FP8 multi-vector
+ def SVF1CVTL : Inst<"svcvtl1_f16[_mf8]_x2_fpm", "2~n", "h", MergeNone, "aarch64_sme_fp8_f1cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>;
+ def SVF1CVTL_BF : Inst<"svcvtl1_bf16[_mf8]_x2_fpm", "2~n", "b", MergeNone, "aarch64_sme_fp8_bf1cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>;
+ def SVF2CVTL : Inst<"svcvtl2_f16[_mf8]_x2_fpm", "2~n", "h", MergeNone, "aarch64_sme_fp8_f2cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>;
+ def SVF2CVTL_BF : Inst<"svcvtl2_bf16[_mf8]_x2_fpm", "2~n", "b", MergeNone, "aarch64_sme_fp8_bf2cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>;
+}
} // let SVETargetGuard = InvalidMode
diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td
index 50911fb63e818e..7fdf732e506a2e 100644
--- a/clang/include/clang/Basic/arm_sve_sme_incl.td
+++ b/clang/include/clang/Basic/arm_sve_sme_incl.td
@@ -103,6 +103,7 @@ include "arm_immcheck_incl.td"
// M: svfloat32_t
// N: svfloat64_t
// $: svbfloat16_t
+// ~: svmfloat8_t
// J: Prefetch type (sv_prfop)
@@ -235,6 +236,7 @@ def IsInOutZA : FlagType<0x200000000000>;
def IsInZT0 : FlagType<0x400000000000>;
def IsOutZT0 : FlagType<0x800000000000>;
def IsInOutZT0 : FlagType<0x1000000000000>;
+def SetsFPMR : FlagType<0x2000000000000>;
defvar InvalidMode = "";
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 0916e14f182ddd..568ba0ade6422f 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -11182,6 +11182,10 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
BuiltinID == SME::BI__builtin_sme_svstr_za)
return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);
+ // Emit set FPMR for intrinsics that require it
+ if (TypeFlags.setsFPMR())
+ Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr),
+ Ops.pop_back_val());
// Handle builtins which require their multi-vector operands to be swapped
swapCommutativeSMEOperands(BuiltinID, Ops);
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c
new file mode 100644
index 00000000000000..da2a505a897996
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c
@@ -0,0 +1,81 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: aarch64-registered-target
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
+
+#include <arm_sme.h>
+
+#ifdef SME_OVERLOADED_FORMS
+#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#else
+#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#endif
+
+// CHECK-LABEL: @test_cvt1l_f16_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_cvt1l_f16_x2u13__SVMfloat8_tm(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
+//
+svfloat16x2_t test_cvt1l_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming {
+ return SME_ACLE_FUNC(svcvtl1_f16,_mf8,_x2_fpm)(zn, fpmr);
+}
+
+// CHECK-LABEL: @test_cvt2l_f16_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z17test_cvt2l_f16_x2u13__SVMfloat8_tm(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
+//
+svfloat16x2_t test_cvt2l_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming {
+ return SME_ACLE_FUNC(svcvtl2_f16,_mf8,_x2_fpm)(zn, fpmr);
+}
+
+// CHECK-LABEL: @test_cvt1l_bf16_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z18test_cvt1l_bf16_x2u13__SVMfloat8_tm(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x2_t test_cvt1l_bf16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming {
+ return SME_ACLE_FUNC(svcvtl1_bf16,_mf8,_x2_fpm)(zn, fpmr);
+}
+
+// CHECK-LABEL: @test_cvt2l_bf16_x2(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+// CPP-CHECK-LABEL: @_Z18test_cvt2l_bf16_x2u13__SVMfloat8_tm(
+// CPP-CHECK-NEXT: entry:
+// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
+//
+svbfloat16x2_t test_cvt2l_bf16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming {
+ return SME_ACLE_FUNC(svcvtl2_bf16,_mf8,_x2_fpm)(zn, fpmr);
+}
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index e8883488f32356..ab1bb419e10ab8 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -926,6 +926,12 @@ void SVEType::applyModifier(char Mod) {
Float = false;
BFloat = false;
break;
+ case '~':
+ Float = false;
+ BFloat = false;
+ MFloat = true;
+ ElementBitwidth = 8;
+ break;
case '.':
llvm_unreachable(". is never a type in itself");
break;
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 6a09a8647096f9..cd766f0dda7209 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3813,6 +3813,23 @@ let TargetPrefix = "aarch64" in {
LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
[IntrNoMem]>;
+ class SME2_FP8_CVT_X2_Single_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty, llvm_nxv8f16_ty],
+ [llvm_nxv16i8_ty],
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
+
+ class SME2_FP8_CVT_X2_Single_BF16_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty, llvm_nxv8bf16_ty],
+ [llvm_nxv16i8_ty],
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
+ //
+ // CVT from half-precision/BFloat16 to delinterleaved FP8 multi-vectors
+ //
+ def int_aarch64_sme_fp8_f1cvtl_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
+ def int_aarch64_sme_fp8_f2cvtl_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
+
+ def int_aarch64_sme_fp8_bf1cvtl_x2 : SME2_FP8_CVT_X2_Single_BF16_Intrinsic;
+ def int_aarch64_sme_fp8_bf2cvtl_x2 : SME2_FP8_CVT_X2_Single_BF16_Intrinsic;
}
// SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 10dad7675f4eaf..6512efa698c5ca 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -383,6 +383,7 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
void SelectPExtPair(SDNode *N, unsigned Opc);
void SelectWhilePair(SDNode *N, unsigned Opc);
void SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, unsigned Opcode);
+ void SelectCVTIntrinsicFP8(SDNode *N, unsigned NumVecs, unsigned Opcode);
void SelectClamp(SDNode *N, unsigned NumVecs, unsigned Opcode);
void SelectUnaryMultiIntrinsic(SDNode *N, unsigned NumOutVecs,
bool IsTupleInput, unsigned Opc);
@@ -1866,6 +1867,27 @@ void AArch64DAGToDAGISel::SelectCVTIntrinsic(SDNode *N, unsigned NumVecs,
CurDAG->RemoveDeadNode(N);
}
+void AArch64DAGToDAGISel::SelectCVTIntrinsicFP8(SDNode *N, unsigned NumVecs,
+ unsigned Opcode) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SmallVector<SDValue, 4> Ops(N->op_begin() + 2, N->op_end());
+ Ops.push_back(/*Chain*/ N->getOperand(0));
+
+ SDNode *Instruction =
+ CurDAG->getMachineNode(Opcode, DL, {MVT::Untyped, MVT::Other}, Ops);
+ SDValue SuperReg = SDValue(Instruction, 0);
+
+ for (unsigned i = 0; i < NumVecs; ++i)
+ ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(
+ AArch64::zsub0 + i, DL, VT, SuperReg));
+
+ // Copy chain
+ unsigned ChainIdx = NumVecs;
+ ReplaceUses(SDValue(N, ChainIdx), SDValue(Instruction, 1));
+ CurDAG->RemoveDeadNode(N);
+}
+
void AArch64DAGToDAGISel::SelectDestructiveMultiIntrinsic(SDNode *N,
unsigned NumVecs,
bool IsZmMulti,
@@ -5547,6 +5569,18 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z);
return;
}
+ case Intrinsic::aarch64_sme_fp8_bf1cvtl_x2:
+ SelectCVTIntrinsicFP8(Node, 2, AArch64::BF1CVTL_2ZZ_BtoH);
+ return;
+ case Intrinsic::aarch64_sme_fp8_f1cvtl_x2:
+ SelectCVTIntrinsicFP8(Node, 2, AArch64::F1CVTL_2ZZ_BtoH);
+ return;
+ case Intrinsic::aarch64_sme_fp8_bf2cvtl_x2:
+ SelectCVTIntrinsicFP8(Node, 2, AArch64::BF2CVTL_2ZZ_BtoH);
+ return;
+ case Intrinsic::aarch64_sme_fp8_f2cvtl_x2:
+ SelectCVTIntrinsicFP8(Node, 2, AArch64::F2CVTL_2ZZ_BtoH);
+ return;
}
} break;
case ISD::INTRINSIC_WO_CHAIN: {
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
index 8c256b5818ee88..776472e72af05a 100644
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -2412,7 +2412,7 @@ multiclass sme2p1_fp_cvt_vector_vg2_single<string mnemonic, bit l> {
// SME2 multi-vec FP8 up convert two registers
multiclass sme2p1_fp8_cvt_vector_vg2_single<string mnemonic, bits<2> opc, bit L> {
- def _NAME : sme2_cvt_unpk_vector_vg2<opc, 0b110, L, ZZ_h_mul_r, ZPR8, mnemonic>{
+ def NAME : sme2_cvt_unpk_vector_vg2<opc, 0b110, L, ZZ_h_mul_r, ZPR8, mnemonic>{
let Uses = [FPMR, FPCR];
}
}
diff --git a/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll
new file mode 100644
index 00000000000000..a6102988d1df17
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll
@@ -0,0 +1,48 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+fp8 -verify-machineinstrs -force-streaming < %s | FileCheck %s
+
+; F1CVTL / F2CVTL
+
+define { <vscale x 8 x half>, <vscale x 8 x half> } @f1cvtl(<vscale x 16 x i8> %zm) {
+; CHECK-LABEL: f1cvtl:
+; CHECK: // %bb.0:
+; CHECK-NEXT: f1cvtl { z0.h, z1.h }, z0.b
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8> %zm)
+ ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half> } @f2cvtl(<vscale x 16 x i8> %zm) {
+; CHECK-LABEL: f2cvtl:
+; CHECK: // %bb.0:
+; CHECK-NEXT: f2cvtl { z0.h, z1.h }, z0.b
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8> %zm)
+ ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+; BF1CVTL / BF2CVTL
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @bf1cvtl(<vscale x 16 x i8> %zm) {
+; CHECK-LABEL: bf1cvtl:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bf1cvtl { z0.h, z1.h }, z0.b
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8> %zm)
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @bf2cvtl( <vscale x 16 x i8> %zm) {
+; CHECK-LABEL: bf2cvtl:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bf2cvtl { z0.h, z1.h }, z0.b
+; CHECK-NEXT: ret
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8> %zm)
+ ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
+}
+
+
+declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8>)
+declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8>)
+declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8>)
+declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8>)
\ No newline at end of file
>From ec7a1cbf4df94bcadda42cfa8cfbd43b8df0f10d Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Wed, 20 Nov 2024 11:55:03 +0000
Subject: [PATCH 2/5] [NFC] Fixup format
---
llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 6512efa698c5ca..b51b3bf5038e5d 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -5570,16 +5570,16 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
}
case Intrinsic::aarch64_sme_fp8_bf1cvtl_x2:
- SelectCVTIntrinsicFP8(Node, 2, AArch64::BF1CVTL_2ZZ_BtoH);
+ SelectCVTIntrinsicFP8(Node, 2, AArch64::BF1CVTL_2ZZ_BtoH);
return;
case Intrinsic::aarch64_sme_fp8_f1cvtl_x2:
- SelectCVTIntrinsicFP8(Node, 2, AArch64::F1CVTL_2ZZ_BtoH);
+ SelectCVTIntrinsicFP8(Node, 2, AArch64::F1CVTL_2ZZ_BtoH);
return;
case Intrinsic::aarch64_sme_fp8_bf2cvtl_x2:
- SelectCVTIntrinsicFP8(Node, 2, AArch64::BF2CVTL_2ZZ_BtoH);
+ SelectCVTIntrinsicFP8(Node, 2, AArch64::BF2CVTL_2ZZ_BtoH);
return;
case Intrinsic::aarch64_sme_fp8_f2cvtl_x2:
- SelectCVTIntrinsicFP8(Node, 2, AArch64::F2CVTL_2ZZ_BtoH);
+ SelectCVTIntrinsicFP8(Node, 2, AArch64::F2CVTL_2ZZ_BtoH);
return;
}
} break;
>From 20f5b01b57e177a00de52e22e27b64c0e8fa2f55 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Wed, 20 Nov 2024 14:05:36 +0000
Subject: [PATCH 3/5] [NFC] Remove unnecessary declarations from test
---
llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll | 6 ------
1 file changed, 6 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll
index a6102988d1df17..b6f1fd45af010b 100644
--- a/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll
@@ -40,9 +40,3 @@ define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @bf2cvtl( <vscale x 16 x
%res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8> %zm)
ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}
-
-
-declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8>)
-declare { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8>)
-declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8>)
-declare { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8>)
\ No newline at end of file
>From 2f1ed17d158105bbf7f497a158c314ad6fd76cc5 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Thu, 21 Nov 2024 10:34:46 +0000
Subject: [PATCH 4/5] Prefer polymorphic intrinsic and move definition to
arm_sve.td
---
clang/include/clang/Basic/arm_sme.td | 7 ---
clang/include/clang/Basic/arm_sve.td | 10 ++--
clang/lib/CodeGen/CGBuiltin.cpp | 8 +--
.../fp8-intrinsics/acle_sme2_fp8_cvt.c | 60 +++++++++----------
llvm/include/llvm/IR/IntrinsicsAArch64.td | 14 +----
.../Target/AArch64/AArch64ISelDAGToDAG.cpp | 20 +++----
.../AArch64/sme2-fp8-intrinsics-cvt.ll | 8 +--
7 files changed, 57 insertions(+), 70 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td
index 8e7e4395411c6c..0f689e82bdb742 100644
--- a/clang/include/clang/Basic/arm_sme.td
+++ b/clang/include/clang/Basic/arm_sme.td
@@ -824,11 +824,4 @@ let SMETargetGuard = "sme-lutv2" in {
def SVLUTI4_ZT_X4 : SInst<"svluti4_zt_{d}_x4", "4i2.u", "cUc", MergeNone, "aarch64_sme_luti4_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>;
}
-let SMETargetGuard = "sme2,fp8" in {
- // Convert from half-precision/BFloat16 to deinterleaved FP8 multi-vector
- def SVF1CVTL : Inst<"svcvtl1_f16[_mf8]_x2_fpm", "2~n", "h", MergeNone, "aarch64_sme_fp8_f1cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>;
- def SVF1CVTL_BF : Inst<"svcvtl1_bf16[_mf8]_x2_fpm", "2~n", "b", MergeNone, "aarch64_sme_fp8_bf1cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>;
- def SVF2CVTL : Inst<"svcvtl2_f16[_mf8]_x2_fpm", "2~n", "h", MergeNone, "aarch64_sme_fp8_f2cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>;
- def SVF2CVTL_BF : Inst<"svcvtl2_bf16[_mf8]_x2_fpm", "2~n", "b", MergeNone, "aarch64_sme_fp8_bf2cvtl_x2", [IsStreaming, IsOverloadNone, SetsFPMR], []>;
-}
} // let SVETargetGuard = InvalidMode
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index d492fae4145b92..c81aaee71eda0d 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2422,14 +2422,16 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2" in {
def SVUUNPK_X4 : SInst<"svunpk_{d}[_{3}_x4]", "42.h", "UsUiUl", MergeNone, "aarch64_sve_uunpk_x4", [IsStreaming], []>;
}
-//
-// Multi-vector scaling
-//
-let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in {
+let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in {
+ // Multi-vector scaling
def FSCALE_SINGLE_X2 : Inst<"svscale[_single_{d}_x2]", "22x", "fhd", MergeNone, "aarch64_sme_fp8_scale_single_x2", [IsStreaming],[]>;
def FSCALE_SINGLE_X4 : Inst<"svscale[_single_{d}_x4]", "44x", "fhd", MergeNone, "aarch64_sme_fp8_scale_single_x4", [IsStreaming],[]>;
def FSCALE_X2 : Inst<"svscale[_{d}_x2]", "222.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x2", [IsStreaming],[]>;
def FSCALE_X4 : Inst<"svscale[_{d}_x4]", "444.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x4", [IsStreaming],[]>;
+
+ // Convert from half-precision/BFloat16 to deinterleaved FP8 multi-vector
+ def SVF1CVTL : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~n", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>;
+ def SVF2CVTL : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~n", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>;
}
let SVETargetGuard = "sve2p1", SMETargetGuard = "sme2" in {
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 568ba0ade6422f..27e5453628a5a9 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -10811,6 +10811,10 @@ Value *CodeGenFunction::EmitAArch64SVEBuiltinExpr(unsigned BuiltinID,
else if (TypeFlags.isUndef())
return UndefValue::get(Ty);
else if (Builtin->LLVMIntrinsic != 0) {
+ // Emit set FPMR for intrinsics that require it
+ if (TypeFlags.setsFPMR())
+ Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr),
+ Ops.pop_back_val());
if (TypeFlags.getMergeType() == SVETypeFlags::MergeZeroExp)
InsertExplicitZeroOperand(Builder, Ty, Ops);
@@ -11182,10 +11186,6 @@ Value *CodeGenFunction::EmitAArch64SMEBuiltinExpr(unsigned BuiltinID,
BuiltinID == SME::BI__builtin_sme_svstr_za)
return EmitSMELdrStr(TypeFlags, Ops, Builtin->LLVMIntrinsic);
- // Emit set FPMR for intrinsics that require it
- if (TypeFlags.setsFPMR())
- Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr),
- Ops.pop_back_val());
// Handle builtins which require their multi-vector operands to be swapped
swapCommutativeSMEOperands(BuiltinID, Ops);
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c
index da2a505a897996..4a1fc81a05b953 100644
--- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sme2_fp8_cvt.c
@@ -4,78 +4,78 @@
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
-// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sme2 -target-feature +fp8 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
-#include <arm_sme.h>
+#include <arm_sve.h>
-#ifdef SME_OVERLOADED_FORMS
-#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
+#ifdef SVE_OVERLOADED_FORMS
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
#else
-#define SME_ACLE_FUNC(A1,A2,A3) A1##A2##A3
+#define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3
#endif
-// CHECK-LABEL: @test_cvt1l_f16_x2(
+// CHECK-LABEL: @test_cvtl1_f16_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
-// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8f16(<vscale x 16 x i8> [[ZN:%.*]])
// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
//
-// CPP-CHECK-LABEL: @_Z17test_cvt1l_f16_x2u13__SVMfloat8_tm(
+// CPP-CHECK-LABEL: @_Z17test_cvtl1_f16_x2u13__SVMfloat8_tm(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8f16(<vscale x 16 x i8> [[ZN:%.*]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
//
-svfloat16x2_t test_cvt1l_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming {
- return SME_ACLE_FUNC(svcvtl1_f16,_mf8,_x2_fpm)(zn, fpmr);
+svfloat16x2_t test_cvtl1_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming {
+ return SVE_ACLE_FUNC(svcvtl1_f16,_mf8,_x2_fpm)(zn, fpmr);
}
-// CHECK-LABEL: @test_cvt2l_f16_x2(
+// CHECK-LABEL: @test_cvtl2_f16_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
-// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8f16(<vscale x 16 x i8> [[ZN:%.*]])
// CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
//
-// CPP-CHECK-LABEL: @_Z17test_cvt2l_f16_x2u13__SVMfloat8_tm(
+// CPP-CHECK-LABEL: @_Z17test_cvtl2_f16_x2u13__SVMfloat8_tm(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8f16(<vscale x 16 x i8> [[ZN:%.*]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x half>, <vscale x 8 x half> } [[TMP0]]
//
-svfloat16x2_t test_cvt2l_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming {
- return SME_ACLE_FUNC(svcvtl2_f16,_mf8,_x2_fpm)(zn, fpmr);
+svfloat16x2_t test_cvtl2_f16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming {
+ return SVE_ACLE_FUNC(svcvtl2_f16,_mf8,_x2_fpm)(zn, fpmr);
}
-// CHECK-LABEL: @test_cvt1l_bf16_x2(
+// CHECK-LABEL: @test_cvtl1_bf16_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
-// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8bf16(<vscale x 16 x i8> [[ZN:%.*]])
// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
//
-// CPP-CHECK-LABEL: @_Z18test_cvt1l_bf16_x2u13__SVMfloat8_tm(
+// CPP-CHECK-LABEL: @_Z18test_cvtl1_bf16_x2u13__SVMfloat8_tm(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8bf16(<vscale x 16 x i8> [[ZN:%.*]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
//
-svbfloat16x2_t test_cvt1l_bf16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming {
- return SME_ACLE_FUNC(svcvtl1_bf16,_mf8,_x2_fpm)(zn, fpmr);
+svbfloat16x2_t test_cvtl1_bf16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming {
+ return SVE_ACLE_FUNC(svcvtl1_bf16,_mf8,_x2_fpm)(zn, fpmr);
}
-// CHECK-LABEL: @test_cvt2l_bf16_x2(
+// CHECK-LABEL: @test_cvtl2_bf16_x2(
// CHECK-NEXT: entry:
// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
-// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8bf16(<vscale x 16 x i8> [[ZN:%.*]])
// CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
//
-// CPP-CHECK-LABEL: @_Z18test_cvt2l_bf16_x2u13__SVMfloat8_tm(
+// CPP-CHECK-LABEL: @_Z18test_cvtl2_bf16_x2u13__SVMfloat8_tm(
// CPP-CHECK-NEXT: entry:
// CPP-CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR:%.*]])
-// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8> [[ZN:%.*]])
+// CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8bf16(<vscale x 16 x i8> [[ZN:%.*]])
// CPP-CHECK-NEXT: ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } [[TMP0]]
//
-svbfloat16x2_t test_cvt2l_bf16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming {
- return SME_ACLE_FUNC(svcvtl2_bf16,_mf8,_x2_fpm)(zn, fpmr);
+svbfloat16x2_t test_cvtl2_bf16_x2(svmfloat8_t zn, uint64_t fpmr) __arm_streaming {
+ return SVE_ACLE_FUNC(svcvtl2_bf16,_mf8,_x2_fpm)(zn, fpmr);
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index cd766f0dda7209..f09114b57431f8 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3814,22 +3814,14 @@ let TargetPrefix = "aarch64" in {
[IntrNoMem]>;
class SME2_FP8_CVT_X2_Single_Intrinsic
- : DefaultAttrsIntrinsic<[llvm_nxv8f16_ty, llvm_nxv8f16_ty],
- [llvm_nxv16i8_ty],
- [IntrReadMem, IntrInaccessibleMemOnly]>;
-
- class SME2_FP8_CVT_X2_Single_BF16_Intrinsic
- : DefaultAttrsIntrinsic<[llvm_nxv8bf16_ty, llvm_nxv8bf16_ty],
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
[llvm_nxv16i8_ty],
[IntrReadMem, IntrInaccessibleMemOnly]>;
//
// CVT from half-precision/BFloat16 to delinterleaved FP8 multi-vectors
//
- def int_aarch64_sme_fp8_f1cvtl_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
- def int_aarch64_sme_fp8_f2cvtl_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
-
- def int_aarch64_sme_fp8_bf1cvtl_x2 : SME2_FP8_CVT_X2_Single_BF16_Intrinsic;
- def int_aarch64_sme_fp8_bf2cvtl_x2 : SME2_FP8_CVT_X2_Single_BF16_Intrinsic;
+ def int_aarch64_sve_fp8_cvtl1_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
+ def int_aarch64_sve_fp8_cvtl2_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
}
// SVE2.1 - ZIPQ1, ZIPQ2, UZPQ1, UZPQ2
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index b51b3bf5038e5d..f9a4d65eac0922 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -5569,17 +5569,17 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectMultiVectorLuti(Node, 4, AArch64::LUTI4_4ZZT2Z);
return;
}
- case Intrinsic::aarch64_sme_fp8_bf1cvtl_x2:
- SelectCVTIntrinsicFP8(Node, 2, AArch64::BF1CVTL_2ZZ_BtoH);
- return;
- case Intrinsic::aarch64_sme_fp8_f1cvtl_x2:
- SelectCVTIntrinsicFP8(Node, 2, AArch64::F1CVTL_2ZZ_BtoH);
- return;
- case Intrinsic::aarch64_sme_fp8_bf2cvtl_x2:
- SelectCVTIntrinsicFP8(Node, 2, AArch64::BF2CVTL_2ZZ_BtoH);
+ case Intrinsic::aarch64_sve_fp8_cvtl1_x2:
+ if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::FP>(
+ Node->getValueType(0),
+ {AArch64::BF1CVTL_2ZZ_BtoH, AArch64::F1CVTL_2ZZ_BtoH}))
+ SelectCVTIntrinsicFP8(Node, 2, Opc);
return;
- case Intrinsic::aarch64_sme_fp8_f2cvtl_x2:
- SelectCVTIntrinsicFP8(Node, 2, AArch64::F2CVTL_2ZZ_BtoH);
+ case Intrinsic::aarch64_sve_fp8_cvtl2_x2:
+ if (auto Opc = SelectOpcodeFromVT<SelectTypeKind::FP>(
+ Node->getValueType(0),
+ {AArch64::BF2CVTL_2ZZ_BtoH, AArch64::F2CVTL_2ZZ_BtoH}))
+ SelectCVTIntrinsicFP8(Node, 2, Opc);
return;
}
} break;
diff --git a/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll
index b6f1fd45af010b..076a3ad34eac3c 100644
--- a/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-fp8-intrinsics-cvt.ll
@@ -8,7 +8,7 @@ define { <vscale x 8 x half>, <vscale x 8 x half> } @f1cvtl(<vscale x 16 x i8> %
; CHECK: // %bb.0:
; CHECK-NEXT: f1cvtl { z0.h, z1.h }, z0.b
; CHECK-NEXT: ret
- %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f1cvtl.x2(<vscale x 16 x i8> %zm)
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8f16(<vscale x 16 x i8> %zm)
ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
}
@@ -17,7 +17,7 @@ define { <vscale x 8 x half>, <vscale x 8 x half> } @f2cvtl(<vscale x 16 x i8> %
; CHECK: // %bb.0:
; CHECK-NEXT: f2cvtl { z0.h, z1.h }, z0.b
; CHECK-NEXT: ret
- %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sme.fp8.f2cvtl.x2(<vscale x 16 x i8> %zm)
+ %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.fp8.cvtl2.x2.nxvbf16(<vscale x 16 x i8> %zm)
ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
}
@@ -28,7 +28,7 @@ define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @bf1cvtl(<vscale x 16 x
; CHECK: // %bb.0:
; CHECK-NEXT: bf1cvtl { z0.h, z1.h }, z0.b
; CHECK-NEXT: ret
- %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf1cvtl.x2(<vscale x 16 x i8> %zm)
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fp8.cvtl1.x2.nxv8bf16(<vscale x 16 x i8> %zm)
ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}
@@ -37,6 +37,6 @@ define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @bf2cvtl( <vscale x 16 x
; CHECK: // %bb.0:
; CHECK-NEXT: bf2cvtl { z0.h, z1.h }, z0.b
; CHECK-NEXT: ret
- %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sme.fp8.bf2cvtl.x2(<vscale x 16 x i8> %zm)
+ %res = call { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @llvm.aarch64.sve.fp8.cvtl2.x2.nxv8bf16(<vscale x 16 x i8> %zm)
ret { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } %res
}
>From 460b69be442b7c3d181662f1a3ca95038a34e854 Mon Sep 17 00:00:00 2001
From: Spencer Abson <Spencer.Abson at arm.com>
Date: Sun, 24 Nov 2024 10:19:54 +0000
Subject: [PATCH 5/5] [NFC] Fix comments
---
clang/include/clang/Basic/arm_sve.td | 2 +-
llvm/include/llvm/IR/IntrinsicsAArch64.td | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index c81aaee71eda0d..6899a3387cc8c4 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2429,7 +2429,7 @@ let SVETargetGuard = InvalidMode, SMETargetGuard = "sme2,fp8" in {
def FSCALE_X2 : Inst<"svscale[_{d}_x2]", "222.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x2", [IsStreaming],[]>;
def FSCALE_X4 : Inst<"svscale[_{d}_x4]", "444.x", "fhd", MergeNone, "aarch64_sme_fp8_scale_x4", [IsStreaming],[]>;
- // Convert from half-precision/BFloat16 to deinterleaved FP8 multi-vector
+ // Convert from FP8 to deinterleaved half-precision/BFloat16 multi-vector
def SVF1CVTL : Inst<"svcvtl1_{d}[_mf8]_x2_fpm", "2~n", "bh", MergeNone, "aarch64_sve_fp8_cvtl1_x2", [IsStreaming, SetsFPMR], []>;
def SVF2CVTL : Inst<"svcvtl2_{d}[_mf8]_x2_fpm", "2~n", "bh", MergeNone, "aarch64_sve_fp8_cvtl2_x2", [IsStreaming, SetsFPMR], []>;
}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index f09114b57431f8..a91616b9556828 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -3818,7 +3818,7 @@ let TargetPrefix = "aarch64" in {
[llvm_nxv16i8_ty],
[IntrReadMem, IntrInaccessibleMemOnly]>;
//
- // CVT from half-precision/BFloat16 to delinterleaved FP8 multi-vectors
+ // CVT from FP8 to deinterleaved half-precision/BFloat16 multi-vector
//
def int_aarch64_sve_fp8_cvtl1_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
def int_aarch64_sve_fp8_cvtl2_x2 : SME2_FP8_CVT_X2_Single_Intrinsic;
More information about the cfe-commits
mailing list