[clang] [llvm] [AArch64] Add Neon FP8 conversion intrinsics (PR #123612)
Momchil Velikov via llvm-commits
llvm-commits@lists.llvm.org
Mon Jan 27 07:25:38 PST 2025
https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/123612
>From 7ff7e9588f6a76dcbe7deb2dc5f78055f71b476b Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov@arm.com>
Date: Fri, 6 Dec 2024 19:24:16 +0000
Subject: [PATCH] [AArch64] Add Neon FP8 conversion intrinsics
[fixup] Add tests, fix calling the wrong LLVM intrinsic
[fixup] Refactor much of the common code into a helper function (NFC)
[fixup] Add target features test, remove redundant bf16 guard
[fixup] Clear the NoManglingQ flag for FP8
[fixup] Remove instcombine,tailcallelim from test run lines
---
clang/include/clang/Basic/arm_neon.td | 22 ++
clang/include/clang/Basic/arm_neon_incl.td | 2 +
clang/lib/CodeGen/CGBuiltin.cpp | 79 ++++-
clang/lib/CodeGen/CodeGenFunction.h | 7 +
.../fp8-intrinsics/acle_neon_fp8_cvt.c | 316 ++++++++++++++++++
.../acle_neon_fp8_cvt.c | 43 +++
clang/utils/TableGen/NeonEmitter.cpp | 21 +-
llvm/include/llvm/IR/IntrinsicsAArch64.td | 22 ++
.../lib/Target/AArch64/AArch64InstrFormats.td | 46 ++-
llvm/lib/Target/AArch64/AArch64InstrInfo.td | 14 +-
llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll | 112 +++++++
11 files changed, 662 insertions(+), 22 deletions(-)
create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
create mode 100644 clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c
create mode 100644 llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll
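Usage sketch for reviewers (not part of the patch): every new user-facing
intrinsic takes a trailing fpm_t operand, which the compiler lowers to a
write of the FPMR register before the conversion. A minimal example,
assuming a target with +neon,+bf16,+fp8 enabled:

    #include <arm_neon.h>

    // Widen the low 8 FP8 lanes of a 128-bit vector to bfloat16.
    bfloat16x8_t widen_low(mfloat8x16_t v, fpm_t fpm) {
      return vcvt1_low_bf16_mf8_fpm(v, fpm);
    }

    // Narrow two float32x4_t vectors into one 64-bit FP8 vector.
    mfloat8x8_t narrow(float32x4_t vn, float32x4_t vm, fpm_t fpm) {
      return vcvt_mf8_f32_fpm(vn, vm, fpm);
    }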
diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ddc5391eb3fa23..9a6a77640ef5d3 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2119,6 +2119,28 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in {
}
}
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
+ def VBF1CVT_BF16_MF8 : VInst<"vcvt1_bf16_mf8_fpm", "(QB).V", "m">;
+ def VBF1CVT_LOW_BF16_MF8 : VInst<"vcvt1_low_bf16_mf8_fpm", "B.V", "Hm">;
+ def VBF2CVTL_BF16_MF8 : VInst<"vcvt2_bf16_mf8_fpm", "(QB).V", "m">;
+ def VBF2CVTL_LOW_BF16_MF8 : VInst<"vcvt2_low_bf16_mf8_fpm", "B.V", "Hm">;
+ def VBF1CVTL2_HIGH_BF16_MF8 : VInst<"vcvt1_high_bf16_mf8_fpm", "B.V", "Hm">;
+ def VBF2CVTL2_HIGH_BF16_MF8 : VInst<"vcvt2_high_bf16_mf8_fpm", "B.V", "Hm">;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
+ def VF1CVT_F16_MF8 : VInst<"vcvt1_f16_mf8_fpm", "(>QF).V", "m">;
+ def VF1CVT_LOW_F16_MF8 : VInst<"vcvt1_low_f16_mf8_fpm", "(>F).V", "Hm">;
+ def VF2CVTL_F16_MF8 : VInst<"vcvt2_f16_mf8_fpm", "(>QF).V", "m">;
+ def VF2CVTL_LOW_F16_MF8 : VInst<"vcvt2_low_f16_mf8_fpm", "(>F).V", "Hm">;
+ def VF1CVTL2_HIGH_F16_MF8 : VInst<"vcvt1_high_f16_mf8_fpm", "(>F).V", "Hm">;
+ def VF2CVTL2_HIGH_F16_MF8 : VInst<"vcvt2_high_f16_mf8_fpm", "(>F).V", "Hm">;
+
+ def VCVTN_LOW_F8_F32 : VInst<"vcvt_mf8_f32_fpm", ".(>>QF)(>>QF)V", "m">;
+ def VCVTN_HIGH_F8_F32 : VInst<"vcvt_high_mf8_f32_fpm", ".(q)(>>F)(>>F)V", "Hm">;
+ def VCVTN_F8_F16 : VInst<"vcvt_mf8_f16_fpm", ".(>F)(>F)V", "mQm">;
+}
+
let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;
diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td
index fd800e5a6278e4..91a2bf3020b9a3 100644
--- a/clang/include/clang/Basic/arm_neon_incl.td
+++ b/clang/include/clang/Basic/arm_neon_incl.td
@@ -243,6 +243,7 @@ def OP_UNAVAILABLE : Operation {
// B: change to BFloat16
// P: change to polynomial category.
// p: change polynomial to equivalent integer category. Otherwise nop.
+// V: change to fpm_t
//
// >: double element width (vector size unchanged).
// <: half element width (vector size unchanged).
@@ -301,6 +302,7 @@ class Inst <string n, string p, string t, Operation o, list<ImmCheck> ch = []>{
class SInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
class IInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
class WInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
+class VInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
// The following instruction classes are implemented via operators
// instead of builtins. As such these declarations are only used for
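As a worked example of the new "V" modifier (my reading of the prototype
grammar above; the resulting C signatures match the tests further down):

    // VBF1CVT_BF16_MF8 uses "(QB).V" with default type "m" (mfloat8x8_t):
    //   (QB) -> quad-width bfloat16 return, "." -> the unmodified mfloat8
    //   operand, V -> trailing fpm_t.
    bfloat16x8_t vcvt1_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm);

    // VCVTN_LOW_F8_F32 uses ".(>>QF)(>>QF)V" with "m": mfloat8x8_t return,
    // two float32x4_t operands (element width doubled twice from the 8-bit
    // base, quad vector width), plus the trailing fpm_t.
    mfloat8x8_t vcvt_mf8_f32_fpm(float32x4_t vn, float32x4_t vm, fpm_t fpm);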
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 5162ac503b8ebd..0a06ce028a9160 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6759,12 +6759,36 @@ Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
return Builder.CreateCall(F, Ops, name);
}
+Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
+ SmallVectorImpl<Value *> &Ops,
+ Value *FPM, const char *name) {
+ Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), FPM);
+ return EmitNeonCall(F, Ops, name);
+}
+
Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
bool neg) {
int SV = cast<ConstantInt>(V)->getSExtValue();
return ConstantInt::get(Ty, neg ? -SV : SV);
}
+Value *CodeGenFunction::EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
+ llvm::Type *Ty1, bool Extract,
+ SmallVectorImpl<llvm::Value *> &Ops,
+ const CallExpr *E,
+ const char *name) {
+ llvm::Type *Tys[] = {Ty0, Ty1};
+ if (Extract) {
+ // Op[0] is mfloat8x16_t, but the intrinsic converts only the lower part of
+ // the vector.
+ Tys[1] = llvm::FixedVectorType::get(Int8Ty, 8);
+ Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+ }
+ llvm::Value *FPM =
+ EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
+ return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+}
+
// Right-shift a vector by a constant.
Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
llvm::Type *Ty, bool usgn,
@@ -12736,6 +12760,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
return V;
unsigned Int;
+ bool ExtractLow = false;
switch (BuiltinID) {
default: return nullptr;
case NEON::BI__builtin_neon_vbsl_v:
@@ -13950,7 +13975,59 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
}
-
+ case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
+ ExtractLow = true;
+ LLVM_FALLTHROUGH;
+ case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm:
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
+ llvm::FixedVectorType::get(BFloatTy, 8),
+ Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
+ case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm:
+ ExtractLow = true;
+ LLVM_FALLTHROUGH;
+ case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm:
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
+ llvm::FixedVectorType::get(BFloatTy, 8),
+ Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
+ case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm:
+ ExtractLow = true;
+ LLVM_FALLTHROUGH;
+ case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm:
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
+ llvm::FixedVectorType::get(HalfTy, 8),
+ Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
+ case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm:
+ ExtractLow = true;
+ LLVM_FALLTHROUGH;
+ case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm:
+ case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm:
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
+ llvm::FixedVectorType::get(HalfTy, 8),
+ Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
+ case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm:
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
+ llvm::FixedVectorType::get(Int8Ty, 8),
+ Ops[0]->getType(), false, Ops, E, "vfcvtn");
+ case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm:
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
+ llvm::FixedVectorType::get(Int8Ty, 8),
+ llvm::FixedVectorType::get(HalfTy, 4), false, Ops,
+ E, "vfcvtn");
+ case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm:
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
+ llvm::FixedVectorType::get(Int8Ty, 16),
+ llvm::FixedVectorType::get(HalfTy, 8), false, Ops,
+ E, "vfcvtn");
+ case NEON::BI__builtin_neon_vcvt_high_mf8_f32_fpm: {
+ llvm::Type *Ty = llvm::FixedVectorType::get(Int8Ty, 16);
+ Ops[0] = Builder.CreateInsertVector(Ty, PoisonValue::get(Ty), Ops[0],
+ Builder.getInt64(0));
+ return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn2, Ty,
+ Ops[1]->getType(), false, Ops, E, "vfcvtn2");
+ }
case NEON::BI__builtin_neon_vamin_f16:
case NEON::BI__builtin_neon_vaminq_f16:
case NEON::BI__builtin_neon_vamin_f32:
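The contract EmitFP8NeonCvtCall implements for the Extract case can be
stated at the source level: the "low" variants convert only the bottom
64 bits of a 128-bit FP8 vector. A sketch of the equivalence, where
vget_low_mf8 is a hypothetical helper used purely for illustration (it is
not added by this patch):

    bfloat16x8_t via_low(mfloat8x16_t v, fpm_t fpm) {
      return vcvt1_low_bf16_mf8_fpm(v, fpm);
    }

    // Equivalent, assuming a hypothetical vget_low_mf8 that extracts
    // lanes 0-7; both forms set FPMR from fpm before converting.
    bfloat16x8_t via_extract(mfloat8x16_t v, fpm_t fpm) {
      return vcvt1_bf16_mf8_fpm(vget_low_mf8(v), fpm);
    }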
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index fab27d4c22ed80..073231e50a990a 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4692,6 +4692,13 @@ class CodeGenFunction : public CodeGenTypeCache {
SmallVectorImpl<llvm::Value*> &O,
const char *name,
unsigned shift = 0, bool rightshift = false);
+ llvm::Value *EmitFP8NeonCall(llvm::Function *F,
+ SmallVectorImpl<llvm::Value *> &O,
+ llvm::Value *FPM, const char *name);
+ llvm::Value *EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
+ llvm::Type *Ty1, bool Extract,
+ SmallVectorImpl<llvm::Value *> &Ops,
+ const CallExpr *E, const char *name);
llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
const llvm::ElementCount &Count);
llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
new file mode 100644
index 00000000000000..4305b840f2a05b
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
@@ -0,0 +1,316 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -S -O3 -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_bf16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt1_bf16_mf8_fpm13__Mfloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+bfloat16x8_t test_vcvt1_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+ return vcvt1_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_low_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt1_low_bf16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+bfloat16x8_t test_vcvt1_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt1_low_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_bf16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt2_bf16_mf8_fpm13__Mfloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+bfloat16x8_t test_vcvt2_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+ return vcvt2_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_low_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt2_low_bf16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+bfloat16x8_t test_vcvt2_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt2_low_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_high_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt1_high_bf16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT1_I]]
+//
+bfloat16x8_t test_vcvt1_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt1_high_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_high_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt2_high_bf16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT2_I]]
+//
+bfloat16x8_t test_vcvt2_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt2_high_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_f16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt1_f16_mf8_fpm13__Mfloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+float16x8_t test_vcvt1_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+ return vcvt1_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_low_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt1_low_f16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+float16x8_t test_vcvt1_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt1_low_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_f16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt2_f16_mf8_fpm13__Mfloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+float16x8_t test_vcvt2_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+ return vcvt2_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_low_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt2_low_f16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+float16x8_t test_vcvt2_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt2_low_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_high_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt1_high_f16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT1_I]]
+//
+float16x8_t test_vcvt1_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt1_high_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_high_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt2_high_f16_mf8_fpm14__Mfloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT2_I]]
+//
+float16x8_t test_vcvt2_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+ return vcvt2_high_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f32_fpm(
+// CHECK-SAME: <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VFCVTN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-NEXT: ret <8 x i8> [[VFCVTN_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f32_fpm13__Float32x4_tS_m(
+// CHECK-CXX-SAME: <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VFCVTN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-CXX-NEXT: ret <8 x i8> [[VFCVTN_I]]
+//
+mfloat8x8_t test_vcvt_mf8_f32_fpm(float32x4_t vn, float32x4_t vm, fpm_t fpm) {
+ return vcvt_mf8_f32_fpm(vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcvt_high_mf8_f32_fpm(
+// CHECK-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VD]], i64 0)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-NEXT: ret <16 x i8> [[VFCVTN2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z26test_vcvt_high_mf8_f32_fpm13__Mfloat8x8_t13__Float32x4_tS0_m(
+// CHECK-CXX-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VD]], i64 0)
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-CXX-NEXT: ret <16 x i8> [[VFCVTN2_I]]
+//
+mfloat8x16_t test_vcvt_high_mf8_f32_fpm(mfloat8x8_t vd, float32x4_t vn,
+ float32x4_t vm, fpm_t fpm) {
+ return vcvt_high_mf8_f32_fpm(vd, vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f16_fpm(
+// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VN]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VM]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VFCVTN2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]])
+// CHECK-NEXT: ret <8 x i8> [[VFCVTN2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f16_fpm13__Float16x4_tS_m(
+// CHECK-CXX-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VN]] to <8 x i8>
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VM]] to <8 x i8>
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]])
+// CHECK-CXX-NEXT: ret <8 x i8> [[VFCVTN2_I]]
+//
+mfloat8x8_t test_vcvt_mf8_f16_fpm(float16x4_t vn, float16x4_t vm, fpm_t fpm) {
+ return vcvt_mf8_f16_fpm(vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcvtq_mf8_f16_fpm(
+// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT: [[ENTRY:.*:]]
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VN]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VM]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]])
+// CHECK-NEXT: ret <16 x i8> [[VFCVTN2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z22test_vcvtq_mf8_f16_fpm13__Float16x8_tS_m(
+// CHECK-CXX-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VN]] to <16 x i8>
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VM]] to <16 x i8>
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]])
+// CHECK-CXX-NEXT: ret <16 x i8> [[VFCVTN2_I]]
+//
+mfloat8x16_t test_vcvtq_mf8_f16_fpm(float16x8_t vn, float16x8_t vm, fpm_t fpm) {
+ return vcvtq_mf8_f16_fpm(vn, vm, fpm);
+}
diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c
new file mode 100644
index 00000000000000..2c7004c7968a46
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +faminmax -emit-llvm -verify %s -o /dev/null
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+void test_features(float16x4_t vd4, float16x8_t vd8, float32x4_t va4,
+ mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) {
+ (void) vcvt1_bf16_mf8_fpm(v8, fpm);
+  // expected-error@-1 {{'vcvt1_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt1_low_bf16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt1_low_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_bf16_mf8_fpm(v8, fpm);
+  // expected-error@-1 {{'vcvt2_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_low_bf16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt2_low_bf16_mf8_fpm' requires target feature 'fp8'}}
+
+  (void) vcvt1_high_bf16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt1_high_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_high_bf16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt2_high_bf16_mf8_fpm' requires target feature 'fp8'}}
+
+  (void) vcvt1_f16_mf8_fpm(v8, fpm);
+  // expected-error@-1 {{'vcvt1_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt1_low_f16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt1_low_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_f16_mf8_fpm(v8, fpm);
+  // expected-error@-1 {{'vcvt2_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_low_f16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt2_low_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt1_high_f16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt1_high_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_high_f16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt2_high_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt_mf8_f32_fpm(va4, va4, fpm);
+  // expected-error@-1 {{'vcvt_mf8_f32_fpm' requires target feature 'fp8'}}
+  (void) vcvt_high_mf8_f32_fpm(v8, va4, va4, fpm);
+  // expected-error@-1 {{'vcvt_high_mf8_f32_fpm' requires target feature 'fp8'}}
+  (void) vcvt_mf8_f16_fpm(vd4, vd4, fpm);
+  // expected-error@-1 {{'vcvt_mf8_f16_fpm' requires target feature 'fp8'}}
+  (void) vcvtq_mf8_f16_fpm(vd8, vd8, fpm);
+  // expected-error@-1 {{'vcvtq_mf8_f16_fpm' requires target feature 'fp8'}}
+}
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 7299a49252f0d2..11f33ca17fda8e 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -74,6 +74,7 @@ enum ClassKind {
ClassI, // generic integer instruction, e.g., "i8" suffix
ClassS, // signed/unsigned/poly, e.g., "s8", "u8" or "p8" suffix
ClassW, // width-specific instruction, e.g., "8" suffix
+ ClassV, // void-suffix instruction, no suffix
ClassB, // bitcast arguments with enum argument to specify type
ClassL, // Logical instructions which are op instructions
// but we need to not emit any suffix for in our
@@ -144,7 +145,7 @@ class Type {
private:
TypeSpec TS;
- enum TypeKind { Void, Float, SInt, UInt, Poly, BFloat16, MFloat8 };
+ enum TypeKind { Void, Float, SInt, UInt, Poly, BFloat16, MFloat8, FPM };
TypeKind Kind;
bool Immediate, Constant, Pointer;
// ScalarForMangling and NoManglingQ are really not suited to live here as
@@ -198,6 +199,7 @@ class Type {
bool isVoid() const { return Kind == Void; }
bool isBFloat16() const { return Kind == BFloat16; }
bool isMFloat8() const { return Kind == MFloat8; }
+ bool isFPM() const { return Kind == FPM; }
unsigned getNumElements() const { return Bitwidth / ElementBitwidth; }
unsigned getSizeInBits() const { return Bitwidth; }
unsigned getElementSizeInBits() const { return ElementBitwidth; }
@@ -600,6 +602,7 @@ class NeonEmitter {
const Record *SI = R.getClass("SInst");
const Record *II = R.getClass("IInst");
const Record *WI = R.getClass("WInst");
+ const Record *VI = R.getClass("VInst");
const Record *SOpI = R.getClass("SOpInst");
const Record *IOpI = R.getClass("IOpInst");
const Record *WOpI = R.getClass("WOpInst");
@@ -609,6 +612,7 @@ class NeonEmitter {
ClassMap[SI] = ClassS;
ClassMap[II] = ClassI;
ClassMap[WI] = ClassW;
+ ClassMap[VI] = ClassV;
ClassMap[SOpI] = ClassS;
ClassMap[IOpI] = ClassI;
ClassMap[WOpI] = ClassW;
@@ -641,6 +645,9 @@ class NeonEmitter {
std::string Type::str() const {
if (isVoid())
return "void";
+ if (isFPM())
+ return "fpm_t";
+
std::string S;
if (isInteger() && !isSigned())
@@ -699,6 +706,8 @@ std::string Type::builtin_str() const {
} else if (isMFloat8()) {
assert(ElementBitwidth == 8 && "MFloat8 can only be 8 bits");
S += "m";
+ } else if (isFPM()) {
+ S += "UWi";
} else
switch (ElementBitwidth) {
case 16: S += "h"; break;
@@ -925,6 +934,13 @@ void Type::applyModifiers(StringRef Mods) {
case 'P':
Kind = Poly;
break;
+ case 'V':
+ Kind = FPM;
+ Bitwidth = ElementBitwidth = 64;
+ NumVectors = 0;
+ Immediate = Constant = Pointer = false;
+ ScalarForMangling = NoManglingQ = true;
+ break;
case '>':
assert(ElementBitwidth < 128);
ElementBitwidth *= 2;
@@ -1000,6 +1016,9 @@ std::string Intrinsic::getInstTypeCode(Type T, ClassKind CK) const {
if (CK == ClassB && TargetGuard == "neon")
return "";
+ if (this->CK == ClassV)
+ return "";
+
if (T.isBFloat16())
return "bf16";
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index b31a65d9bcc02a..31c9546376c820 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -993,6 +993,28 @@ def int_aarch64_st64b: Intrinsic<[], !listconcat([llvm_ptr_ty], data512)>;
def int_aarch64_st64bv: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], data512)>;
def int_aarch64_st64bv0: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], data512)>;
+ //
+ // Neon FP8 intrinsics
+ //
+
+ // Conversions
+ class AdvSIMD_FP8_1VectorArg_Long_Intrinsic
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrReadMem, IntrInaccessibleMemOnly]>;
+
+ def int_aarch64_neon_fp8_cvtl1 : AdvSIMD_FP8_1VectorArg_Long_Intrinsic;
+ def int_aarch64_neon_fp8_cvtl2 : AdvSIMD_FP8_1VectorArg_Long_Intrinsic;
+
+ def int_aarch64_neon_fp8_fcvtn
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [llvm_anyvector_ty,
+ LLVMMatchType<1>],
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
+ def int_aarch64_neon_fp8_fcvtn2
+ : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ llvm_anyvector_ty,
+ LLVMMatchType<1>],
+ [IntrReadMem, IntrInaccessibleMemOnly]>;
}
def llvm_nxv1i1_ty : LLVMType<nxv1i1>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 6a3a9492e031c6..67b43664548457 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6559,17 +6559,30 @@ class BaseSIMDThreeVectors<bit Q, bit U, bits<2> size, bits<4> op,
// FCVTN (FP16 to FP8)
-multiclass SIMDThreeSameSizeVectorCvt<string asm> {
- def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b01, 0b1110, V64, V64, asm, ".8b",".4h">;
- def v16f8 : BaseSIMDThreeVectors<0b1, 0b0, 0b01, 0b1110, V128, V128, asm, ".16b", ".8h">;
+multiclass SIMD_FP8_CVTN_F16<string asm, SDPatternOperator Op> {
+ let Uses = [FPMR, FPCR], mayLoad = 1 in {
+ def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b01, 0b1110, V64, V64, asm, ".8b",".4h">;
+ def v16f8 : BaseSIMDThreeVectors<0b1, 0b0, 0b01, 0b1110, V128, V128, asm, ".16b", ".8h">;
+ }
+ def : Pat<(v8i8 (Op (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
+ (!cast<Instruction>(NAME # v8f8) V64:$Rn, V64:$Rm)>;
+ def : Pat<(v16i8 (Op (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
+ (!cast<Instruction>(NAME # v16f8) V128:$Rn, V128:$Rm)>;
}
-// TODO : Create v16f8 value type
// FCVTN, FCVTN2 (FP32 to FP8)
-multiclass SIMDThreeVectorCvt<string asm> {
- def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b00, 0b1110, V64, V128, asm, ".8b", ".4s">;
- def 2v16f8 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1110, asm#2, ".16b", ".4s",
- V128, v16i8, v4f32, null_frag>;
+multiclass SIMD_FP8_CVTN_F32<string asm, SDPatternOperator Op> {
+ let Uses = [FPMR, FPCR], mayLoad = 1 in {
+ def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b00, 0b1110, V64, V128, asm, ".8b", ".4s">;
+ def 2v16f8 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1110, asm#2, ".16b", ".4s",
+ V128, v16i8, v4f32, null_frag>;
+ }
+
+ def : Pat<(v8i8 (Op (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
+ (!cast<Instruction>(NAME # v8f8) V128:$Rn, V128:$Rm)>;
+
+ def : Pat<(v16i8 (!cast<SDPatternOperator>(Op # 2) (v16i8 V128:$_Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
+ (!cast<Instruction>(NAME # 2v16f8) V128:$_Rd, V128:$Rn, V128:$Rm)>;
}
// TODO: Create a new Value Type v8f8 and v16f8
@@ -7033,11 +7046,18 @@ multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
//----------------------------------------------------------------------------
// FP8 Advanced SIMD two-register miscellaneous
//----------------------------------------------------------------------------
-multiclass SIMDMixedTwoVectorFP8<bits<2>sz, string asm> {
- def v8f16 : BaseSIMDMixedTwoVector<0b0, 0b1, sz, 0b10111, V64, V128,
- asm, ".8h", ".8b", []>;
- def 2v8f16 : BaseSIMDMixedTwoVector<0b1, 0b1, sz, 0b10111, V128, V128,
- asm#2, ".8h", ".16b", []>;
+multiclass SIMD_FP8_CVTL<bits<2>sz, string asm, ValueType dty, SDPatternOperator Op> {
+ let Uses=[FPMR, FPCR], mayLoad = 1 in {
+ def NAME : BaseSIMDMixedTwoVector<0b0, 0b1, sz, 0b10111, V64, V128,
+ asm, ".8h", ".8b", []>;
+ def NAME#2 : BaseSIMDMixedTwoVector<0b1, 0b1, sz, 0b10111, V128, V128,
+ asm#2, ".8h", ".16b", []>;
+ }
+ def : Pat<(dty (Op (v8i8 V64:$Rn))),
+ (!cast<Instruction>(NAME) V64:$Rn)>;
+
+ def : Pat<(dty (Op (v16i8 V128:$Rn))),
+ (!cast<Instruction>(NAME#2) V128:$Rn)>;
}
class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 9d0bd44544134c..881af6eb951177 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10324,13 +10324,13 @@ let Predicates = [HasD128] in {
// 2023 Architecture Extensions:
//===----------------------------===//
-let Uses = [FPMR, FPCR], Predicates = [HasFP8] in {
- defm F1CVTL : SIMDMixedTwoVectorFP8<0b00, "f1cvtl">;
- defm F2CVTL : SIMDMixedTwoVectorFP8<0b01, "f2cvtl">;
- defm BF1CVTL : SIMDMixedTwoVectorFP8<0b10, "bf1cvtl">;
- defm BF2CVTL : SIMDMixedTwoVectorFP8<0b11, "bf2cvtl">;
- defm FCVTN_F16_F8 : SIMDThreeSameSizeVectorCvt<"fcvtn">;
- defm FCVTN_F32_F8 : SIMDThreeVectorCvt<"fcvtn">;
+let Predicates = [HasFP8] in {
+ defm F1CVTL : SIMD_FP8_CVTL<0b00, "f1cvtl", v8f16, int_aarch64_neon_fp8_cvtl1>;
+ defm F2CVTL : SIMD_FP8_CVTL<0b01, "f2cvtl", v8f16, int_aarch64_neon_fp8_cvtl2>;
+ defm BF1CVTL : SIMD_FP8_CVTL<0b10, "bf1cvtl", v8bf16, int_aarch64_neon_fp8_cvtl1>;
+ defm BF2CVTL : SIMD_FP8_CVTL<0b11, "bf2cvtl", v8bf16, int_aarch64_neon_fp8_cvtl2>;
+ defm FCVTN_F16 : SIMD_FP8_CVTN_F16<"fcvtn", int_aarch64_neon_fp8_fcvtn>;
+ defm FCVTN_F32 : SIMD_FP8_CVTN_F32<"fcvtn", int_aarch64_neon_fp8_fcvtn>;
defm FSCALE : SIMDThreeVectorFscale<0b1, 0b1, 0b111, "fscale", int_aarch64_neon_fp8_fscale>;
} // End let Predicates = [HasFP8]
diff --git a/llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll b/llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll
new file mode 100644
index 00000000000000..6070380d24234b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8 < %s | FileCheck %s
+
+define <8 x bfloat> @test_vbfcvtl1_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl1_low:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bf1cvtl v0.8h, v0.8b
+; CHECK-NEXT: ret
+ %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> %vn)
+ ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_vbfcvtl1_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl1_high:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bf1cvtl2 v0.8h, v0.16b
+; CHECK-NEXT: ret
+ %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> %vn)
+ ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_vbfcvtl2_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl2_low:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bf2cvtl v0.8h, v0.8b
+; CHECK-NEXT: ret
+ %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> %vn)
+ ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_vbfcvtl2_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl2_high:
+; CHECK: // %bb.0:
+; CHECK-NEXT: bf2cvtl2 v0.8h, v0.16b
+; CHECK-NEXT: ret
+ %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> %vn)
+ ret <8 x bfloat> %res
+}
+
+
+define <8 x half> @test_vfcvtl1_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl1_low:
+; CHECK: // %bb.0:
+; CHECK-NEXT: f1cvtl v0.8h, v0.8b
+; CHECK-NEXT: ret
+ %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> %vn)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_vfcvtl1_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl1_high:
+; CHECK: // %bb.0:
+; CHECK-NEXT: f1cvtl2 v0.8h, v0.16b
+; CHECK-NEXT: ret
+ %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> %vn)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_vfcvtl2_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl2_low:
+; CHECK: // %bb.0:
+; CHECK-NEXT: f2cvtl v0.8h, v0.8b
+; CHECK-NEXT: ret
+ %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> %vn)
+ ret <8 x half> %res
+}
+
+define <8 x half> @test_vfcvtl2_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl2_high:
+; CHECK: // %bb.0:
+; CHECK-NEXT: f2cvtl2 v0.8h, v0.16b
+; CHECK-NEXT: ret
+ %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> %vn)
+ ret <8 x half> %res
+}
+
+define <8 x i8> @test_vcvtn_low_f8_f32(<4 x float> %vn, <4 x float> %vm) {
+; CHECK-LABEL: test_vcvtn_low_f8_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtn v0.8b, v0.4s, v1.4s
+; CHECK-NEXT: ret
+ %res = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> %vn, <4 x float> %vm)
+ ret <8 x i8> %res
+}
+
+define <16 x i8> @test_vcvtn_high_f8_f32(<16 x i8> %vd, <4 x float> %vn, <4 x float> %vm) {
+; CHECK-LABEL: test_vcvtn_high_f8_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtn2 v0.16b, v1.4s, v2.4s
+; CHECK-NEXT: ret
+ %res = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> %vd, <4 x float> %vn, <4 x float> %vm)
+ ret <16 x i8> %res
+}
+
+
+define <8 x i8> @test_vcvtn_f8_f16(<4 x half> %vn, <4 x half> %vm) {
+; CHECK-LABEL: test_vcvtn_f8_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtn v0.8b, v0.4h, v1.4h
+; CHECK-NEXT: ret
+ %res = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> %vn, <4 x half> %vm)
+ ret <8 x i8> %res
+}
+
+define <16 x i8> @test_vcvtn2_f8_f16(<8 x half> %vn, <8 x half> %vm) {
+; CHECK-LABEL: test_vcvtn2_f8_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtn v0.16b, v0.8h, v1.8h
+; CHECK-NEXT: ret
+ %res = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> %vn, <8 x half> %vm)
+ ret <16 x i8> %res
+}