[clang] [llvm] [AArch64] Implement NEON FP8 fused multiply-add intrinsics (non-indexed) (PR #120273)

Momchil Velikov via llvm-commits llvm-commits at lists.llvm.org
Tue Jan 14 03:58:50 PST 2025


https://github.com/momchil-velikov updated https://github.com/llvm/llvm-project/pull/120273

>From e8a216c5effbf426ada5b9deb89fc2b5d9405f7c Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 6 Dec 2024 13:09:23 +0000
Subject: [PATCH 01/19] [AArch64] Refactor implementation of FP8 types (NFC)

* The FP8 scalar type (`__mfp8`) was incorrectly described as a vector
  type; it is now a proper scalar type.
* The FP8 vector types were described/assumed to have an integer element
  type; the element type is now `__mfp8`.
* Add support for the `m` type specifier (denoting `__mfp8`) in
  `DecodeTypeFromStr` and create the SVE builtin prototypes using that
  specifier instead of `int8_t`.
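
For illustration only (not part of the patch): assuming an AArch64 compiler
with this change applied, the new scalar type behaves as a distinct 8-bit,
byte-aligned builtin type, and the Neon FP8 vectors keep their 64/128-bit
sizes:

    /* Sketch; the sizes follow from the Bits/NumEls/ElBits parameters in
       AArch64SVEACLETypes.def below. */
    _Static_assert(sizeof(__mfp8) == 1, "FP8 scalar occupies one byte");
    _Static_assert(_Alignof(__mfp8) == 1, "FP8 scalar is byte-aligned");

    typedef __MFloat8x8_t  mfloat8x8_t;   /* vector of 8 x __mfp8  */
    typedef __MFloat8x16_t mfloat8x16_t;  /* vector of 16 x __mfp8 */
    _Static_assert(sizeof(mfloat8x8_t) == 8 && sizeof(mfloat8x16_t) == 16,
                   "64-bit and 128-bit Neon vectors");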
---
 clang/include/clang/AST/Type.h                |  5 +++
 .../clang/Basic/AArch64SVEACLETypes.def       | 24 +++++++++---
 clang/lib/AST/ASTContext.cpp                  | 37 +++++++++++++++----
 clang/lib/AST/ItaniumMangle.cpp               |  5 +++
 clang/lib/AST/Type.cpp                        |  4 +-
 clang/lib/CodeGen/CodeGenTypes.cpp            | 13 +++++--
 clang/lib/CodeGen/Targets/AArch64.cpp         |  7 +++-
 clang/utils/TableGen/SveEmitter.cpp           |  4 +-
 8 files changed, 76 insertions(+), 23 deletions(-)

diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 09c98f642852fc3..aa313719a657552 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2518,6 +2518,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
   bool isFloat32Type() const;
   bool isDoubleType() const;
   bool isBFloat16Type() const;
+  bool isMFloat8Type() const;
   bool isFloat128Type() const;
   bool isIbm128Type() const;
   bool isRealType() const;         // C99 6.2.5p17 (real floating + integer)
@@ -8532,6 +8533,10 @@ inline bool Type::isBFloat16Type() const {
   return isSpecificBuiltinType(BuiltinType::BFloat16);
 }
 
+inline bool Type::isMFloat8Type() const {
+  return isSpecificBuiltinType(BuiltinType::MFloat8);
+}
+
 inline bool Type::isFloat128Type() const {
   return isSpecificBuiltinType(BuiltinType::Float128);
 }
diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def
index 063cac1f4a58ee7..6b704b386536c9b 100644
--- a/clang/include/clang/Basic/AArch64SVEACLETypes.def
+++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def
@@ -57,6 +57,11 @@
 //  - IsBF true for vector of brain float elements.
 //===----------------------------------------------------------------------===//
 
+#ifndef SVE_SCALAR_TYPE
+#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \
+  SVE_TYPE(Name, Id, SingletonId)
+#endif
+
 #ifndef SVE_VECTOR_TYPE
 #define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \
   SVE_TYPE(Name, Id, SingletonId)
@@ -72,6 +77,11 @@
   SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, false, true)
 #endif
 
+#ifndef SVE_VECTOR_TYPE_MFLOAT
+#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \
+  SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, false, false)
+#endif
+
 #ifndef SVE_VECTOR_TYPE_FLOAT
 #define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \
   SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, true, false)
@@ -125,8 +135,7 @@ SVE_VECTOR_TYPE_FLOAT("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty
 
 SVE_VECTOR_TYPE_BFLOAT("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, 1)
 
-// This is a 8 bits opaque type.
-SVE_VECTOR_TYPE_INT("__SVMfloat8_t", "__SVMfloat8_t",  SveMFloat8, SveMFloat8Ty, 16, 8, 1, false)
+SVE_VECTOR_TYPE_MFLOAT("__SVMfloat8_t", "__SVMfloat8_t",  SveMFloat8, SveMFloat8Ty, 16, 8, 1)
 
 //
 // x2
@@ -148,7 +157,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, Sv
 
 SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 8, 16, 2)
 
-SVE_VECTOR_TYPE_INT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, SveMFloat8x2Ty, 16, 8, 2, false)
+SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, SveMFloat8x2Ty, 16, 8, 2)
 
 //
 // x3
@@ -170,7 +179,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, Sv
 
 SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 8, 16, 3)
 
-SVE_VECTOR_TYPE_INT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, SveMFloat8x3Ty, 16, 8, 3, false)
+SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, SveMFloat8x3Ty, 16, 8, 3)
 
 //
 // x4
@@ -192,7 +201,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, Sv
 
 SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 8, 16, 4)
 
-SVE_VECTOR_TYPE_INT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, SveMFloat8x4Ty, 16, 8, 4, false)
+SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, SveMFloat8x4Ty, 16, 8, 4)
 
 SVE_PREDICATE_TYPE_ALL("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16, 1)
 SVE_PREDICATE_TYPE_ALL("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, SveBoolx2Ty, 16, 2)
@@ -200,11 +209,13 @@ SVE_PREDICATE_TYPE_ALL("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4T
 
 SVE_OPAQUE_TYPE("__SVCount_t", "__SVCount_t", SveCount, SveCountTy)
 
-AARCH64_VECTOR_TYPE_MFLOAT("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 1, 8, 1)
+SVE_SCALAR_TYPE("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 8)
+
 AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x8_t", "__MFloat8x8_t", MFloat8x8, MFloat8x8Ty, 8, 8, 1)
 AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x16_t", "__MFloat8x16_t", MFloat8x16, MFloat8x16Ty, 16, 8, 1)
 
 #undef SVE_VECTOR_TYPE
+#undef SVE_VECTOR_TYPE_MFLOAT
 #undef SVE_VECTOR_TYPE_BFLOAT
 #undef SVE_VECTOR_TYPE_FLOAT
 #undef SVE_VECTOR_TYPE_INT
@@ -213,4 +224,5 @@ AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x16_t", "__MFloat8x16_t", MFloat8x16, MFloa
 #undef SVE_OPAQUE_TYPE
 #undef AARCH64_VECTOR_TYPE_MFLOAT
 #undef AARCH64_VECTOR_TYPE
+#undef SVE_SCALAR_TYPE
 #undef SVE_TYPE
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 8f04b58841964a6..df12c110bf357e7 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -2275,6 +2275,11 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
     Width = NumEls * ElBits * NF;                                              \
     Align = NumEls * ElBits;                                                   \
     break;
+#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits)              \
+  case BuiltinType::Id:                                                        \
+    Width = Bits;                                                              \
+    Align = Bits;                                                              \
+    break;
 #include "clang/Basic/AArch64SVEACLETypes.def"
 #define PPC_VECTOR_TYPE(Name, Id, Size)                                        \
   case BuiltinType::Id:                                                        \
@@ -4395,15 +4400,18 @@ ASTContext::getBuiltinVectorTypeInfo(const BuiltinType *Ty) const {
                                ElBits, NF)                                     \
   case BuiltinType::Id:                                                        \
     return {BFloat16Ty, llvm::ElementCount::getScalable(NumEls), NF};
+#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls,     \
+                               ElBits, NF)                                     \
+  case BuiltinType::Id:                                                        \
+    return {MFloat8Ty, llvm::ElementCount::getScalable(NumEls), NF};
 #define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \
   case BuiltinType::Id:                                                        \
     return {BoolTy, llvm::ElementCount::getScalable(NumEls), NF};
 #define AARCH64_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \
                                    ElBits, NF)                                 \
   case BuiltinType::Id:                                                        \
-    return {getIntTypeForBitwidth(ElBits, false),                              \
-            llvm::ElementCount::getFixed(NumEls), NF};
-#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId)
+    return {MFloat8Ty, llvm::ElementCount::getFixed(NumEls), NF};
+#define SVE_TYPE(Name, Id, SingletonId)
 #include "clang/Basic/AArch64SVEACLETypes.def"
 
 #define RVV_VECTOR_TYPE_INT(Name, Id, SingletonId, NumEls, ElBits, NF,         \
@@ -4465,11 +4473,16 @@ QualType ASTContext::getScalableVectorType(QualType EltTy, unsigned NumElts,
       EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) {     \
     return SingletonId;                                                        \
   }
+#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls,     \
+                               ElBits, NF)                                     \
+  if (EltTy->isMFloat8Type() && EltTySize == ElBits &&                         \
+      NumElts == (NumEls * NF) && NumFields == 1) {                            \
+    return SingletonId;                                                        \
+  }
 #define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \
   if (EltTy->isBooleanType() && NumElts == (NumEls * NF) && NumFields == 1)    \
     return SingletonId;
-#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId)
-#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId)
+#define SVE_TYPE(Name, Id, SingletonId)
 #include "clang/Basic/AArch64SVEACLETypes.def"
   } else if (Target->hasRISCVVTypes()) {
     uint64_t EltTySize = getTypeSize(EltTy);
@@ -12216,8 +12229,15 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
                                              RequiresICE, false);
     assert(!RequiresICE && "Can't require vector ICE");
 
-    // TODO: No way to make AltiVec vectors in builtins yet.
-    Type = Context.getVectorType(ElementType, NumElements, VectorKind::Generic);
+    if (ElementType == Context.MFloat8Ty) {
+      assert((NumElements == 8 || NumElements == 16) &&
+             "Invalid number of elements");
+      Type = NumElements == 8 ? Context.MFloat8x8Ty : Context.MFloat8x16Ty;
+    } else {
+      // TODO: No way to make AltiVec vectors in builtins yet.
+      Type =
+          Context.getVectorType(ElementType, NumElements, VectorKind::Generic);
+    }
     break;
   }
   case 'E': {
@@ -12273,6 +12293,9 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
   case 'p':
     Type = Context.getProcessIDType();
     break;
+  case 'm':
+    Type = Context.MFloat8Ty;
+    break;
   }
 
   // If there are modifiers and if we're allowed to parse them, go for it.
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 47aa9b40dab845b..9404f9fd9b151d5 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -3438,6 +3438,11 @@ void CXXNameMangler::mangleType(const BuiltinType *T) {
     type_name = MangledName;                                                   \
     Out << (type_name == Name ? "u" : "") << type_name.size() << type_name;    \
     break;
+#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits)              \
+  case BuiltinType::Id:                                                        \
+    type_name = MangledName;                                                   \
+    Out << (type_name == Name ? "u" : "") << type_name.size() << type_name;    \
+    break;
 #include "clang/Basic/AArch64SVEACLETypes.def"
 #define PPC_VECTOR_TYPE(Name, Id, Size)                                        \
   case BuiltinType::Id:                                                        \
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index caa0ac858a1bea6..fde0746a175705f 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2527,9 +2527,7 @@ bool Type::isSVESizelessBuiltinType() const {
 #define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId)                 \
   case BuiltinType::Id:                                                        \
     return true;
-#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId)                \
-  case BuiltinType::Id:                                                        \
-    return false;
+#define SVE_TYPE(Name, Id, SingletonId)
 #include "clang/Basic/AArch64SVEACLETypes.def"
     default:
       return false;
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index 09191a4901f4932..fd3327cf9acd890 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -507,13 +507,15 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
   case BuiltinType::Id:
 #define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId)                \
   case BuiltinType::Id:
-#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId)
+#define SVE_TYPE(Name, Id, SingletonId)
 #include "clang/Basic/AArch64SVEACLETypes.def"
       {
         ASTContext::BuiltinVectorTypeInfo Info =
             Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(Ty));
-        auto VTy =
-            llvm::VectorType::get(ConvertType(Info.ElementType), Info.EC);
+        auto *EltTy = Info.ElementType->isMFloat8Type()
+                          ? llvm::Type::getInt8Ty(getLLVMContext())
+                          : ConvertType(Info.ElementType);
+        auto *VTy = llvm::VectorType::get(EltTy, Info.EC);
         switch (Info.NumVectors) {
         default:
           llvm_unreachable("Expected 1, 2, 3 or 4 vectors!");
@@ -529,6 +531,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
       }
     case BuiltinType::SveCount:
       return llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount");
+    case BuiltinType::MFloat8:
+      return llvm::VectorType::get(llvm::Type::getInt8Ty(getLLVMContext()), 1,
+                                   false);
 #define PPC_VECTOR_TYPE(Name, Id, Size) \
     case BuiltinType::Id: \
       ResultType = \
@@ -650,6 +655,8 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
     // An ext_vector_type of Bool is really a vector of bits.
     llvm::Type *IRElemTy = VT->isExtVectorBoolType()
                                ? llvm::Type::getInt1Ty(getLLVMContext())
+                           : VT->getElementType()->isMFloat8Type()
+                               ? llvm::Type::getInt8Ty(getLLVMContext())
                                : ConvertType(VT->getElementType());
     ResultType = llvm::FixedVectorType::get(IRElemTy, VT->getNumElements());
     break;
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 7db67ecba07c8f4..7158b94a2b3d6aa 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -244,6 +244,7 @@ AArch64ABIInfo::convertFixedToScalableVectorType(const VectorType *VT) const {
 
     case BuiltinType::SChar:
     case BuiltinType::UChar:
+    case BuiltinType::MFloat8:
       return llvm::ScalableVectorType::get(
           llvm::Type::getInt8Ty(getVMContext()), 16);
 
@@ -781,8 +782,10 @@ bool AArch64ABIInfo::passAsPureScalableType(
     NPred += Info.NumVectors;
   else
     NVec += Info.NumVectors;
-  auto VTy = llvm::ScalableVectorType::get(CGT.ConvertType(Info.ElementType),
-                                           Info.EC.getKnownMinValue());
+  llvm::Type *EltTy = Info.ElementType->isMFloat8Type()
+                          ? llvm::Type::getInt8Ty(getVMContext())
+                          : CGT.ConvertType(Info.ElementType);
+  auto *VTy = llvm::ScalableVectorType::get(EltTy, Info.EC.getKnownMinValue());
 
   if (CoerceToSeq.size() + Info.NumVectors > 12)
     return false;
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index 97b768db3a31355..3cd7aed9469ab8f 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -448,7 +448,7 @@ std::string SVEType::builtinBaseType() const {
   case TypeKind::PredicatePattern:
     return "i";
   case TypeKind::Fpm:
-    return "Wi";
+    return "UWi";
   case TypeKind::Predicate:
     return "b";
   case TypeKind::BFloat16:
@@ -456,7 +456,7 @@ std::string SVEType::builtinBaseType() const {
     return "y";
   case TypeKind::MFloat8:
     assert(ElementBitwidth == 8 && "Invalid MFloat8!");
-    return "c";
+    return "m";
   case TypeKind::Float:
     switch (ElementBitwidth) {
     case 16:

>From 01ee70b38a31e3c9e19fdc1fd70d68ee1fe4b83f Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Mon, 13 Jan 2025 16:28:20 +0000
Subject: [PATCH 02/19] [fixup] Add a comment about the special case of
 mapping FP8 vectors to LLVM vector types
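
For context (an illustration of the constraint, not patch code): LLVM vector
types require a scalar element type, so the `<1 x i8>` that `__mfp8` converts
to cannot itself be the element of a wider vector. A sketch via the llvm-c
API (the patch itself uses the C++ `llvm::VectorType::get`):

    #include <llvm-c/Core.h>

    static LLVMTypeRef make_fp8_vector(unsigned n) {
      LLVMTypeRef i8 = LLVMInt8Type();
      /* __mfp8 converts to <1 x i8>; a vector whose element is that
         <1 x i8> type would be invalid, since LLVM vector element types
         must be scalar, so the element type is taken to be i8 directly: */
      return LLVMVectorType(i8, n);   /* <n x i8> */
    }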

---
 clang/lib/CodeGen/CodeGenTypes.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index fd3327cf9acd890..5b3048ae2f21cb8 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -512,6 +512,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
       {
         ASTContext::BuiltinVectorTypeInfo Info =
             Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(Ty));
+        // The `__mfp8` type maps to `<1 x i8>`, which cannot be used as the
+        // element type of a wider vector, so bypass the call to `ConvertType`
+        // for the element type and create the vector type directly.
         auto *EltTy = Info.ElementType->isMFloat8Type()
                           ? llvm::Type::getInt8Ty(getLLVMContext())
                           : ConvertType(Info.ElementType);

>From a4d998ffa444f7ba539a6375f6043ac80cdef05c Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 6 Dec 2024 15:44:58 +0000
Subject: [PATCH 03/19] [Clang][AArch64] Allow FP8 Neon vector types to be used
 by __builtin_shufflevector

The Neon vector types for FP8 (`__MFloat8x8_t` and `__MFloat8x16_t`) are
implemented as builtin types and need a special case in
`__builtin_shufflevector`.
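
Usage sketch (the shapes are taken from the tests added below): both the
constant-mask form and the two-argument variable-mask form now accept the
builtin FP8 vector types:

    typedef __attribute__((neon_vector_type(8))) signed char int8x8_t;
    typedef __MFloat8x8_t mfloat8x8_t;

    mfloat8x8_t reverse(mfloat8x8_t x) {
      return __builtin_shufflevector(x, x, 7, 6, 5, 4, 3, 2, 1, 0);
    }

    mfloat8x8_t permute(mfloat8x8_t x, int8x8_t idx) {
      return __builtin_shufflevector(x, idx); /* idx length must match x */
    }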
---
 clang/include/clang/AST/Type.h                |   4 +
 .../clang/Basic/DiagnosticSemaKinds.td        |   5 +
 clang/lib/AST/Type.cpp                        |  13 ++
 clang/lib/Sema/SemaChecking.cpp               |  39 +++++-
 .../AArch64/builtin-shufflevector-fp8.c       | 123 ++++++++++++++++++
 clang/test/Sema/builtin-shufflevector.c       |  30 +++++
 6 files changed, 208 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
 create mode 100644 clang/test/Sema/builtin-shufflevector.c

diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index aa313719a657552..fbc62f61ad5a553 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2404,6 +2404,10 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
   /// SVE vector or predicate, excluding tuple types such as svint32x4_t.
   bool isSveVLSBuiltinType() const;
 
+  /// Determines if this is a *builtin* NEON vector type, i.e. a type not
+  /// built with the `neon_vector_type` attribute.
+  bool isNeonVectorBuiltinType() const;
+
   /// Returns the representative type for the element of an SVE builtin type.
   /// This is used to represent fixed-length SVE vectors created with the
   /// 'arm_sve_vector_bits' type attribute as VectorType.
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index d4e897868f1a9a6..b43f2d9143117ea 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10564,6 +10564,9 @@ def err_vec_builtin_incompatible_vector : Error<
 def err_vsx_builtin_nonconstant_argument : Error<
   "argument %0 to %1 must be a 2-bit unsigned literal (i.e. 0, 1, 2 or 3)">;
 
+def err_shufflevector_incompatible_index_vector : Error<
+  "second argument for __builtin_shufflevector must be an integer vector "
+  "with the same length as the first argument">;
 def err_shufflevector_nonconstant_argument : Error<
   "index for __builtin_shufflevector must be a constant integer">;
 def err_shufflevector_argument_too_large : Error<
@@ -10571,6 +10574,8 @@ def err_shufflevector_argument_too_large : Error<
   "of vector elements">;
 def err_shufflevector_minus_one_is_undefined_behavior_constexpr : Error<
   "index for __builtin_shufflevector not within the bounds of the input vectors; index of -1 found at position %0 is not permitted in a constexpr context">;
+def err_shufflevector_unsupported_result_vector_type : Error<
+  "unsupported vector type for the result">;
 
 def err_convertvector_non_vector : Error<
   "first argument to __builtin_convertvector must be a vector">;
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index fde0746a175705f..fb55bab0a67de68 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2576,6 +2576,19 @@ bool Type::isSveVLSBuiltinType() const {
   return false;
 }
 
+bool Type::isNeonVectorBuiltinType() const {
+  if (const BuiltinType *BT = getAs<BuiltinType>()) {
+    switch (BT->getKind()) {
+    case BuiltinType::MFloat8x8:
+    case BuiltinType::MFloat8x16:
+      return true;
+    default:
+      return false;
+    }
+  }
+  return false;
+}
+
 QualType Type::getSizelessVectorEltType(const ASTContext &Ctx) const {
   assert(isSizelessVectorType() && "Must be sizeless vector type");
   // Currently supports SVE and RVV
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 28dcfaac2e84f57..1a407c7a6cece66 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5130,24 +5130,32 @@ ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) {
     QualType LHSType = TheCall->getArg(0)->getType();
     QualType RHSType = TheCall->getArg(1)->getType();
 
-    if (!LHSType->isVectorType() || !RHSType->isVectorType())
+    if (!LHSType->isVectorType() && !LHSType->isNeonVectorBuiltinType())
       return ExprError(
-          Diag(TheCall->getBeginLoc(), diag::err_vec_builtin_non_vector)
-          << TheCall->getDirectCallee() << /*isMorethantwoArgs*/ false
+          Diag(TheCall->getBeginLoc(), diag::err_builtin_non_vector_type)
+          << "first" << TheCall->getDirectCallee()
+          << /*isMorethantwoArgs*/ false
           << SourceRange(TheCall->getArg(0)->getBeginLoc(),
                          TheCall->getArg(1)->getEndLoc()));
 
-    numElements = LHSType->castAs<VectorType>()->getNumElements();
+    if (auto *Ty = LHSType->getAs<BuiltinType>()) {
+      assert(Ty->getKind() == BuiltinType::MFloat8x8 ||
+             Ty->getKind() == BuiltinType::MFloat8x16);
+      numElements = Ty->getKind() == BuiltinType::MFloat8x8 ? 8 : 16;
+    } else {
+      numElements = LHSType->castAs<VectorType>()->getNumElements();
+    }
+
     unsigned numResElements = TheCall->getNumArgs() - 2;
 
     // Check to see if we have a call with 2 vector arguments, the unary shuffle
     // with mask.  If so, verify that RHS is an integer vector type with the
     // same number of elts as lhs.
     if (TheCall->getNumArgs() == 2) {
-      if (!RHSType->hasIntegerRepresentation() ||
+      if (!RHSType->isVectorType() || !RHSType->hasIntegerRepresentation() ||
           RHSType->castAs<VectorType>()->getNumElements() != numElements)
         return ExprError(Diag(TheCall->getBeginLoc(),
-                              diag::err_vec_builtin_incompatible_vector)
+                              diag::err_shufflevector_incompatible_index_vector)
                          << TheCall->getDirectCallee()
                          << /*isMorethantwoArgs*/ false
                          << SourceRange(TheCall->getArg(1)->getBeginLoc(),
@@ -5160,6 +5168,25 @@ ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) {
                        << SourceRange(TheCall->getArg(0)->getBeginLoc(),
                                       TheCall->getArg(1)->getEndLoc()));
     } else if (numElements != numResElements) {
+      if (auto *Ty = LHSType->getAs<BuiltinType>()) {
+        assert(Ty->getKind() == BuiltinType::MFloat8x8 ||
+               Ty->getKind() == BuiltinType::MFloat8x16);
+        switch (numResElements) {
+        case 8:
+          resType = Context.MFloat8x8Ty;
+          break;
+        case 16:
+          resType = Context.MFloat8x16Ty;
+          break;
+        default:
+          return ExprError(Diag(TheCall->getBeginLoc(),
+                                diag::err_shufflevector_unsupported_result_vector_type)
+                           << TheCall->getDirectCallee()
+                           << /*isMorethantwoArgs*/ false
+                           << SourceRange(TheCall->getArg(0)->getBeginLoc(),
+                                          TheCall->getArg(1)->getEndLoc()));
+        }
+      }
       QualType eltType = LHSType->castAs<VectorType>()->getElementType();
       resType =
           Context.getVectorType(eltType, numResElements, VectorKind::Generic);
diff --git a/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
new file mode 100644
index 000000000000000..45ea81275095374
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
@@ -0,0 +1,123 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-linux -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+typedef __attribute__((neon_vector_type(8))) signed char int8x8_t;
+typedef __attribute__((neon_vector_type(16))) signed char int8x16_t;
+
+typedef __MFloat8x8_t mfloat8x8_t;
+typedef __MFloat8x16_t mfloat8x16_t;
+
+// CHECK-LABEL: define dso_local <8 x i8> @f0(
+// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[X]], <8 x i8> [[X]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT:    ret <8 x i8> [[SHUFFLE]]
+//
+mfloat8x8_t f0(mfloat8x8_t x) {
+  return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @f1(
+// CHECK-SAME: <8 x i8> [[X:%.*]], <8 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[MASK:%.*]] = and <8 x i8> [[P]], splat (i8 7)
+// CHECK-NEXT:    [[SHUF_IDX:%.*]] = extractelement <8 x i8> [[MASK]], i64 0
+// CHECK-NEXT:    [[SHUF_ELT:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX]]
+// CHECK-NEXT:    [[SHUF_INS:%.*]] = insertelement <8 x i8> poison, i8 [[SHUF_ELT]], i64 0
+// CHECK-NEXT:    [[SHUF_IDX1:%.*]] = extractelement <8 x i8> [[MASK]], i64 1
+// CHECK-NEXT:    [[SHUF_ELT2:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX1]]
+// CHECK-NEXT:    [[SHUF_INS3:%.*]] = insertelement <8 x i8> [[SHUF_INS]], i8 [[SHUF_ELT2]], i64 1
+// CHECK-NEXT:    [[SHUF_IDX4:%.*]] = extractelement <8 x i8> [[MASK]], i64 2
+// CHECK-NEXT:    [[SHUF_ELT5:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX4]]
+// CHECK-NEXT:    [[SHUF_INS6:%.*]] = insertelement <8 x i8> [[SHUF_INS3]], i8 [[SHUF_ELT5]], i64 2
+// CHECK-NEXT:    [[SHUF_IDX7:%.*]] = extractelement <8 x i8> [[MASK]], i64 3
+// CHECK-NEXT:    [[SHUF_ELT8:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX7]]
+// CHECK-NEXT:    [[SHUF_INS9:%.*]] = insertelement <8 x i8> [[SHUF_INS6]], i8 [[SHUF_ELT8]], i64 3
+// CHECK-NEXT:    [[SHUF_IDX10:%.*]] = extractelement <8 x i8> [[MASK]], i64 4
+// CHECK-NEXT:    [[SHUF_ELT11:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX10]]
+// CHECK-NEXT:    [[SHUF_INS12:%.*]] = insertelement <8 x i8> [[SHUF_INS9]], i8 [[SHUF_ELT11]], i64 4
+// CHECK-NEXT:    [[SHUF_IDX13:%.*]] = extractelement <8 x i8> [[MASK]], i64 5
+// CHECK-NEXT:    [[SHUF_ELT14:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX13]]
+// CHECK-NEXT:    [[SHUF_INS15:%.*]] = insertelement <8 x i8> [[SHUF_INS12]], i8 [[SHUF_ELT14]], i64 5
+// CHECK-NEXT:    [[SHUF_IDX16:%.*]] = extractelement <8 x i8> [[MASK]], i64 6
+// CHECK-NEXT:    [[SHUF_ELT17:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX16]]
+// CHECK-NEXT:    [[SHUF_INS18:%.*]] = insertelement <8 x i8> [[SHUF_INS15]], i8 [[SHUF_ELT17]], i64 6
+// CHECK-NEXT:    [[SHUF_IDX19:%.*]] = extractelement <8 x i8> [[MASK]], i64 7
+// CHECK-NEXT:    [[SHUF_ELT20:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX19]]
+// CHECK-NEXT:    [[SHUF_INS21:%.*]] = insertelement <8 x i8> [[SHUF_INS18]], i8 [[SHUF_ELT20]], i64 7
+// CHECK-NEXT:    ret <8 x i8> [[SHUF_INS21]]
+//
+mfloat8x8_t f1(mfloat8x8_t x, int8x8_t p) {
+  return __builtin_shufflevector(x, p);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @f3(
+// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[X]], <16 x i8> [[X]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT:    ret <16 x i8> [[SHUFFLE]]
+//
+mfloat8x16_t f3(mfloat8x16_t x) {
+  return __builtin_shufflevector(x, x, 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2,
+                                 1, 0);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @f4(
+// CHECK-SAME: <16 x i8> [[X:%.*]], <16 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[MASK:%.*]] = and <16 x i8> [[P]], splat (i8 15)
+// CHECK-NEXT:    [[SHUF_IDX:%.*]] = extractelement <16 x i8> [[MASK]], i64 0
+// CHECK-NEXT:    [[SHUF_ELT:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX]]
+// CHECK-NEXT:    [[SHUF_INS:%.*]] = insertelement <16 x i8> poison, i8 [[SHUF_ELT]], i64 0
+// CHECK-NEXT:    [[SHUF_IDX1:%.*]] = extractelement <16 x i8> [[MASK]], i64 1
+// CHECK-NEXT:    [[SHUF_ELT2:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX1]]
+// CHECK-NEXT:    [[SHUF_INS3:%.*]] = insertelement <16 x i8> [[SHUF_INS]], i8 [[SHUF_ELT2]], i64 1
+// CHECK-NEXT:    [[SHUF_IDX4:%.*]] = extractelement <16 x i8> [[MASK]], i64 2
+// CHECK-NEXT:    [[SHUF_ELT5:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX4]]
+// CHECK-NEXT:    [[SHUF_INS6:%.*]] = insertelement <16 x i8> [[SHUF_INS3]], i8 [[SHUF_ELT5]], i64 2
+// CHECK-NEXT:    [[SHUF_IDX7:%.*]] = extractelement <16 x i8> [[MASK]], i64 3
+// CHECK-NEXT:    [[SHUF_ELT8:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX7]]
+// CHECK-NEXT:    [[SHUF_INS9:%.*]] = insertelement <16 x i8> [[SHUF_INS6]], i8 [[SHUF_ELT8]], i64 3
+// CHECK-NEXT:    [[SHUF_IDX10:%.*]] = extractelement <16 x i8> [[MASK]], i64 4
+// CHECK-NEXT:    [[SHUF_ELT11:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX10]]
+// CHECK-NEXT:    [[SHUF_INS12:%.*]] = insertelement <16 x i8> [[SHUF_INS9]], i8 [[SHUF_ELT11]], i64 4
+// CHECK-NEXT:    [[SHUF_IDX13:%.*]] = extractelement <16 x i8> [[MASK]], i64 5
+// CHECK-NEXT:    [[SHUF_ELT14:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX13]]
+// CHECK-NEXT:    [[SHUF_INS15:%.*]] = insertelement <16 x i8> [[SHUF_INS12]], i8 [[SHUF_ELT14]], i64 5
+// CHECK-NEXT:    [[SHUF_IDX16:%.*]] = extractelement <16 x i8> [[MASK]], i64 6
+// CHECK-NEXT:    [[SHUF_ELT17:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX16]]
+// CHECK-NEXT:    [[SHUF_INS18:%.*]] = insertelement <16 x i8> [[SHUF_INS15]], i8 [[SHUF_ELT17]], i64 6
+// CHECK-NEXT:    [[SHUF_IDX19:%.*]] = extractelement <16 x i8> [[MASK]], i64 7
+// CHECK-NEXT:    [[SHUF_ELT20:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX19]]
+// CHECK-NEXT:    [[SHUF_INS21:%.*]] = insertelement <16 x i8> [[SHUF_INS18]], i8 [[SHUF_ELT20]], i64 7
+// CHECK-NEXT:    [[SHUF_IDX22:%.*]] = extractelement <16 x i8> [[MASK]], i64 8
+// CHECK-NEXT:    [[SHUF_ELT23:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX22]]
+// CHECK-NEXT:    [[SHUF_INS24:%.*]] = insertelement <16 x i8> [[SHUF_INS21]], i8 [[SHUF_ELT23]], i64 8
+// CHECK-NEXT:    [[SHUF_IDX25:%.*]] = extractelement <16 x i8> [[MASK]], i64 9
+// CHECK-NEXT:    [[SHUF_ELT26:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX25]]
+// CHECK-NEXT:    [[SHUF_INS27:%.*]] = insertelement <16 x i8> [[SHUF_INS24]], i8 [[SHUF_ELT26]], i64 9
+// CHECK-NEXT:    [[SHUF_IDX28:%.*]] = extractelement <16 x i8> [[MASK]], i64 10
+// CHECK-NEXT:    [[SHUF_ELT29:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX28]]
+// CHECK-NEXT:    [[SHUF_INS30:%.*]] = insertelement <16 x i8> [[SHUF_INS27]], i8 [[SHUF_ELT29]], i64 10
+// CHECK-NEXT:    [[SHUF_IDX31:%.*]] = extractelement <16 x i8> [[MASK]], i64 11
+// CHECK-NEXT:    [[SHUF_ELT32:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX31]]
+// CHECK-NEXT:    [[SHUF_INS33:%.*]] = insertelement <16 x i8> [[SHUF_INS30]], i8 [[SHUF_ELT32]], i64 11
+// CHECK-NEXT:    [[SHUF_IDX34:%.*]] = extractelement <16 x i8> [[MASK]], i64 12
+// CHECK-NEXT:    [[SHUF_ELT35:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX34]]
+// CHECK-NEXT:    [[SHUF_INS36:%.*]] = insertelement <16 x i8> [[SHUF_INS33]], i8 [[SHUF_ELT35]], i64 12
+// CHECK-NEXT:    [[SHUF_IDX37:%.*]] = extractelement <16 x i8> [[MASK]], i64 13
+// CHECK-NEXT:    [[SHUF_ELT38:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX37]]
+// CHECK-NEXT:    [[SHUF_INS39:%.*]] = insertelement <16 x i8> [[SHUF_INS36]], i8 [[SHUF_ELT38]], i64 13
+// CHECK-NEXT:    [[SHUF_IDX40:%.*]] = extractelement <16 x i8> [[MASK]], i64 14
+// CHECK-NEXT:    [[SHUF_ELT41:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX40]]
+// CHECK-NEXT:    [[SHUF_INS42:%.*]] = insertelement <16 x i8> [[SHUF_INS39]], i8 [[SHUF_ELT41]], i64 14
+// CHECK-NEXT:    [[SHUF_IDX43:%.*]] = extractelement <16 x i8> [[MASK]], i64 15
+// CHECK-NEXT:    [[SHUF_ELT44:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX43]]
+// CHECK-NEXT:    [[SHUF_INS45:%.*]] = insertelement <16 x i8> [[SHUF_INS42]], i8 [[SHUF_ELT44]], i64 15
+// CHECK-NEXT:    ret <16 x i8> [[SHUF_INS45]]
+//
+mfloat8x16_t f4(mfloat8x16_t x, int8x16_t p) {
+  return __builtin_shufflevector(x, p);
+}
diff --git a/clang/test/Sema/builtin-shufflevector.c b/clang/test/Sema/builtin-shufflevector.c
new file mode 100644
index 000000000000000..c2dabb9d6585a2a
--- /dev/null
+++ b/clang/test/Sema/builtin-shufflevector.c
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple aarch64 -fsyntax-only -verify %s
+
+// REQUIRES: aarch64-registered-target
+
+typedef __attribute__((neon_vector_type(8))) signed char int8x8_t;
+typedef __attribute__((neon_vector_type(16))) signed char int8x16_t;
+
+typedef __MFloat8x8_t mfloat8x8_t;
+typedef __MFloat8x16_t mfloat8x16_t;
+
+int8x8_t non_vector(int x) {
+  return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);
+  // expected-error@-1 {{first argument to '__builtin_shufflevector' must be of vector type}}
+}
+
+mfloat8x8_t unsupported_vector(mfloat8x8_t x) {
+  return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0, 0);
+  // expected-error@-1 {{unsupported vector type for the result}}
+}
+
+int8x8_t non_vector_index(int8x8_t x, int p) {
+  return __builtin_shufflevector(x, p);
+  // expected-error@-1 {{second argument for __builtin_shufflevector must be an integer vector with the same length as the first argument}}
+}
+
+int8x8_t bad_vector_index_length(int8x8_t x, int8x16_t p) {
+  return __builtin_shufflevector(x, p);
+  // expected-error@-1 {{second argument for __builtin_shufflevector must be an integer vector with the same length as the first argument}}
+}
+

>From 9496937a20f0ac206188bd4732da106cef670aaa Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Mon, 16 Dec 2024 09:58:22 +0000
Subject: [PATCH 04/19] [fixup] Fix formatting (NFC)

---
 clang/lib/Sema/SemaChecking.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 1a407c7a6cece66..8093a80dcc33b5b 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5179,12 +5179,12 @@ ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) {
           resType = Context.MFloat8x16Ty;
           break;
         default:
-          return ExprError(Diag(TheCall->getBeginLoc(),
-                                diag::err_shufflevector_unsupported_result_vector_type)
-                           << TheCall->getDirectCallee()
-                           << /*isMorethantwoArgs*/ false
-                           << SourceRange(TheCall->getArg(0)->getBeginLoc(),
-                                          TheCall->getArg(1)->getEndLoc()));
+          return ExprError(
+              Diag(TheCall->getBeginLoc(),
+                   diag::err_shufflevector_unsupported_result_vector_type)
+              << TheCall->getDirectCallee() << /*isMorethantwoArgs*/ false
+              << SourceRange(TheCall->getArg(0)->getBeginLoc(),
+                             TheCall->getArg(1)->getEndLoc()));
         }
       }
       QualType eltType = LHSType->castAs<VectorType>()->getElementType();

>From e6da3ea7803a08cc796f7bbe5254f2350b6050cc Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Mon, 13 Jan 2025 17:03:18 +0000
Subject: [PATCH 05/19] [fixup] Address review comments, minor tweaks

---
 clang/lib/AST/Type.cpp                           |  6 ++++--
 .../CodeGen/AArch64/builtin-shufflevector-fp8.c  | 16 ++++++++--------
 clang/test/Sema/builtin-shufflevector.c          |  1 -
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index fb55bab0a67de68..a1dea017c866e70 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2579,8 +2579,10 @@ bool Type::isSveVLSBuiltinType() const {
 bool Type::isNeonVectorBuiltinType() const {
   if (const BuiltinType *BT = getAs<BuiltinType>()) {
     switch (BT->getKind()) {
-    case BuiltinType::MFloat8x8:
-    case BuiltinType::MFloat8x16:
+#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId)                \
+  case BuiltinType::Id:
+#define SVE_TYPE(Name, Id, SingletonId)
+#include "clang/Basic/AArch64SVEACLETypes.def"
       return true;
     default:
       return false;
diff --git a/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
index 45ea81275095374..74daa7a98914f92 100644
--- a/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
+++ b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
@@ -9,17 +9,17 @@ typedef __attribute__((neon_vector_type(16))) signed char int8x16_t;
 typedef __MFloat8x8_t mfloat8x8_t;
 typedef __MFloat8x16_t mfloat8x16_t;
 
-// CHECK-LABEL: define dso_local <8 x i8> @f0(
+// CHECK-LABEL: define dso_local <8 x i8> @test_8x8(
 // CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[X]], <8 x i8> [[X]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
 // CHECK-NEXT:    ret <8 x i8> [[SHUFFLE]]
 //
-mfloat8x8_t f0(mfloat8x8_t x) {
+mfloat8x8_t test_8x8(mfloat8x8_t x) {
   return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);
 }
 
-// CHECK-LABEL: define dso_local <8 x i8> @f1(
+// CHECK-LABEL: define dso_local <8 x i8> @test_8x8_v(
 // CHECK-SAME: <8 x i8> [[X:%.*]], <8 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[MASK:%.*]] = and <8 x i8> [[P]], splat (i8 7)
@@ -49,22 +49,22 @@ mfloat8x8_t f0(mfloat8x8_t x) {
 // CHECK-NEXT:    [[SHUF_INS21:%.*]] = insertelement <8 x i8> [[SHUF_INS18]], i8 [[SHUF_ELT20]], i64 7
 // CHECK-NEXT:    ret <8 x i8> [[SHUF_INS21]]
 //
-mfloat8x8_t f1(mfloat8x8_t x, int8x8_t p) {
+mfloat8x8_t test_8x8_v(mfloat8x8_t x, int8x8_t p) {
   return __builtin_shufflevector(x, p);
 }
 
-// CHECK-LABEL: define dso_local <16 x i8> @f3(
+// CHECK-LABEL: define dso_local <16 x i8> @test_8x16(
 // CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[X]], <16 x i8> [[X]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 // CHECK-NEXT:    ret <16 x i8> [[SHUFFLE]]
 //
-mfloat8x16_t f3(mfloat8x16_t x) {
+mfloat8x16_t test_8x16(mfloat8x16_t x) {
   return __builtin_shufflevector(x, x, 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2,
                                  1, 0);
 }
 
-// CHECK-LABEL: define dso_local <16 x i8> @f4(
+// CHECK-LABEL: define dso_local <16 x i8> @test_8x16_v(
 // CHECK-SAME: <16 x i8> [[X:%.*]], <16 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[MASK:%.*]] = and <16 x i8> [[P]], splat (i8 15)
@@ -118,6 +118,6 @@ mfloat8x16_t f3(mfloat8x16_t x) {
 // CHECK-NEXT:    [[SHUF_INS45:%.*]] = insertelement <16 x i8> [[SHUF_INS42]], i8 [[SHUF_ELT44]], i64 15
 // CHECK-NEXT:    ret <16 x i8> [[SHUF_INS45]]
 //
-mfloat8x16_t f4(mfloat8x16_t x, int8x16_t p) {
+mfloat8x16_t test_8x16_v(mfloat8x16_t x, int8x16_t p) {
   return __builtin_shufflevector(x, p);
 }
diff --git a/clang/test/Sema/builtin-shufflevector.c b/clang/test/Sema/builtin-shufflevector.c
index c2dabb9d6585a2a..9094f51b9f7e5f0 100644
--- a/clang/test/Sema/builtin-shufflevector.c
+++ b/clang/test/Sema/builtin-shufflevector.c
@@ -6,7 +6,6 @@ typedef __attribute__((neon_vector_type(8))) signed char int8x8_t;
 typedef __attribute__((neon_vector_type(16))) signed char int8x16_t;
 
 typedef __MFloat8x8_t mfloat8x8_t;
-typedef __MFloat8x16_t mfloat8x16_t;
 
 int8x8_t non_vector(int x) {
   return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);

>From 42b36c55b3d17a12c89ce6d05f6dfe57a8513b24 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Thu, 9 Jan 2025 14:02:37 +0000
Subject: [PATCH 06/19] FP8 bitcast
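
Usage sketch (mirroring the tests added below): C-style and reinterpret casts
now bitcast between the builtin FP8 Neon vector types and same-sized vectors
or integral scalars:

    #include <arm_neon.h>

    int8x8_t to_bytes(mfloat8x8_t x) {
      return (int8x8_t)x;       /* 64 -> 64 bits: a no-op bitcast */
    }

    mfloat8x16_t from_p128(poly128_t x) {
      return (mfloat8x16_t)x;   /* 128-bit integral scalar: accepted */
    }

    /* Rejected: (float32x4_t) applied to an mfloat8x8_t (64 vs. 128 bits),
       and casts to/from non-integral scalars such as double or void *. */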

---
 .../clang/Basic/DiagnosticSemaKinds.td        |   2 +
 clang/include/clang/Sema/Sema.h               |   5 +
 clang/lib/Sema/SemaCast.cpp                   |  22 ++
 clang/lib/Sema/SemaExpr.cpp                   |  48 +++++
 clang/test/CodeGen/AArch64/fp8-cast.c         | 193 ++++++++++++++++++
 clang/test/Sema/aarch64-fp8-cast.c            | 102 +++++++++
 6 files changed, 372 insertions(+)
 create mode 100644 clang/test/CodeGen/AArch64/fp8-cast.c
 create mode 100644 clang/test/Sema/aarch64-fp8-cast.c

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index b43f2d9143117ea..9ba671d0bbe78cd 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3415,6 +3415,8 @@ def err_typecheck_vector_not_convertable : Error<
   "cannot convert between vector values of different size (%0 and %1)">;
 def err_typecheck_vector_not_convertable_non_scalar : Error<
   "cannot convert between vector and non-scalar values (%0 and %1)">;
+def err_typecheck_vector_not_convertable_non_vector : Error<
+  "cannot convert between vector type %0 and non-vector type %1">;
 def err_typecheck_vector_lengths_not_equal : Error<
   "vector operands do not have the same number of elements (%0 and %1)">;
 def warn_typecheck_vector_element_sizes_not_equal : Warning<
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
index a41f16f6dc8c9b7..8751e016b96fbc0 100644
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -7429,6 +7429,11 @@ class Sema final : public SemaBase {
   /// the perspective of SVE bitcasts.
   bool isValidSveBitcast(QualType srcType, QualType destType);
 
+  /// Check for a bitcast between a regular vector type and a builtin Neon
+  /// vector type.
+  bool isValidNeonVectorBuiltinTypeBitcast(SourceRange OpRange, QualType SrcTy,
+                                           QualType DstTy);
+
   /// Are the two types matrix types and do they have the same dimensions i.e.
   /// do they have the same number of rows and the same number of columns?
   bool areMatrixTypesOfTheSameDimension(QualType srcTy, QualType destTy);
diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp
index f98857f852b5af3..a619dd556c51c6d 100644
--- a/clang/lib/Sema/SemaCast.cpp
+++ b/clang/lib/Sema/SemaCast.cpp
@@ -2386,6 +2386,16 @@ static TryCastResult TryReinterpretCast(Sema &Self, ExprResult &SrcExpr,
     return TC_Success;
   }
 
+  // Allow bitcasting between a regular vector type or a scalar, and a builtin
+  // Neon vector type.
+  if ((SrcType->isNeonVectorBuiltinType() ||
+       DestType->isNeonVectorBuiltinType())) {
+    if (!Self.isValidNeonVectorBuiltinTypeBitcast(OpRange, SrcType, DestType))
+      return TC_Failed;
+    Kind = CK_BitCast;
+    return TC_Success;
+  }
+
   // Allow reinterpret_casts between vectors of the same size and
   // between vectors and integers of the same size.
   bool destIsVector = DestType->isVectorType();
@@ -3009,6 +3019,18 @@ void CastOperation::CheckCStyleCast() {
     return;
   }
 
+  // Allow bitcasting between a regular vector type and a builtin Neon vector
+  // type.
+  if (SrcType->isNeonVectorBuiltinType() ||
+      DestType->isNeonVectorBuiltinType()) {
+    if (!Self.isValidNeonVectorBuiltinTypeBitcast(OpRange, SrcType, DestType)) {
+      SrcExpr = ExprError();
+      return;
+    }
+    Kind = CK_BitCast;
+    return;
+  }
+
   // Allow bitcasting between compatible RVV vector types.
   if ((SrcType->isVectorType() || DestType->isVectorType()) &&
       Self.RISCV().isValidRVVBitcast(SrcType, DestType)) {
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index ae40895980d90a9..1fc621ac490b6e1 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -7530,6 +7530,54 @@ bool Sema::isValidSveBitcast(QualType srcTy, QualType destTy) {
          ValidScalableConversion(destTy, srcTy);
 }
 
+bool Sema::isValidNeonVectorBuiltinTypeBitcast(SourceRange OpRange,
+                                               QualType SrcTy, QualType DstTy) {
+  assert(SrcTy->isNeonVectorBuiltinType() || DstTy->isNeonVectorBuiltinType());
+
+  auto checkCast = [&](QualType BT, QualType OT) -> unsigned {
+    if (OT->isNeonVectorBuiltinType()) {
+      if (BT.getCanonicalType() != OT.getCanonicalType())
+        return diag::err_invalid_conversion_between_vectors;
+      return 0;
+    }
+
+    if (!OT->isVectorType()) {
+      if (!OT->isScalarType())
+        return diag::err_typecheck_vector_not_convertable_non_scalar;
+
+      if (!OT->isIntegralType(Context))
+        return diag::err_typecheck_vector_not_convertable_non_vector;
+    }
+
+    uint64_t OTLen;
+    QualType OTEltTy;
+    if (const VectorType *VT = OT->getAs<VectorType>()) {
+      OTLen = VT->getNumElements();
+      OTEltTy = VT->getElementType();
+    } else {
+      OTLen = 1;
+      OTEltTy = OT;
+    }
+
+    uint64_t BTLen =
+        BT->getAs<BuiltinType>()->getKind() == BuiltinType::MFloat8x8 ? 8u
+                                                                      : 16u;
+    if (BTLen * 8u != OTLen * Context.getTypeSize(OTEltTy))
+      return diag::err_invalid_conversion_between_vectors;
+    return 0;
+  };
+
+  QualType BT = SrcTy;
+  QualType OT = DstTy;
+  if (!SrcTy->isNeonVectorBuiltinType())
+    std::swap(BT, OT);
+
+  if (unsigned msg = checkCast(BT, OT))
+    return Diag(OpRange.getBegin(), msg) << BT << OT << OpRange;
+
+  return true;
+}
+
 bool Sema::areMatrixTypesOfTheSameDimension(QualType srcTy, QualType destTy) {
   if (!destTy->isMatrixType() || !srcTy->isMatrixType())
     return false;
diff --git a/clang/test/CodeGen/AArch64/fp8-cast.c b/clang/test/CodeGen/AArch64/fp8-cast.c
new file mode 100644
index 000000000000000..c3a06b0d45157db
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-cast.c
@@ -0,0 +1,193 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX
+
+// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -disable-O0-optnone -Werror -Wall -S -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// Bitcast between FP8 Neon vectors
+// CHECK-LABEL: define dso_local <8 x i8> @test_f8_f8(
+// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <8 x i8> [[X]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z10test_f8_f8u13__MFloat8x8_t(
+// CHECK-CXX-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <8 x i8> [[X]]
+//
+mfloat8x8_t test_f8_f8(mfloat8x8_t x) {
+    return (mfloat8x8_t) x;
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_f8(
+// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <16 x i8> [[X]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z11testq_f8_f8u14__MFloat8x16_t(
+// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <16 x i8> [[X]]
+//
+mfloat8x16_t testq_f8_f8(mfloat8x16_t x) {
+    return (mfloat8x16_t) x;
+}
+
+// Bitcast between FP8 and int8 Neon vectors
+// CHECK-LABEL: define dso_local <8 x i8> @test_f8_s8(
+// CHECK-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <8 x i8> [[X]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z10test_f8_s810__Int8x8_t(
+// CHECK-CXX-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <8 x i8> [[X]]
+//
+mfloat8x8_t test_f8_s8(int8x8_t x) {
+    return (mfloat8x8_t) x;
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_s8_f8(
+// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <8 x i8> [[X]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x i8> @_Z10test_s8_f8u13__MFloat8x8_t(
+// CHECK-CXX-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <8 x i8> [[X]]
+//
+int8x8_t test_s8_f8(mfloat8x8_t x) {
+    return (int8x8_t) x;
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_s8(
+// CHECK-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <16 x i8> [[X]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z11testq_f8_s811__Int8x16_t(
+// CHECK-CXX-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <16 x i8> [[X]]
+//
+mfloat8x16_t testq_f8_s8(int8x16_t x) {
+    return (mfloat8x16_t) x;
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @testq_s8_f8(
+// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    ret <16 x i8> [[X]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <16 x i8> @_Z11testq_s8_f8u14__MFloat8x16_t(
+// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    ret <16 x i8> [[X]]
+//
+int8x16_t testq_s8_f8(mfloat8x16_t x) {
+    return (int8x16_t) x;
+}
+
+// Bitcast between FP8 and float32 Neon vectors
+// CHECK-LABEL: define dso_local <8 x i8> @test_f8_f32(
+// CHECK-SAME: <2 x float> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <8 x i8>
+// CHECK-NEXT:    ret <8 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z11test_f8_f3213__Float32x2_t(
+// CHECK-CXX-SAME: <2 x float> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <8 x i8>
+// CHECK-CXX-NEXT:    ret <8 x i8> [[TMP0]]
+//
+mfloat8x8_t test_f8_f32(float32x2_t x) {
+    return (mfloat8x8_t) x;
+}
+
+// CHECK-LABEL: define dso_local <2 x float> @test_f32_f8(
+// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[X]] to <2 x float>
+// CHECK-NEXT:    ret <2 x float> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z11test_f32_f8u13__MFloat8x8_t(
+// CHECK-CXX-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <8 x i8> [[X]] to <2 x float>
+// CHECK-CXX-NEXT:    ret <2 x float> [[TMP0]]
+//
+float32x2_t test_f32_f8(mfloat8x8_t x) {
+    return (float32x2_t) x;
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_f32(
+// CHECK-SAME: <4 x float> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[X]] to <16 x i8>
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z12testq_f8_f3213__Float32x4_t(
+// CHECK-CXX-SAME: <4 x float> noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <4 x float> [[X]] to <16 x i8>
+// CHECK-CXX-NEXT:    ret <16 x i8> [[TMP0]]
+//
+mfloat8x16_t testq_f8_f32(float32x4_t x) {
+    return (mfloat8x16_t) x;
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @testq_f32_f8(
+// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to <4 x float>
+// CHECK-NEXT:    ret <4 x float> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z12testq_f32_f8u14__MFloat8x16_t(
+// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to <4 x float>
+// CHECK-CXX-NEXT:    ret <4 x float> [[TMP0]]
+//
+float32x4_t testq_f32_f8(mfloat8x16_t x) {
+    return (float32x4_t) x;
+}
+
+// Bitcast between FP8 and poly128_t (which is integral)
+// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_p128(
+// CHECK-SAME: i128 noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast i128 [[X]] to <16 x i8>
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z13testq_f8_p128o(
+// CHECK-CXX-SAME: i128 noundef [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast i128 [[X]] to <16 x i8>
+// CHECK-CXX-NEXT:    ret <16 x i8> [[TMP0]]
+//
+mfloat8x16_t testq_f8_p128(poly128_t x) {
+    return (mfloat8x16_t) x;
+}
+
+// CHECK-LABEL: define dso_local i128 @testq_p128_f8(
+// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to i128
+// CHECK-NEXT:    ret i128 [[TMP0]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef i128 @_Z13testq_p128_f8u14__MFloat8x16_t(
+// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to i128
+// CHECK-CXX-NEXT:    ret i128 [[TMP0]]
+//
+poly128_t testq_p128_f8(mfloat8x16_t x) {
+    return (poly128_t) x;
+}
diff --git a/clang/test/Sema/aarch64-fp8-cast.c b/clang/test/Sema/aarch64-fp8-cast.c
new file mode 100644
index 000000000000000..a7b7d2a4bfafd3b
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-cast.c
@@ -0,0 +1,102 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -verify -emit-llvm -o - %s
+
+// REQUIRES: aarch64-registered-target
+
+// Bitcast between FP8 Neon vectors
+mfloat8x8_t err_test_f8_f8(mfloat8x16_t x) {
+    return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (aka '__MFloat8x16_t') and 'mfloat8x8_t' (aka '__MFloat8x8_t') of different size}}
+}
+
+mfloat8x16_t err_testq_f8_f8(mfloat8x8_t x) {
+    return (mfloat8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and 'mfloat8x16_t' (aka '__MFloat8x16_t') of different size}}
+}
+
+// Bitcast between FP8 and int8 Neon vectors
+mfloat8x8_t err_test_f8_s8(int8x16_t x) {
+    return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and 'int8x16_t' (vector of 16 'int8_t' values) of different size}}
+}
+
+int8x8_t err_test_s8_f8(mfloat8x16_t x) {
+    return (int8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (aka '__MFloat8x16_t') and 'int8x8_t' (vector of 8 'int8_t' values) of different size}}
+}
+
+mfloat8x16_t err_testq_f8_s8(int8x8_t x) {
+    return (mfloat8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (aka '__MFloat8x16_t') and 'int8x8_t' (vector of 8 'int8_t' values) of different size}}
+}
+
+int8x16_t err_testq_s8_f8(mfloat8x8_t x) {
+    return (int8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and 'int8x16_t' (vector of 16 'int8_t' values) of different size}}
+}
+
+// Bitcast between FP8 and float32 Neon vectors
+mfloat8x8_t err_test_f8_f32(float32x4_t x) {
+    return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and 'float32x4_t' (vector of 4 'float32_t' values) of different size}}
+}
+
+float32x2_t err_test_f32_f8(mfloat8x16_t x) {
+    return (float32x2_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (aka '__MFloat8x16_t') and 'float32x2_t' (vector of 2 'float32_t' values) of different size}}
+}
+
+mfloat8x16_t err_testq_f8_f32(float32x2_t x) {
+    return (mfloat8x16_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (aka '__MFloat8x16_t') and 'float32x2_t' (vector of 2 'float32_t' values) of different size}}
+}
+
+float32x4_t err_testq_f32_f8(mfloat8x8_t x) {
+    return (float32x4_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and 'float32x4_t' (vector of 4 'float32_t' values) of different size}}
+}
+
+// Bitcast between FP8 and poly128_t (which is integral)
+mfloat8x8_t err_testq_f8_p128(poly128_t x) {
+    return (mfloat8x8_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and 'poly128_t' (aka 'unsigned __int128') of different size}}
+}
+
+poly128_t err_testq_p128_f8(mfloat8x8_t x) {
+    return (poly128_t) x;
+// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and 'poly128_t' (aka 'unsigned __int128') of different size}}
+}
+
+// Bitcast between FP8 and a non-integral type
+mfloat8x8_t err_test_f8_ptr(void *p) {
+    return (mfloat8x8_t) p;
+// expected-error@-1 {{cannot convert between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and non-vector type 'void *'}}
+}
+
+void *err_test_ptr_f8(mfloat8x8_t v) {
+    return (void *) v;
+// expected-error@-1 {{cannot convert between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and non-vector type 'void *'}}
+}
+
+mfloat8x8_t err_test_f8_dbl(double v) {
+    return (mfloat8x8_t) v;
+// expected-error@-1 {{cannot convert between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and non-vector type 'double'}}
+}
+
+double err_test_dbl_f8(mfloat8x8_t v) {
+    return (double) v;
+// expected-error@-1 {{cannot convert between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and non-vector type 'double'}}
+}
+
+struct S {
+    char ch[16];
+};
+
+mfloat8x16_t err_test_f8_agg(struct S s) {
+    return (mfloat8x16_t) s;
+// expected-error@-1 {{cannot convert between vector and non-scalar values ('mfloat8x16_t' (aka '__MFloat8x16_t') and 'struct S')}}
+}
+
+struct S err_test_agg_f8(mfloat8x16_t v) {
+    return (struct S) v;
+// expected-error@-1 {{cannot convert between vector and non-scalar values ('mfloat8x16_t' (aka '__MFloat8x16_t') and 'struct S')}}
+}
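
For contrast with the rejected casts above: same-width bitcasts are accepted with no
diagnostic. A minimal sketch (illustrative function names; assumes arm_neon.h and the
+neon/+fp8 features, matching the CodeGen tests earlier in this patch):

    #include <arm_neon.h>

    // 128-bit to 128-bit: both sides are 16 bytes wide, so Sema accepts the cast.
    int8x16_t ok_s8_f8(mfloat8x16_t x) {
        return (int8x16_t) x;
    }

    // 64-bit to 64-bit works the same way.
    mfloat8x8_t ok_f8_s8(int8x8_t x) {
        return (mfloat8x8_t) x;
    }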

>From de28a7692f27d9d2ecac57b308e0da38f5f335af Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 6 Dec 2024 19:24:16 +0000
Subject: [PATCH 07/19] [AArch64] Add Neon FP8 conversion intrinsics

---
 clang/include/clang/Basic/arm_neon.td         |  23 ++++
 clang/include/clang/Basic/arm_neon_incl.td    |   2 +
 clang/lib/CodeGen/CGBuiltin.cpp               | 120 +++++++++++++++++-
 clang/lib/CodeGen/CodeGenFunction.h           |   3 +
 clang/utils/TableGen/NeonEmitter.cpp          |  22 +++-
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  22 ++++
 .../lib/Target/AArch64/AArch64InstrFormats.td |  46 +++++--
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  14 +-
 8 files changed, 230 insertions(+), 22 deletions(-)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ef89fa4358dfeb0..b79b4749371b501 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2125,6 +2125,29 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in {
   }
 }
 
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,bf16,neon" in {
+  def VBF1CVT_BF16_MF8        : VInst<"vcvt1_bf16_mf8_fpm",      "(QB).V", "m">;
+  def VBF1CVT_LOW_BF16_MF8    : VInst<"vcvt1_low_bf16_mf8_fpm",  "B.V",    "Qm">;
+  def VBF2CVTL_BF16_MF8       : VInst<"vcvt2_bf16_mf8_fpm",      "(QB).V", "m">;
+  def VBF2CVTL_LOW_BF16_MF8   : VInst<"vcvt2_low_bf16_mf8_fpm",  "B.V",    "Qm">;
+  def VBF1CVTL2_HIGH_BF16_MF8 : VInst<"vcvt1_high_bf16_mf8_fpm", "B.V",    "Qm">;
+  def VBF2CVTL2_HIGH_BF16_MF8 : VInst<"vcvt2_high_bf16_mf8_fpm", "B.V",    "Qm">;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
+  def VF1CVT_F16_MF8        : VInst<"vcvt1_f16_mf8_fpm",      "(>QF).V", "m">;
+  def VF1CVT_LOW_F16_MF8    : VInst<"vcvt1_low_f16_mf8_fpm",  "(>F).V",  "Qm">;
+  def VF2CVTL_F16_MF8       : VInst<"vcvt2_f16_mf8_fpm",      "(>QF).V", "m">;
+  def VF2CVTL_LOW_F16_MF8   : VInst<"vcvt2_low_f16_mf8_fpm",  "(>F).V",  "Qm">;
+  def VF1CVTL2_HIGH_F16_MF8 : VInst<"vcvt1_high_f16_mf8_fpm", "(>F).V",  "Qm">;
+  def VF2CVTL2_HIGH_F16_MF8 : VInst<"vcvt2_high_f16_mf8_fpm", "(>F).V",  "Qm">;
+
+  def VCVTN_LOW_F8_F32  : VInst<"vcvt_mf8_f32_fpm",      ".(>>QF)(>>QF)V",  "m">;
+  def VCVTN_HIGH_F8_F32 : VInst<"vcvt_high_mf8_f32_fpm", ".(q)(>>F)(>>F)V", "Qm">;
+  def VCVTN_F8_F16      : VInst<"vcvt_mf8_f16_fpm",      ".(>F)(>F)V",      "m">;
+  def VCVTNQ_F8_F16     : VInst<"vcvtq_mf8_f16_fpm",     ".(>F)(>F)V",      "Qm">;
+}
+
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
   def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
   def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;
diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td
index fd800e5a6278e4a..91a2bf3020b9a3f 100644
--- a/clang/include/clang/Basic/arm_neon_incl.td
+++ b/clang/include/clang/Basic/arm_neon_incl.td
@@ -243,6 +243,7 @@ def OP_UNAVAILABLE : Operation {
 // B: change to BFloat16
 // P: change to polynomial category.
 // p: change polynomial to equivalent integer category. Otherwise nop.
+// V: change to fpm_t
 //
 // >: double element width (vector size unchanged).
 // <: half element width (vector size unchanged).
@@ -301,6 +302,7 @@ class Inst <string n, string p, string t, Operation o, list<ImmCheck> ch = []>{
 class SInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
 class IInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
 class WInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
+class VInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
 
 // The following instruction classes are implemented via operators
 // instead of builtins. As such these declarations are only used for
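
To make the new V modifier concrete: a trailing V in a prototype string appends an
fpm_t argument (the scalar floating-point mode word) to the intrinsic. A sketch of the
C declarations the VInst records above should produce, as exercised by the tests added
in a follow-up commit (arm_neon.h attribute spelling omitted):

    /* "(QB).V" on "m": quad bfloat result, default mfloat8x8_t operand, fpm_t */
    bfloat16x8_t vcvt1_bf16_mf8_fpm(mfloat8x8_t vn, fpm_t fpm);

    /* "B.V" on "Qm": bfloat16x8_t result, mfloat8x16_t operand, fpm_t */
    bfloat16x8_t vcvt1_low_bf16_mf8_fpm(mfloat8x16_t vn, fpm_t fpm);

    /* ".(>>QF)(>>QF)V" on "m": mfloat8x8_t result, two float32x4_t operands, fpm_t */
    mfloat8x8_t vcvt_mf8_f32_fpm(float32x4_t vn, float32x4_t vm, fpm_t fpm);
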
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index ca03fb665d423d8..ce7b2620cba9cf8 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6898,6 +6898,13 @@ Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
     return Builder.CreateCall(F, Ops, name);
 }
 
+Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
+                                        SmallVectorImpl<Value *> &Ops,
+                                        Value *FPM, const char *name) {
+  Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), FPM);
+  return EmitNeonCall(F, Ops, name);
+}
+
 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
                                             bool neg) {
   int SV = cast<ConstantInt>(V)->getSExtValue();
@@ -14054,7 +14061,118 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
   }
-
+  case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_cvtl1;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8);
+    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower part of
+    // the vector.
+    if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm) {
+      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+                                               /*isQuad*/ false));
+      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+    } else
+      Tys[1] = Ops[0]->getType();
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1");
+  }
+  case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_cvtl2;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8);
+    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
+    //  part of the vector.
+    if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm) {
+      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+                                               /*isQuad*/ false));
+      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+    } else
+      Tys[1] = Ops[0]->getType();
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2");
+  }
+  case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_cvtl1;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
+    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
+    //  part of the vector.
+    if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm) {
+      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+                                               /*isQuad*/ false));
+      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+    } else
+      Tys[1] = Ops[0]->getType();
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1");
+  }
+  case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_cvtl2;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
+    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
+    //  part of the vector.
+    if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm) {
+      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+                                               /*isQuad*/ false));
+      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+    } else
+      Tys[1] = Ops[0]->getType();
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2");
+  }
+  case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_fcvtn;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8);
+    Tys[1] = Ops[0]->getType();
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
+  }
+  case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_fcvtn;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8);
+    // Gets the expected type, because arm_neon.h casts float16x4_t to int8x8_t
+    Tys[1] = llvm::FixedVectorType::get(HalfTy, 4);
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
+  }
+  case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_fcvtn;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16);
+    // Gets the expected type, because arm_neon.h casts float16x8_t to int8x16_t
+    Tys[1] = llvm::FixedVectorType::get(HalfTy, 8);
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
+  }
+  case NEON::BI__builtin_neon_vcvt_high_mf8_f32_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_fcvtn2;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16);
+    Tys[1] = Ops[1]->getType();
+    Ops[0] = Builder.CreateInsertVector(Tys[0], PoisonValue::get(Tys[0]),
+                                        Ops[0], Builder.getInt64(0));
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn2");
+  }
   case NEON::BI__builtin_neon_vamin_f16:
   case NEON::BI__builtin_neon_vaminq_f16:
   case NEON::BI__builtin_neon_vamin_f32:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 457e1477bb2eed7..a9d5462a1320145 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4682,6 +4682,9 @@ class CodeGenFunction : public CodeGenTypeCache {
                             SmallVectorImpl<llvm::Value*> &O,
                             const char *name,
                             unsigned shift = 0, bool rightshift = false);
+  llvm::Value *EmitFP8NeonCall(llvm::Function *F,
+                               SmallVectorImpl<llvm::Value *> &O,
+                               llvm::Value *FPM, const char *name);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
                              const llvm::ElementCount &Count);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index d7d649dd2456d58..61067937acf7297 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -74,6 +74,7 @@ enum ClassKind {
   ClassI,     // generic integer instruction, e.g., "i8" suffix
   ClassS,     // signed/unsigned/poly, e.g., "s8", "u8" or "p8" suffix
   ClassW,     // width-specific instruction, e.g., "8" suffix
+  ClassV,     // void-suffix instruction, no suffix
   ClassB,     // bitcast arguments with enum argument to specify type
   ClassL,     // Logical instructions which are op instructions
               // but we need to not emit any suffix for in our
@@ -144,7 +145,7 @@ class Type {
 private:
   TypeSpec TS;
 
-  enum TypeKind { Void, Float, SInt, UInt, Poly, BFloat16, MFloat8 };
+  enum TypeKind { Void, Float, SInt, UInt, Poly, BFloat16, MFloat8, FPM };
   TypeKind Kind;
   bool Immediate, Constant, Pointer;
   // ScalarForMangling and NoManglingQ are really not suited to live here as
@@ -198,6 +199,7 @@ class Type {
   bool isVoid() const { return Kind == Void; }
   bool isBFloat16() const { return Kind == BFloat16; }
   bool isMFloat8() const { return Kind == MFloat8; }
+  bool isFPM() const { return Kind == FPM; }
   unsigned getNumElements() const { return Bitwidth / ElementBitwidth; }
   unsigned getSizeInBits() const { return Bitwidth; }
   unsigned getElementSizeInBits() const { return ElementBitwidth; }
@@ -600,6 +602,7 @@ class NeonEmitter {
     const Record *SI = R.getClass("SInst");
     const Record *II = R.getClass("IInst");
     const Record *WI = R.getClass("WInst");
+    const Record *VI = R.getClass("VInst");
     const Record *SOpI = R.getClass("SOpInst");
     const Record *IOpI = R.getClass("IOpInst");
     const Record *WOpI = R.getClass("WOpInst");
@@ -609,6 +612,7 @@ class NeonEmitter {
     ClassMap[SI] = ClassS;
     ClassMap[II] = ClassI;
     ClassMap[WI] = ClassW;
+    ClassMap[VI] = ClassV;
     ClassMap[SOpI] = ClassS;
     ClassMap[IOpI] = ClassI;
     ClassMap[WOpI] = ClassW;
@@ -641,6 +645,9 @@ class NeonEmitter {
 std::string Type::str() const {
   if (isVoid())
     return "void";
+  if (isFPM())
+    return "fpm_t";
+
   std::string S;
 
   if (isInteger() && !isSigned())
@@ -699,6 +706,8 @@ std::string Type::builtin_str() const {
   } else if (isMFloat8()) {
     assert(ElementBitwidth == 8 && "MFloat8 can only be 8 bits");
     S += "m";
+  } else if (isFPM()) {
+    S += "UWi";
   } else
     switch (ElementBitwidth) {
     case 16: S += "h"; break;
@@ -888,6 +897,7 @@ void Type::applyTypespec(bool &Quad) {
     case 'm':
       Kind = MFloat8;
       ElementBitwidth = 8;
+      NoManglingQ = true;
       break;
     default:
       llvm_unreachable("Unhandled type code!");
@@ -925,6 +935,13 @@ void Type::applyModifiers(StringRef Mods) {
     case 'P':
       Kind = Poly;
       break;
+    case 'V':
+      Kind = FPM;
+      Bitwidth = ElementBitwidth = 64;
+      NumVectors = 0;
+      Immediate = Constant = Pointer = false;
+      ScalarForMangling = NoManglingQ = true;
+      break;
     case '>':
       assert(ElementBitwidth < 128);
       ElementBitwidth *= 2;
@@ -1000,6 +1017,9 @@ std::string Intrinsic::getInstTypeCode(Type T, ClassKind CK) const {
   if (CK == ClassB && TargetGuard == "neon")
     return "";
 
+  if (this->CK == ClassV)
+    return "";
+
   if (T.isBFloat16())
     return "bf16";
 
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index cc7a81e15f66099..3f841f86f31d8ff 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1004,6 +1004,28 @@ def int_aarch64_st64b: Intrinsic<[], !listconcat([llvm_ptr_ty], data512)>;
 def int_aarch64_st64bv: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], data512)>;
 def int_aarch64_st64bv0: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], data512)>;
 
+  //
+  // Neon FP8 intrinsics
+  //
+
+  // Conversions
+  class AdvSIMD_FP8_1VectorArg_Long_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrReadMem, IntrInaccessibleMemOnly]>;
+
+  def int_aarch64_neon_fp8_cvtl1   : AdvSIMD_FP8_1VectorArg_Long_Intrinsic;
+  def int_aarch64_neon_fp8_cvtl2   : AdvSIMD_FP8_1VectorArg_Long_Intrinsic;
+
+  def int_aarch64_neon_fp8_fcvtn
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [llvm_anyvector_ty,
+                             LLVMMatchType<1>],
+                            [IntrReadMem, IntrInaccessibleMemOnly]>;
+  def int_aarch64_neon_fp8_fcvtn2
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMMatchType<0>,
+                             llvm_anyvector_ty,
+                             LLVMMatchType<1>],
+                            [IntrReadMem, IntrInaccessibleMemOnly]>;
 }
 
 def llvm_nxv1i1_ty  : LLVMType<nxv1i1>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index f527f7e4eafbc19..0ecc35f61903ba9 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6558,17 +6558,30 @@ class BaseSIMDThreeVectors<bit Q, bit U, bits<2> size, bits<4> op,
 
 
 // FCVTN (FP16 to FP8)
-multiclass SIMDThreeSameSizeVectorCvt<string asm> {
-   def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b01, 0b1110, V64, V64, asm, ".8b",".4h">;
-   def v16f8 : BaseSIMDThreeVectors<0b1, 0b0, 0b01, 0b1110,  V128, V128, asm, ".16b", ".8h">;
+multiclass SIMD_FP8_CVTN_F16<string asm, SDPatternOperator Op> {
+  let Uses = [FPMR, FPCR], mayLoad = 1 in {
+    def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b01, 0b1110, V64, V64, asm, ".8b",".4h">;
+    def v16f8 : BaseSIMDThreeVectors<0b1, 0b0, 0b01, 0b1110,  V128, V128, asm, ".16b", ".8h">;
+  }
+  def : Pat<(v8i8 (Op (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
+            (!cast<Instruction>(NAME # v8f8) V64:$Rn, V64:$Rm)>;
+  def : Pat<(v16i8 (Op (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
+            (!cast<Instruction>(NAME # v16f8) V128:$Rn, V128:$Rm)>;
 }
 
-// TODO : Create v16f8 value type
 // FCVTN, FCVTN2 (FP32 to FP8)
-multiclass SIMDThreeVectorCvt<string asm> {
-   def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b00, 0b1110, V64, V128, asm, ".8b", ".4s">;
-   def 2v16f8 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1110, asm#2, ".16b", ".4s",
-                                           V128, v16i8, v4f32, null_frag>;
+multiclass SIMD_FP8_CVTN_F32<string asm, SDPatternOperator Op> {
+  let Uses = [FPMR, FPCR], mayLoad = 1 in {
+    def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b00, 0b1110, V64, V128, asm, ".8b", ".4s">;
+    def 2v16f8 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1110, asm#2, ".16b", ".4s",
+                                            V128, v16i8, v4f32, null_frag>;
+  }
+
+  def : Pat<(v8i8 (Op (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
+            (!cast<Instruction>(NAME # v8f8) V128:$Rn, V128:$Rm)>;
+
+  def : Pat<(v16i8 (!cast<SDPatternOperator>(Op # 2) (v16i8 V128:$_Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
+            (!cast<Instruction>(NAME # 2v16f8) V128:$_Rd, V128:$Rn, V128:$Rm)>;
 }
 
 // TODO: Create a new Value Type v8f8 and v16f8
@@ -7032,11 +7045,18 @@ multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
 //----------------------------------------------------------------------------
 // FP8 Advanced SIMD two-register miscellaneous
 //----------------------------------------------------------------------------
-multiclass SIMDMixedTwoVectorFP8<bits<2>sz, string asm> {
-  def v8f16 : BaseSIMDMixedTwoVector<0b0, 0b1, sz, 0b10111, V64, V128,
-                                     asm, ".8h", ".8b", []>;
-  def 2v8f16 : BaseSIMDMixedTwoVector<0b1, 0b1, sz, 0b10111, V128, V128,
-                                     asm#2, ".8h", ".16b", []>;
+multiclass SIMD_FP8_CVTL<bits<2>sz, string asm, ValueType dty, SDPatternOperator Op> {
+  let Uses=[FPMR, FPCR], mayLoad = 1 in {
+    def NAME : BaseSIMDMixedTwoVector<0b0, 0b1, sz, 0b10111, V64, V128,
+                                      asm, ".8h", ".8b", []>;
+    def NAME#2 : BaseSIMDMixedTwoVector<0b1, 0b1, sz, 0b10111, V128, V128,
+                                        asm#2, ".8h", ".16b", []>;
+  }
+  def : Pat<(dty (Op (v8i8 V64:$Rn))),
+            (!cast<Instruction>(NAME) V64:$Rn)>;
+
+  def : Pat<(dty (Op (v16i8 V128:$Rn))),
+            (!cast<Instruction>(NAME#2) V128:$Rn)>;
 }
 
 class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index c6f5cdcd1d5fe7f..b415d843ac2e2d7 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10301,13 +10301,13 @@ let Predicates = [HasD128] in {
 // 2023 Architecture Extensions:
 //===----------------------------===//
 
-let Uses = [FPMR, FPCR], Predicates = [HasFP8] in {
-  defm F1CVTL  : SIMDMixedTwoVectorFP8<0b00, "f1cvtl">;
-  defm F2CVTL  : SIMDMixedTwoVectorFP8<0b01, "f2cvtl">;
-  defm BF1CVTL : SIMDMixedTwoVectorFP8<0b10, "bf1cvtl">;
-  defm BF2CVTL : SIMDMixedTwoVectorFP8<0b11, "bf2cvtl">;
-  defm FCVTN_F16_F8 : SIMDThreeSameSizeVectorCvt<"fcvtn">;
-  defm FCVTN_F32_F8 : SIMDThreeVectorCvt<"fcvtn">;
+let Predicates = [HasFP8] in {
+  defm F1CVTL  : SIMD_FP8_CVTL<0b00, "f1cvtl", v8f16, int_aarch64_neon_fp8_cvtl1>;
+  defm F2CVTL  : SIMD_FP8_CVTL<0b01, "f2cvtl", v8f16, int_aarch64_neon_fp8_cvtl2>;
+  defm BF1CVTL : SIMD_FP8_CVTL<0b10, "bf1cvtl", v8bf16, int_aarch64_neon_fp8_cvtl1>;
+  defm BF2CVTL : SIMD_FP8_CVTL<0b11, "bf2cvtl", v8bf16, int_aarch64_neon_fp8_cvtl2>;
+  defm FCVTN_F16 : SIMD_FP8_CVTN_F16<"fcvtn", int_aarch64_neon_fp8_fcvtn>;
+  defm FCVTN_F32 : SIMD_FP8_CVTN_F32<"fcvtn", int_aarch64_neon_fp8_fcvtn>;
   defm FSCALE : SIMDThreeVectorFscale<0b1, 0b1, 0b111, "fscale", int_aarch64_neon_fp8_fscale>;
 } // End let Predicates = [HasFP8]
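
Taken together, the fcvtn/fcvtn2 patterns above let one fcvtn plus one fcvtn2 pack four
float32x4_t values into a single 16-byte FP8 vector. A minimal sketch (hypothetical
helper name; intrinsic signatures as exercised by the tests in the next commit):

    #include <arm_neon.h>

    mfloat8x16_t narrow16(float32x4_t a, float32x4_t b,
                          float32x4_t c, float32x4_t d, fpm_t fpm) {
        mfloat8x8_t lo = vcvt_mf8_f32_fpm(a, b, fpm);   /* fcvtn  v.8b,  v.4s, v.4s */
        return vcvt_high_mf8_f32_fpm(lo, c, d, fpm);    /* fcvtn2 v.16b, v.4s, v.4s */
    }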
 

>From 66730cd674366d5dc67d2ce490272b624263f3d6 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 10 Dec 2024 18:14:02 +0000
Subject: [PATCH 08/19] [fixup] Add tests, fix calling the wrong LLVM intrinsic

---
 clang/lib/CodeGen/CGBuiltin.cpp               |   4 +-
 .../fp8-intrinsics/acle_neon_fp8_cvt.c        | 308 ++++++++++++++++++
 llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll     | 112 +++++++
 3 files changed, 422 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
 create mode 100644 llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index ce7b2620cba9cf8..66501b0e365aef6 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14105,7 +14105,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
     // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
     //  part of the vector.
-    if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm) {
+    if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm) {
       Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
                                                /*isQuad*/ false));
       Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
@@ -14123,7 +14123,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
     // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
     //  part of the vector.
-    if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm) {
+    if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm) {
       Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
                                                /*isQuad*/ false));
       Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
new file mode 100644
index 000000000000000..7543938f4871038
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
@@ -0,0 +1,308 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -S -O3 -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_bf16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt1_bf16_mf8_fpmu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
+//
+bfloat16x8_t test_vcvt1_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+  return vcvt1_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_low_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt1_low_bf16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
+//
+bfloat16x8_t test_vcvt1_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt1_low_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_bf16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt2_bf16_mf8_fpmu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
+//
+bfloat16x8_t test_vcvt2_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+  return vcvt2_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_low_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt2_low_bf16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
+//
+bfloat16x8_t test_vcvt2_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt2_low_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_high_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt1_high_bf16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
+//
+bfloat16x8_t test_vcvt1_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt1_high_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_high_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt2_high_bf16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
+//
+bfloat16x8_t test_vcvt2_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt2_high_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_f16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x half> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt1_f16_mf8_fpmu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT1_I]]
+//
+float16x8_t test_vcvt1_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+  return vcvt1_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_low_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT:    ret <8 x half> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt1_low_f16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT1_I]]
+//
+float16x8_t test_vcvt1_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt1_low_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_f16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x half> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt2_f16_mf8_fpmu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT2_I]]
+//
+float16x8_t test_vcvt2_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+  return vcvt2_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_low_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT:    ret <8 x half> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt2_low_f16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT2_I]]
+//
+float16x8_t test_vcvt2_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt2_low_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_high_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x half> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt1_high_f16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT1_I]]
+//
+float16x8_t test_vcvt1_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt1_high_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_high_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x half> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt2_high_f16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT2_I]]
+//
+float16x8_t test_vcvt2_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt2_high_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f32_fpm(
+// CHECK-SAME: <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VFCVTN_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-NEXT:    ret <8 x i8> [[VFCVTN_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f32_fpm13__Float32x4_tS_m(
+// CHECK-CXX-SAME: <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VFCVTN_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-CXX-NEXT:    ret <8 x i8> [[VFCVTN_I]]
+//
+mfloat8x8_t test_vcvt_mf8_f32_fpm(float32x4_t vn, float32x4_t vm, fpm_t fpm) {
+  return vcvt_mf8_f32_fpm(vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcvt_high_mf8_f32_fpm(
+// CHECK-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VD]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-NEXT:    ret <16 x i8> [[VFCVTN2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z26test_vcvt_high_mf8_f32_fpmu13__MFloat8x8_t13__Float32x4_tS_m(
+// CHECK-CXX-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VD]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-CXX-NEXT:    ret <16 x i8> [[VFCVTN2_I]]
+//
+mfloat8x16_t test_vcvt_high_mf8_f32_fpm(mfloat8x8_t vd, float32x4_t vn,
+                                    float32x4_t vm, fpm_t fpm) {
+  return vcvt_high_mf8_f32_fpm(vd, vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f16_fpm(
+// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VFCVTN2_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]])
+// CHECK-NEXT:    ret <8 x i8> [[VFCVTN2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f16_fpm13__Float16x4_tS_m(
+// CHECK-CXX-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VFCVTN2_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]])
+// CHECK-CXX-NEXT:    ret <8 x i8> [[VFCVTN2_I]]
+//
+mfloat8x8_t test_vcvt_mf8_f16_fpm(float16x4_t vn, float16x4_t vm, fpm_t fpm) {
+  return vcvt_mf8_f16_fpm(vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcvtq_mf8_f16_fpm(
+// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]])
+// CHECK-NEXT:    ret <16 x i8> [[VFCVTN2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z22test_vcvtq_mf8_f16_fpm13__Float16x8_tS_m(
+// CHECK-CXX-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]])
+// CHECK-CXX-NEXT:    ret <16 x i8> [[VFCVTN2_I]]
+//
+mfloat8x16_t test_vcvtq_mf8_f16_fpm(float16x8_t vn, float16x8_t vm, fpm_t fpm) {
+  return vcvtq_mf8_f16_fpm(vn, vm, fpm);
+}
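
The tests above pass fpm through unchanged; in user code the mode word would normally be
assembled first. A hedged sketch, assuming the ACLE fpm helpers (__arm_fpm_init,
__arm_set_fpm_src1_format, and __ARM_FPM_E4M3 from arm_acle.h, per the ACLE FP8
proposal; they are not part of this patch):

    #include <arm_acle.h>
    #include <arm_neon.h>

    float16x8_t widen_low_e4m3(mfloat8x16_t v) {
        fpm_t fpm = __arm_fpm_init();                           /* all fields zeroed */
        fpm = __arm_set_fpm_src1_format(fpm, __ARM_FPM_E4M3);   /* source format: FP8 E4M3 */
        return vcvt1_low_f16_mf8_fpm(v, fpm);
    }
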
diff --git a/llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll b/llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll
new file mode 100644
index 000000000000000..6070380d24234b4
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8 < %s | FileCheck %s
+
+define <8 x bfloat> @test_vbfcvtl1_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl1_low:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bf1cvtl v0.8h, v0.8b
+; CHECK-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> %vn)
+  ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_vbfcvtl1_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl1_high:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bf1cvtl2 v0.8h, v0.16b
+; CHECK-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> %vn)
+  ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_vbfcvtl2_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl2_low:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bf2cvtl v0.8h, v0.8b
+; CHECK-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> %vn)
+  ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_vbfcvtl2_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl2_high:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bf2cvtl2 v0.8h, v0.16b
+; CHECK-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> %vn)
+  ret <8 x bfloat> %res
+}
+
+
+define <8 x half> @test_vfcvtl1_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl1_low:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    f1cvtl v0.8h, v0.8b
+; CHECK-NEXT:    ret
+   %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> %vn)
+  ret <8 x half> %res
+}
+
+define <8 x half> @test_vfcvtl1_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl1_high:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    f1cvtl2 v0.8h, v0.16b
+; CHECK-NEXT:    ret
+  %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> %vn)
+  ret <8 x half> %res
+}
+
+define <8 x half> @test_vfcvtl2_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl2_low:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    f2cvtl v0.8h, v0.8b
+; CHECK-NEXT:    ret
+  %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> %vn)
+  ret <8 x half> %res
+}
+
+define <8 x half> @test_vfcvtl2_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl2_high:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    f2cvtl2 v0.8h, v0.16b
+; CHECK-NEXT:    ret
+  %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> %vn)
+  ret <8 x half> %res
+}
+
+define <8 x i8> @test_vcvtn_low_f8_f32(<4 x float> %vn, <4 x float> %vm) {
+; CHECK-LABEL: test_vcvtn_low_f8_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtn v0.8b, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %res = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> %vn, <4 x float> %vm)
+  ret <8 x i8> %res
+}
+
+define <16 x i8> @test_vcvtn_high_f8_f32(<16 x i8> %vd, <4 x float> %vn, <4 x float> %vm) {
+; CHECK-LABEL: test_vcvtn_high_f8_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtn2 v0.16b, v1.4s, v2.4s
+; CHECK-NEXT:    ret
+  %res = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> %vd, <4 x float> %vn, <4 x float> %vm)
+  ret <16 x i8> %res
+}
+
+
+define <8 x i8> @test_vcvtn_f8_f16(<4 x half> %vn, <4 x half> %vm) {
+; CHECK-LABEL: test_vcvtn_f8_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtn v0.8b, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %res = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> %vn, <4 x half> %vm)
+  ret <8 x i8> %res
+}
+
+define <16 x i8> @test_vcvtn2_f8_f16(<8 x half> %vn, <8 x half> %vm) {
+; CHECK-LABEL: test_vcvtn2_f8_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtn v0.16b, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %res = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> %vn, <8 x half> %vm)
+  ret <16 x i8> %res
+}
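
The selection above also explains the low/high naming at the source level: the same
cvtl1/cvtl2 intrinsic lowers to the plain instruction for an 8-byte operand and to the
"2" form for a 16-byte one. A short sketch widening both halves of one vector
(hypothetical function name; intrinsics as declared in patch 07):

    #include <arm_neon.h>

    void widen_both(mfloat8x16_t v, fpm_t fpm,
                    bfloat16x8_t *lo, bfloat16x8_t *hi) {
        *lo = vcvt1_low_bf16_mf8_fpm(v, fpm);    /* bf1cvtl  v.8h, v.8b  */
        *hi = vcvt1_high_bf16_mf8_fpm(v, fpm);   /* bf1cvtl2 v.8h, v.16b */
    }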

>From f7436eff1ede039b89b1c1c089a39a9808993be4 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 17 Dec 2024 10:49:19 +0000
Subject: [PATCH 09/19] [fixup] Refactor much of the common code into a helper
 function (NFC)

---
 clang/lib/CodeGen/CGBuiltin.cpp     | 171 +++++++++++-----------------
 clang/lib/CodeGen/CodeGenFunction.h |   4 +
 2 files changed, 69 insertions(+), 106 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 66501b0e365aef6..f0e94a76fca6f22 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6911,6 +6911,23 @@ Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
   return ConstantInt::get(Ty, neg ? -SV : SV);
 }
 
+Value *CodeGenFunction::EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
+                                           llvm::Type *Ty1, bool Extract,
+                                           SmallVectorImpl<llvm::Value *> &Ops,
+                                           const CallExpr *E,
+                                           const char *name) {
+  llvm::Type *Tys[] = {Ty0, Ty1};
+  if (Extract) {
+    // Op[0] is mfloat8x16_t, but the intrinsic converts only the lower part of
+    // the vector.
+    Tys[1] = llvm::FixedVectorType::get(Int8Ty, 8);
+    Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+  }
+  llvm::Value *FPM =
+      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
+  return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+}
+
 // Right-shift a vector by a constant.
 Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
                                           llvm::Type *Ty, bool usgn,
@@ -12847,6 +12864,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return V;
 
   unsigned Int;
+  bool ExtractLow = false;
   switch (BuiltinID) {
   default: return nullptr;
   case NEON::BI__builtin_neon_vbsl_v:
@@ -14061,117 +14079,58 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
   }
-  case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm:
   case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
-  case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_cvtl1;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8);
-    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower part of
-    // the vector.
-    if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm) {
-      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
-                                               /*isQuad*/ false));
-      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
-    } else
-      Tys[1] = Ops[0]->getType();
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1");
-  }
-  case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm:
+    ExtractLow = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
+                              llvm::FixedVectorType::get(BFloatTy, 8),
+                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
   case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm:
-  case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_cvtl2;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8);
-    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
-    //  part of the vector.
-    if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm) {
-      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
-                                               /*isQuad*/ false));
-      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
-    } else
-      Tys[1] = Ops[0]->getType();
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2");
-  }
-  case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm:
+    ExtractLow = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
+                              llvm::FixedVectorType::get(BFloatTy, 8),
+                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
   case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm:
-  case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_cvtl1;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
-    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
-    //  part of the vector.
-    if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm) {
-      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
-                                               /*isQuad*/ false));
-      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
-    } else
-      Tys[1] = Ops[0]->getType();
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1");
-  }
-  case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm:
+    ExtractLow = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
+                              llvm::FixedVectorType::get(HalfTy, 8),
+                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
   case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm:
-  case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_cvtl2;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
-    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
-    //  part of the vector.
-    if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm) {
-      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
-                                               /*isQuad*/ false));
-      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
-    } else
-      Tys[1] = Ops[0]->getType();
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2");
-  }
-  case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_fcvtn;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8);
-    Tys[1] = Ops[0]->getType();
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
-  }
-  case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_fcvtn;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8);
-    // Gets the expected type, because arm_neon.h casts float16x4_t to int8x8_t
-    Tys[1] = llvm::FixedVectorType::get(HalfTy, 4);
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
-  }
-  case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_fcvtn;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16);
-    // Gets the expected type, because arm_neon.h casts float16x8_t to int8x16_t
-    Tys[1] = llvm::FixedVectorType::get(HalfTy, 8);
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
-  }
+    ExtractLow = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
+                              llvm::FixedVectorType::get(HalfTy, 8),
+                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
+  case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
+                              llvm::FixedVectorType::get(Int8Ty, 8),
+                              Ops[0]->getType(), false, Ops, E, "vfcvtn");
+  case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
+                              llvm::FixedVectorType::get(Int8Ty, 8),
+                              llvm::FixedVectorType::get(HalfTy, 4), false, Ops,
+                              E, "vfcvtn");
+  case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
+                              llvm::FixedVectorType::get(Int8Ty, 16),
+                              llvm::FixedVectorType::get(HalfTy, 8), false, Ops,
+                              E, "vfcvtn");
   case NEON::BI__builtin_neon_vcvt_high_mf8_f32_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_fcvtn2;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16);
-    Tys[1] = Ops[1]->getType();
-    Ops[0] = Builder.CreateInsertVector(Tys[0], PoisonValue::get(Tys[0]),
-                                        Ops[0], Builder.getInt64(0));
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn2");
+    llvm::Type *Ty = llvm::FixedVectorType::get(Int8Ty, 16);
+    Ops[0] = Builder.CreateInsertVector(Ty, PoisonValue::get(Ty), Ops[0],
+                                        Builder.getInt64(0));
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn2,
+                              Ty, Ops[1]->getType(), false, Ops, E, "vfcvtn2");
   }
   case NEON::BI__builtin_neon_vamin_f16:
   case NEON::BI__builtin_neon_vaminq_f16:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index a9d5462a1320145..010b44f4d38d96d 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4685,6 +4685,10 @@ class CodeGenFunction : public CodeGenTypeCache {
   llvm::Value *EmitFP8NeonCall(llvm::Function *F,
                                SmallVectorImpl<llvm::Value *> &O,
                                llvm::Value *FPM, const char *name);
+  llvm::Value *EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
+                                  llvm::Type *Ty1, bool Extract,
+                                  SmallVectorImpl<llvm::Value *> &Ops,
+                                  const CallExpr *E, const char *name);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
                              const llvm::ElementCount &Count);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);

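For reference, a minimal sketch of the consolidated helper that the call sites
above now use, reconstructed from the duplicated code it replaces. The actual
definition lives in the CGBuiltin.cpp hunk of this patch and may differ in
detail; in particular, how the half-width type is formed and the ICEArguments
handling are assumptions here:

llvm::Value *CodeGenFunction::EmitFP8NeonCvtCall(
    unsigned IID, llvm::Type *Ty0, llvm::Type *Ty1, bool Extract,
    SmallVectorImpl<llvm::Value *> &Ops, const CallExpr *E, const char *name) {
  llvm::Type *Tys[] = {Ty0, Ty1};
  if (Extract) {
    // The operand is an mfloat8x16_t, but the intrinsic converts only the
    // lower half, so narrow both the operand type and the operand itself
    // to 8 lanes before issuing the call.
    Tys[1] = llvm::FixedVectorType::getHalfElementsVectorType(
        cast<llvm::FixedVectorType>(Ty1));
    Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
  }
  // The FPMR operand is always the last argument of the builtin.
  llvm::Value *FPM =
      EmitScalarOrConstFoldImmArg(/*ICEArguments=*/0, E->getNumArgs() - 1, E);
  return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
}
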
>From 67627ef46ce5d98629fc4f98815d4e5dd47a95ef Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 17 Dec 2024 11:28:49 +0000
Subject: [PATCH 10/19] [fixup] Add target features test, remove redundant bf16
 guard

---
 clang/include/clang/Basic/arm_neon.td         |  2 +-
 .../acle_neon_fp8_cvt.c                       | 43 +++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index b79b4749371b501..968536bcf5186db 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2125,7 +2125,7 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in {
   }
 }
 
-let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,bf16,neon" in {
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
   def VBF1CVT_BF16_MF8        : VInst<"vcvt1_bf16_mf8_fpm",      "(QB).V", "m">;
   def VBF1CVT_LOW_BF16_MF8    : VInst<"vcvt1_low_bf16_mf8_fpm",  "B.V",    "Qm">;
   def VBF2CVTL_BF16_MF8       : VInst<"vcvt2_bf16_mf8_fpm",      "(QB).V", "m">;
diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c
new file mode 100644
index 000000000000000..2c7004c7968a464
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +faminmax -emit-llvm -verify %s -o /dev/null
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+void test_features(float16x4_t vd4, float16x8_t vd8, float32x4_t va4,
+                   mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) {
+  (void) vcvt1_bf16_mf8_fpm(v8, fpm);
+  // expected-error at -1 {{'vcvt1_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt1_low_bf16_mf8_fpm(v16, fpm);
+  // expected-error at -1 {{'vcvt1_low_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_bf16_mf8_fpm(v8, fpm);
+  // expected-error at -1 {{'vcvt2_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_low_bf16_mf8_fpm(v16, fpm);
+  // expected-error at -1 {{'vcvt2_low_bf16_mf8_fpm' requires target feature 'fp8'}}
+
+  (void) vcvt1_high_bf16_mf8_fpm(v16, fpm);
+  // expected-error at -1 {{'vcvt1_high_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_high_bf16_mf8_fpm(v16, fpm);
+  // expected-error at -1 {{'vcvt2_high_bf16_mf8_fpm' requires target feature 'fp8'}}
+
+  (void) vcvt1_f16_mf8_fpm(v8, fpm);
+  // expected-error at -1 {{'vcvt1_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt1_low_f16_mf8_fpm(v16, fpm);
+  // expected-error at -1 {{'vcvt1_low_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_f16_mf8_fpm(v8, fpm);
+  // expected-error at -1 {{'vcvt2_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_low_f16_mf8_fpm(v16, fpm);
+  // expected-error at -1 {{'vcvt2_low_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt1_high_f16_mf8_fpm(v16, fpm);
+  // expected-error at -1 {{'vcvt1_high_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_high_f16_mf8_fpm(v16, fpm);
+  // expected-error at -1 {{'vcvt2_high_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt_mf8_f32_fpm(va4, va4, fpm);
+  // expected-error at -1 {{'vcvt_mf8_f32_fpm' requires target feature 'fp8'}}
+  (void) vcvt_high_mf8_f32_fpm(v8, va4, va4, fpm);
+  // expected-error at -1 {{'vcvt_high_mf8_f32_fpm' requires target feature 'fp8'}}
+  (void) vcvt_mf8_f16_fpm(vd4, vd4, fpm);
+  // expected-error at -1 {{'vcvt_mf8_f16_fpm' requires target feature 'fp8'}}
+  (void) vcvtq_mf8_f16_fpm(vd8, vd8, fpm);
+  // expected-error at -1 {{'vcvtq_mf8_f16_fpm' requires target feature 'fp8'}}
+}

>From d617b03536b14f1aeec5392199a6cac2f4070532 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 3 Jan 2025 11:36:36 +0000
Subject: [PATCH 11/19] [fixup] Clear the NoManglingQ flag for FP8

Instead of unconditionally suppressing the "q" name mangling for every
"m" (mfloat8) type, mark the individual quad-typed variants whose names
do not carry a "q" with the "H" typespec (quad vector, no "q" suffix).

---
 clang/include/clang/Basic/arm_neon.td | 21 ++++++++++-----------
 clang/utils/TableGen/NeonEmitter.cpp  |  1 -
 2 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 968536bcf5186db..64de6c0cf17a359 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2127,25 +2127,24 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in {
 
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
   def VBF1CVT_BF16_MF8        : VInst<"vcvt1_bf16_mf8_fpm",      "(QB).V", "m">;
-  def VBF1CVT_LOW_BF16_MF8    : VInst<"vcvt1_low_bf16_mf8_fpm",  "B.V",    "Qm">;
+  def VBF1CVT_LOW_BF16_MF8    : VInst<"vcvt1_low_bf16_mf8_fpm",  "B.V",    "Hm">;
   def VBF2CVTL_BF16_MF8       : VInst<"vcvt2_bf16_mf8_fpm",      "(QB).V", "m">;
-  def VBF2CVTL_LOW_BF16_MF8   : VInst<"vcvt2_low_bf16_mf8_fpm",  "B.V",    "Qm">;
-  def VBF1CVTL2_HIGH_BF16_MF8 : VInst<"vcvt1_high_bf16_mf8_fpm", "B.V",    "Qm">;
-  def VBF2CVTL2_HIGH_BF16_MF8 : VInst<"vcvt2_high_bf16_mf8_fpm", "B.V",    "Qm">;
+  def VBF2CVTL_LOW_BF16_MF8   : VInst<"vcvt2_low_bf16_mf8_fpm",  "B.V",    "Hm">;
+  def VBF1CVTL2_HIGH_BF16_MF8 : VInst<"vcvt1_high_bf16_mf8_fpm", "B.V",    "Hm">;
+  def VBF2CVTL2_HIGH_BF16_MF8 : VInst<"vcvt2_high_bf16_mf8_fpm", "B.V",    "Hm">;
 }
 
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
   def VF1CVT_F16_MF8        : VInst<"vcvt1_f16_mf8_fpm",      "(>QF).V", "m">;
-  def VF1CVT_LOW_F16_MF8    : VInst<"vcvt1_low_f16_mf8_fpm",  "(>F).V",  "Qm">;
+  def VF1CVT_LOW_F16_MF8    : VInst<"vcvt1_low_f16_mf8_fpm",  "(>F).V",  "Hm">;
   def VF2CVTL_F16_MF8       : VInst<"vcvt2_f16_mf8_fpm",      "(>QF).V", "m">;
-  def VF2CVTL_LOW_F16_MF8   : VInst<"vcvt2_low_f16_mf8_fpm",  "(>F).V",  "Qm">;
-  def VF1CVTL2_HIGH_F16_MF8 : VInst<"vcvt1_high_f16_mf8_fpm", "(>F).V",  "Qm">;
-  def VF2CVTL2_HIGH_F16_MF8 : VInst<"vcvt2_high_f16_mf8_fpm", "(>F).V",  "Qm">;
+  def VF2CVTL_LOW_F16_MF8   : VInst<"vcvt2_low_f16_mf8_fpm",  "(>F).V",  "Hm">;
+  def VF1CVTL2_HIGH_F16_MF8 : VInst<"vcvt1_high_f16_mf8_fpm", "(>F).V",  "Hm">;
+  def VF2CVTL2_HIGH_F16_MF8 : VInst<"vcvt2_high_f16_mf8_fpm", "(>F).V",  "Hm">;
 
   def VCVTN_LOW_F8_F32  : VInst<"vcvt_mf8_f32_fpm",      ".(>>QF)(>>QF)V",  "m">;
-  def VCVTN_HIGH_F8_F32 : VInst<"vcvt_high_mf8_f32_fpm", ".(q)(>>F)(>>F)V", "Qm">;
-  def VCVTN_F8_F16      : VInst<"vcvt_mf8_f16_fpm",      ".(>F)(>F)V",      "m">;
-  def VCVTNQ_F8_F16     : VInst<"vcvtq_mf8_f16_fpm",     ".(>F)(>F)V",      "Qm">;
+  def VCVTN_HIGH_F8_F32 : VInst<"vcvt_high_mf8_f32_fpm", ".(q)(>>F)(>>F)V", "Hm">;
+  def VCVTN_F8_F16      : VInst<"vcvt_mf8_f16_fpm",      ".(>F)(>F)V",      "mQm">;
 }
 
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index 61067937acf7297..eb679d002d3bb1a 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -897,7 +897,6 @@ void Type::applyTypespec(bool &Quad) {
     case 'm':
       Kind = MFloat8;
       ElementBitwidth = 8;
-      NoManglingQ = true;
       break;
     default:
       llvm_unreachable("Unhandled type code!");

>From 95d61df48707ea8e63993a3655a423bcf6794f0d Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Mon, 13 Jan 2025 18:26:23 +0000
Subject: [PATCH 12/19] [fixup] Remove instcombine,tailcallelim from test run
 lines

---
 .../fp8-intrinsics/acle_neon_fp8_cvt.c        | 160 +++++++++---------
 1 file changed, 84 insertions(+), 76 deletions(-)

diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
index 7543938f4871038..551ccb20c9cbdfd 100644
--- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
@@ -1,6 +1,6 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix CHECK-CXX
+// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX
 
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -S -O3 -o /dev/null %s
 
@@ -11,15 +11,15 @@
 // CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_bf16_mf8_fpm(
 // CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]])
 // CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt1_bf16_mf8_fpmu13__MFloat8x8_tm(
 // CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]])
 // CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
 //
 bfloat16x8_t test_vcvt1_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
@@ -29,17 +29,17 @@ bfloat16x8_t test_vcvt1_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
 // CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_low_bf16_mf8_fpm(
 // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]])
 // CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt1_low_bf16_mf8_fpmu14__MFloat8x16_tm(
 // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]])
 // CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
 //
 bfloat16x8_t test_vcvt1_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
@@ -49,15 +49,15 @@ bfloat16x8_t test_vcvt1_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
 // CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_bf16_mf8_fpm(
 // CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]])
 // CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt2_bf16_mf8_fpmu13__MFloat8x8_tm(
 // CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]])
 // CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
 //
 bfloat16x8_t test_vcvt2_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
@@ -67,17 +67,17 @@ bfloat16x8_t test_vcvt2_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
 // CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_low_bf16_mf8_fpm(
 // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]])
 // CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt2_low_bf16_mf8_fpmu14__MFloat8x16_tm(
 // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]])
 // CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
 //
 bfloat16x8_t test_vcvt2_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
@@ -87,15 +87,15 @@ bfloat16x8_t test_vcvt2_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
 // CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_high_bf16_mf8_fpm(
 // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]])
 // CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt1_high_bf16_mf8_fpmu14__MFloat8x16_tm(
 // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]])
 // CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
 //
 bfloat16x8_t test_vcvt1_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
@@ -105,15 +105,15 @@ bfloat16x8_t test_vcvt1_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
 // CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_high_bf16_mf8_fpm(
 // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]])
 // CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt2_high_bf16_mf8_fpmu14__MFloat8x16_tm(
 // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]])
 // CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
 //
 bfloat16x8_t test_vcvt2_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
@@ -123,15 +123,15 @@ bfloat16x8_t test_vcvt2_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
 // CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_f16_mf8_fpm(
 // CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]])
 // CHECK-NEXT:    ret <8 x half> [[VBFCVT1_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt1_f16_mf8_fpmu13__MFloat8x8_tm(
 // CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]])
 // CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT1_I]]
 //
 float16x8_t test_vcvt1_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
@@ -141,17 +141,17 @@ float16x8_t test_vcvt1_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
 // CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_low_f16_mf8_fpm(
 // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]])
 // CHECK-NEXT:    ret <8 x half> [[VBFCVT1_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt1_low_f16_mf8_fpmu14__MFloat8x16_tm(
 // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]])
 // CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT1_I]]
 //
 float16x8_t test_vcvt1_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
@@ -161,15 +161,15 @@ float16x8_t test_vcvt1_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
 // CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_f16_mf8_fpm(
 // CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]])
 // CHECK-NEXT:    ret <8 x half> [[VBFCVT2_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt2_f16_mf8_fpmu13__MFloat8x8_tm(
 // CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]])
 // CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT2_I]]
 //
 float16x8_t test_vcvt2_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
@@ -179,17 +179,17 @@ float16x8_t test_vcvt2_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
 // CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_low_f16_mf8_fpm(
 // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]])
 // CHECK-NEXT:    ret <8 x half> [[VBFCVT2_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt2_low_f16_mf8_fpmu14__MFloat8x16_tm(
 // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0)
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]])
 // CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT2_I]]
 //
 float16x8_t test_vcvt2_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
@@ -199,15 +199,15 @@ float16x8_t test_vcvt2_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
 // CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_high_f16_mf8_fpm(
 // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]])
 // CHECK-NEXT:    ret <8 x half> [[VBFCVT1_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt1_high_f16_mf8_fpmu14__MFloat8x16_tm(
 // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]])
 // CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT1_I]]
 //
 float16x8_t test_vcvt1_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
@@ -217,15 +217,15 @@ float16x8_t test_vcvt1_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
 // CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_high_f16_mf8_fpm(
 // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]])
 // CHECK-NEXT:    ret <8 x half> [[VBFCVT2_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt2_high_f16_mf8_fpmu14__MFloat8x16_tm(
 // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]])
 // CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT2_I]]
 //
 float16x8_t test_vcvt2_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
@@ -235,15 +235,15 @@ float16x8_t test_vcvt2_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
 // CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f32_fpm(
 // CHECK-SAME: <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VFCVTN_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VFCVTN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]])
 // CHECK-NEXT:    ret <8 x i8> [[VFCVTN_I]]
 //
 // CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f32_fpm13__Float32x4_tS_m(
 // CHECK-CXX-SAME: <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VFCVTN_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VFCVTN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]])
 // CHECK-CXX-NEXT:    ret <8 x i8> [[VFCVTN_I]]
 //
 mfloat8x8_t test_vcvt_mf8_f32_fpm(float32x4_t vn, float32x4_t vm, fpm_t fpm) {
@@ -253,17 +253,17 @@ mfloat8x8_t test_vcvt_mf8_f32_fpm(float32x4_t vn, float32x4_t vm, fpm_t fpm) {
 // CHECK-LABEL: define dso_local <16 x i8> @test_vcvt_high_mf8_f32_fpm(
 // CHECK-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VD]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VD]], i64 0)
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]])
 // CHECK-NEXT:    ret <16 x i8> [[VFCVTN2_I]]
 //
 // CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z26test_vcvt_high_mf8_f32_fpmu13__MFloat8x8_t13__Float32x4_tS_m(
 // CHECK-CXX-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VD]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VD]], i64 0)
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]])
 // CHECK-CXX-NEXT:    ret <16 x i8> [[VFCVTN2_I]]
 //
 mfloat8x16_t test_vcvt_high_mf8_f32_fpm(mfloat8x8_t vd, float32x4_t vn,
@@ -274,15 +274,19 @@ mfloat8x16_t test_vcvt_high_mf8_f32_fpm(mfloat8x8_t vd, float32x4_t vn,
 // CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f16_fpm(
 // CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VFCVTN2_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]])
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VN]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[VM]] to <8 x i8>
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VFCVTN2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]])
 // CHECK-NEXT:    ret <8 x i8> [[VFCVTN2_I]]
 //
 // CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f16_fpm13__Float16x4_tS_m(
 // CHECK-CXX-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VFCVTN2_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]])
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VN]] to <8 x i8>
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = bitcast <4 x half> [[VM]] to <8 x i8>
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VFCVTN2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]])
 // CHECK-CXX-NEXT:    ret <8 x i8> [[VFCVTN2_I]]
 //
 mfloat8x8_t test_vcvt_mf8_f16_fpm(float16x4_t vn, float16x4_t vm, fpm_t fpm) {
@@ -292,15 +296,19 @@ mfloat8x8_t test_vcvt_mf8_f16_fpm(float16x4_t vn, float16x4_t vm, fpm_t fpm) {
 // CHECK-LABEL: define dso_local <16 x i8> @test_vcvtq_mf8_f16_fpm(
 // CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]])
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VN]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[VM]] to <16 x i8>
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]])
 // CHECK-NEXT:    ret <16 x i8> [[VFCVTN2_I]]
 //
 // CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z22test_vcvtq_mf8_f16_fpm13__Float16x8_tS_m(
 // CHECK-CXX-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]])
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VN]] to <16 x i8>
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = bitcast <8 x half> [[VM]] to <16 x i8>
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]])
 // CHECK-CXX-NEXT:    ret <16 x i8> [[VFCVTN2_I]]
 //
 mfloat8x16_t test_vcvtq_mf8_f16_fpm(float16x8_t vn, float16x8_t vm, fpm_t fpm) {

>From 6005e9e5beca028bc5067e8d8449a3b80fa47b6d Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 17 Dec 2024 11:42:42 +0000
Subject: [PATCH 13/19] [AArch64] Add FP8 Neon intrinsics for dot-product

This patch adds the following intrinsics:

float16x4_t vdot_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpm)
float16x8_t vdotq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm)

float16x4_t vdot_lane_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x4_t vdot_laneq_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vdotq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vdotq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)

float32x2_t vdot_f32_mf8_fpm(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpm)
float32x4_t vdotq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm)

float32x2_t vdot_lane_f32_mf8_fpm(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x2_t vdot_laneq_f32_mf8_fpm(float32x2_t vd, mfloat8x8_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vdotq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vdotq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
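
For illustration, a minimal usage sketch; the wrapper names are hypothetical
and assume a target with the fp8dot2/fp8dot4 features, while the intrinsic
signatures follow the tests added below:

#include <arm_neon.h>

// 2-way dot product: each f16 lane of the accumulator receives the dot
// product of a pair of FP8 elements; fpm selects the FP8 formats/scaling
// via the FPMR register.
float16x8_t acc_pairs(float16x8_t acc, mfloat8x16_t vn, mfloat8x16_t vm,
                      fpm_t fpm) {
  return vdotq_f16_mf8_fpm(acc, vn, vm, fpm);
}

// Indexed 4-way dot product: each f32 lane accumulates against the
// 4-element FP8 group of vm selected by the constant lane index (0..3).
float32x4_t acc_quads_lane(float32x4_t acc, mfloat8x16_t vn, mfloat8x16_t vm,
                           fpm_t fpm) {
  return vdotq_laneq_f32_mf8_fpm(acc, vn, vm, 3, fpm);
}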
---
 clang/include/clang/Basic/arm_neon.td         |  22 +++
 clang/include/clang/Basic/arm_neon_incl.td    |   2 +-
 clang/lib/CodeGen/CGBuiltin.cpp               |  47 ++++++
 clang/lib/CodeGen/CodeGenFunction.h           |   5 +
 .../fp8-intrinsics/acle_neon_fp8_fdot.c       | 143 ++++++++++++++++++
 .../acle_neon_fp8_fdot.c                      |  54 +++++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  21 +++
 .../lib/Target/AArch64/AArch64InstrFormats.td |  82 ++++++----
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  14 +-
 llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll    |  74 +++++++++
 10 files changed, 424 insertions(+), 40 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c
 create mode 100644 clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c
 create mode 100644 llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 64de6c0cf17a359..5d1267213f238a8 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2147,6 +2147,28 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
   def VCVTN_F8_F16      : VInst<"vcvt_mf8_f16_fpm",      ".(>F)(>F)V",      "mQm">;
 }
 
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot2,neon" in {
+  def VDOT_F16_MF8  : VInst<"vdot_f16_mf8_fpm", "(>F)(>F)..V", "m">;
+  def VDOTQ_F16_MF8 : VInst<"vdotq_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+
+  def VDOT_LANE_F16_MF8 : VInst<"vdot_lane_f16_mf8_fpm", "(>F)(>F)..IV", "m",   [ImmCheck<3, ImmCheck0_3, 0>]>;
+  def VDOT_LANEQ_F16_MF8 : VInst<"vdot_laneq_f16_mf8_fpm", "(>F)(>F).QIV", "m",   [ImmCheck<3, ImmCheck0_7, 0>]>;
+
+  def VDOTQ_LANE_F16_MF8 : VInst<"vdotq_lane_f16_mf8_fpm", "(>F)(>F).qIV", "Qm",   [ImmCheck<3, ImmCheck0_3, 0>]>;
+  def VDOTQ_LANEQ_F16_MF8 : VInst<"vdotq_laneq_f16_mf8_fpm", "(>F)(>F)..IV", "Qm",   [ImmCheck<3, ImmCheck0_7, 0>]>;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot4,neon" in {
+  def VDOT_F32_MF8  : VInst<"vdot_f32_mf8_fpm", "(>>F)(>>F)..V", "m">;
+  def VDOTQ_F32_MF8 : VInst<"vdotq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+
+  def VDOT_LANE_F32_MF8 : VInst<"vdot_lane_f32_mf8_fpm", "(>>F)(>>F)..IV", "m",   [ImmCheck<3, ImmCheck0_1, 0>]>;
+  def VDOT_LANEQ_F32_MF8 : VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F).QIV", "m",   [ImmCheck<3, ImmCheck0_3, 0>]>;
+
+  def VDOTQ_LANE_F32_MF8 : VInst<"vdotq_lane_f32_mf8_fpm", "(>>F)(>>F).qIV", "Qm",   [ImmCheck<3, ImmCheck0_1, 0>]>;
+  def VDOTQ_LANEQ_F32_MF8 : VInst<"vdotq_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm",   [ImmCheck<3, ImmCheck0_3, 0>]>;
+}
+
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
   def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
   def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;
diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td
index 91a2bf3020b9a3f..b9b9d509c22512b 100644
--- a/clang/include/clang/Basic/arm_neon_incl.td
+++ b/clang/include/clang/Basic/arm_neon_incl.td
@@ -302,7 +302,7 @@ class Inst <string n, string p, string t, Operation o, list<ImmCheck> ch = []>{
 class SInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
 class IInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
 class WInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
-class VInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
+class VInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
 
 // The following instruction classes are implemented via operators
 // instead of builtins. As such these declarations are only used for
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index f0e94a76fca6f22..9c1cef83103d814 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6905,6 +6905,25 @@ Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
   return EmitNeonCall(F, Ops, name);
 }
 
+llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
+    unsigned IID, bool ExtendLane, llvm::Type *RetTy,
+    SmallVectorImpl<llvm::Value *> &Ops, unsigned ICEArguments,
+    const CallExpr *E, const char *name) {
+
+  const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
+                             RetTy->getPrimitiveSizeInBits();
+  llvm::Type *Tys[] = {llvm::FixedVectorType::get(RetTy, ElemCount),
+                       Ops[1]->getType()};
+  if (ExtendLane) {
+    auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
+    Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
+                                        Builder.getInt64(0));
+  }
+  llvm::Value *FPM =
+      EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+  return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+}
+
 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
                                             bool neg) {
   int SV = cast<ConstantInt>(V)->getSExtValue();
@@ -12865,6 +12884,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
 
   unsigned Int;
   bool ExtractLow = false;
+  bool ExtendLane = false;
   switch (BuiltinID) {
   default: return nullptr;
   case NEON::BI__builtin_neon_vbsl_v:
@@ -14132,6 +14152,33 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn2,
                               Ty, Ops[1]->getType(), false, Ops, E, "vfcvtn2");
   }
+
+  case NEON::BI__builtin_neon_vdot_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vdotq_f16_mf8_fpm:
+    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2, false, HalfTy,
+                               Ops, ICEArguments, E, "fdot2");
+  case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm:
+    ExtendLane = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm:
+    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2_lane,
+                               ExtendLane, HalfTy, Ops, ICEArguments, E,
+                               "fdot2_lane");
+  case NEON::BI__builtin_neon_vdot_f32_mf8_fpm:
+  case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm:
+    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4, false,
+                               FloatTy, Ops, ICEArguments, E, "fdot4");
+  case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm:
+  case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm:
+    ExtendLane = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm:
+  case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm:
+    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
+                               ExtendLane, FloatTy, Ops, ICEArguments, E,
+                               "fdot4_lane");
   case NEON::BI__builtin_neon_vamin_f16:
   case NEON::BI__builtin_neon_vaminq_f16:
   case NEON::BI__builtin_neon_vamin_f32:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 010b44f4d38d96d..b5f690b2f6ee0d7 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4689,6 +4689,11 @@ class CodeGenFunction : public CodeGenTypeCache {
                                   llvm::Type *Ty1, bool Extract,
                                   SmallVectorImpl<llvm::Value *> &Ops,
                                   const CallExpr *E, const char *name);
+  llvm::Value *EmitFP8NeonFDOTCall(unsigned IID, bool ExtendLane,
+                                   llvm::Type *RetTy,
+                                   SmallVectorImpl<llvm::Value *> &Ops,
+                                   unsigned ICEArguments, const CallExpr *E,
+                                   const char *name);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
                              const llvm::ElementCount &Count);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c
new file mode 100644
index 000000000000000..b273bc2abe87797
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c
@@ -0,0 +1,143 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+
+// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <4 x half> @test_vdot_f16(
+// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT21_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
+// CHECK-NEXT:    ret <4 x half> [[FDOT21_I]]
+//
+float16x4_t test_vdot_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
+  return vdot_f16_mf8_fpm(vd, vn, vm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_f16(
+// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT21_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <8 x half> [[FDOT21_I]]
+//
+float16x8_t test_vdotq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
+  return vdotq_f16_mf8_fpm(vd, vn, vm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_vdot_lane_f16(
+// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
+// CHECK-NEXT:    ret <4 x half> [[FDOT2_LANE1]]
+//
+float16x4_t test_vdot_lane_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
+  return vdot_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_vdot_laneq_f16(
+// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT:    ret <4 x half> [[FDOT2_LANE1]]
+//
+float16x4_t test_vdot_laneq_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
+  return vdot_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_lane_f16(
+// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
+// CHECK-NEXT:    ret <8 x half> [[FDOT2_LANE1]]
+//
+float16x8_t test_vdotq_lane_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
+  return vdotq_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_laneq_f16(
+// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT:    ret <8 x half> [[FDOT2_LANE1]]
+//
+float16x8_t test_vdotq_laneq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
+  return vdotq_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <2 x float> @test_vdot_f32(
+// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
+// CHECK-NEXT:    ret <2 x float> [[FDOT4_I]]
+//
+float32x2_t test_vdot_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
+  return vdot_f32_mf8_fpm(vd, vn, vm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_f32(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <4 x float> [[FDOT4_I]]
+//
+float32x4_t test_vdotq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
+  return vdotq_f32_mf8_fpm(vd, vn, vm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <2 x float> @test_vdot_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT:    ret <2 x float> [[FDOT4_LANE]]
+//
+float32x2_t test_vdot_lane_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
+  return vdot_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <2 x float> @test_vdot_laneq_f32(
+// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
+// CHECK-NEXT:    ret <2 x float> [[FDOT4_LANE]]
+//
+float32x2_t test_vdot_laneq_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
+  return vdot_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT:    ret <4 x float> [[FDOT4_LANE]]
+//
+float32x4_t test_vdotq_lane_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
+  return vdotq_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_laneq_f32(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
+// CHECK-NEXT:    ret <4 x float> [[FDOT4_LANE]]
+//
+float32x4_t test_vdotq_laneq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
+  return vdotq_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
+}
diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c
new file mode 100644
index 000000000000000..8bfe3ac26ab2c33
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c
@@ -0,0 +1,54 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +faminmax -target-feature +fp8 -emit-llvm -verify %s -o /dev/null
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+void test_features(float16x4_t vd4, float16x8_t vd8, float32x4_t va4, float32x2_t va2,
+                   mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) {
+  (void) vdot_f16_mf8_fpm(vd4, v8, v8, fpm);
+// expected-error@-1 {{'vdot_f16_mf8_fpm' requires target feature 'fp8dot2'}}
+  (void) vdotq_f16_mf8_fpm(vd8, v16, v16, fpm);
+// expected-error@-1 {{'vdotq_f16_mf8_fpm' requires target feature 'fp8dot2'}}
+  (void) vdot_lane_f16_mf8_fpm(vd4, v8, v8, 3, fpm);
+// expected-error@-1 {{'__builtin_neon_vdot_lane_f16_mf8_fpm' needs target feature fp8dot2,neon}}
+  (void) vdot_laneq_f16_mf8_fpm(vd4, v8, v16, 7, fpm);
+// expected-error@-1 {{'__builtin_neon_vdot_laneq_f16_mf8_fpm' needs target feature fp8dot2,neon}}
+  (void) vdotq_lane_f16_mf8_fpm(vd8, v16, v8, 3, fpm);
+// expected-error@-1 {{'__builtin_neon_vdotq_lane_f16_mf8_fpm' needs target feature fp8dot2,neon}}
+  (void) vdotq_laneq_f16_mf8_fpm(vd8, v16, v16, 7, fpm);
+// expected-error@-1 {{'__builtin_neon_vdotq_laneq_f16_mf8_fpm' needs target feature fp8dot2,neon}}
+
+  (void) vdot_f32_mf8_fpm(va2, v8, v8, fpm);
+// expected-error@-1 {{'vdot_f32_mf8_fpm' requires target feature 'fp8dot4'}}
+  (void) vdotq_f32_mf8_fpm(va4, v16, v16, fpm);
+// expected-error@-1 {{'vdotq_f32_mf8_fpm' requires target feature 'fp8dot4'}}
+  (void) vdot_lane_f32_mf8_fpm(va2, v8, v8, 1, fpm);
+// expected-error@-1 {{'__builtin_neon_vdot_lane_f32_mf8_fpm' needs target feature fp8dot4,neon}}
+  (void) vdot_laneq_f32_mf8_fpm(va2, v8, v16, 3, fpm);
+// expected-error@-1 {{'__builtin_neon_vdot_laneq_f32_mf8_fpm' needs target feature fp8dot4,neon}}
+  (void) vdotq_lane_f32_mf8_fpm(va4, v16, v8, 1, fpm);
+// expected-error@-1 {{'__builtin_neon_vdotq_lane_f32_mf8_fpm' needs target feature fp8dot4,neon}}
+  (void) vdotq_laneq_f32_mf8_fpm(va4, v16, v16, 3, fpm);
+// expected-error@-1 {{'__builtin_neon_vdotq_laneq_f32_mf8_fpm' needs target feature fp8dot4,neon}}
+}
+
+void test_imm(float16x4_t vd4, float16x8_t vd8, float32x2_t va2, float32x4_t va4,
+              mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) {
+  (void) vdot_lane_f16_mf8_fpm(vd4, v8, v8, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
+  (void) vdot_laneq_f16_mf8_fpm(vd4, v8, v16, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
+  (void) vdotq_lane_f16_mf8_fpm(vd8, v16, v8, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
+  (void) vdotq_laneq_f16_mf8_fpm(vd8, v16, v16, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
+  (void) vdot_lane_f32_mf8_fpm(va2, v8, v8, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
+  (void) vdot_laneq_f32_mf8_fpm(va2, v8, v16, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
+  (void) vdotq_lane_f32_mf8_fpm(va4, v16, v8, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
+  (void) vdotq_laneq_f32_mf8_fpm(va4, v16, v16, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 3f841f86f31d8ff..2c7ef00edfa3017 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1026,6 +1026,27 @@ def int_aarch64_st64bv0: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], dat
                              llvm_anyvector_ty,
                              LLVMMatchType<1>],
                             [IntrReadMem, IntrInaccessibleMemOnly]>;
+
+  // Dot-product
+  class AdvSIMD_FP8_DOT_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMMatchType<0>,
+                             llvm_anyvector_ty,
+                             LLVMMatchType<1>],
+                             [IntrReadMem, IntrInaccessibleMemOnly]>;
+  class AdvSIMD_FP8_DOT_LANE_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMMatchType<0>,
+                             llvm_anyvector_ty,
+                             llvm_v16i8_ty,
+                             llvm_i32_ty],
+                             [IntrReadMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<3>>]>;
+
+  def int_aarch64_neon_fp8_fdot2 : AdvSIMD_FP8_DOT_Intrinsic;
+  def int_aarch64_neon_fp8_fdot2_lane : AdvSIMD_FP8_DOT_LANE_Intrinsic;
+
+  def int_aarch64_neon_fp8_fdot4 : AdvSIMD_FP8_DOT_Intrinsic;
+  def int_aarch64_neon_fp8_fdot4_lane : AdvSIMD_FP8_DOT_LANE_Intrinsic;
 }
 
 def llvm_nxv1i1_ty  : LLVMType<nxv1i1>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 0ecc35f61903ba9..78cfdb412416d2e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6584,19 +6584,22 @@ multiclass SIMD_FP8_CVTN_F32<string asm, SDPatternOperator Op> {
             (!cast<Instruction>(NAME # 2v16f8) V128:$_Rd, V128:$Rn, V128:$Rm)>;
 }
 
-// TODO: Create a new Value Type v8f8 and v16f8
-multiclass SIMDThreeSameVectorDOT2<string asm> {
-   def v4f16 : BaseSIMDThreeSameVectorDot<0b0, 0b0, 0b01, 0b1111, asm, ".4h", ".8b",
-                                          V64, v4f16, v8i8, null_frag>;
-   def v8f16 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b01, 0b1111, asm, ".8h", ".16b",
-                                          V128, v8f16, v16i8, null_frag>;
+multiclass SIMD_FP8_Dot2<string asm, SDPatternOperator op> {
+  let Uses = [FPMR, FPCR], mayLoad = 1 in {
+    def v4f16 : BaseSIMDThreeSameVectorDot<0b0, 0b0, 0b01, 0b1111, asm, ".4h", ".8b",
+                                           V64, v4f16, v8i8, op>;
+    def v8f16 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b01, 0b1111, asm, ".8h", ".16b",
+                                           V128, v8f16, v16i8, op>;
+  }
 }
 
-multiclass SIMDThreeSameVectorDOT4<string asm> {
-   def v2f32 : BaseSIMDThreeSameVectorDot<0b0, 0b0, 0b00, 0b1111, asm, ".2s", ".8b",
-                                          V64, v2f32, v8i8, null_frag>;
-   def v4f32 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1111, asm, ".4s", ".16b",
-                                          V128, v4f32, v16i8, null_frag>;
+multiclass SIMD_FP8_Dot4<string asm, SDPatternOperator op> {
+  let Uses = [FPMR, FPCR], mayLoad = 1 in {
+    def v2f32 : BaseSIMDThreeSameVectorDot<0b0, 0b0, 0b00, 0b1111, asm, ".2s", ".8b",
+                                           V64, v2f32, v8i8, op>;
+    def v4f32 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1111, asm, ".4s", ".16b",
+                                           V128, v4f32, v16i8, op>;
+  }
 }
 
 let mayRaiseFPException = 1, Uses = [FPCR] in
@@ -9140,15 +9143,16 @@ class BaseSIMDThreeSameVectorIndexS<bit Q, bit U, bits<2> size, bits<4> opc, str
                                     string dst_kind, string lhs_kind, string rhs_kind,
                                     RegisterOperand RegType,
                                     ValueType AccumType, ValueType InputType,
+                                    AsmVectorIndexOpnd VIdx,
                                     SDPatternOperator OpNode> :
         BaseSIMDIndexedTied<Q, U, 0b0, size, opc, RegType, RegType, V128,
-                            VectorIndexS, asm, "", dst_kind, lhs_kind, rhs_kind,
+                            VIdx, asm, "", dst_kind, lhs_kind, rhs_kind,
         [(set (AccumType RegType:$dst),
               (AccumType (OpNode (AccumType RegType:$Rd),
                                  (InputType RegType:$Rn),
                                  (InputType (bitconvert (AccumType
                                     (AArch64duplane32 (v4i32 V128:$Rm),
-                                        VectorIndexS:$idx)))))))]> {
+                                        VIdx:$idx)))))))]> {
   bits<2> idx;
   let Inst{21}    = idx{0};  // L
   let Inst{11}    = idx{1};  // H
@@ -9157,17 +9161,24 @@ class BaseSIMDThreeSameVectorIndexS<bit Q, bit U, bits<2> size, bits<4> opc, str
 multiclass SIMDThreeSameVectorDotIndex<bit U, bit Mixed, bits<2> size, string asm,
                                        SDPatternOperator OpNode> {
   def v8i8  : BaseSIMDThreeSameVectorIndexS<0, U, size, {0b111, Mixed}, asm, ".2s", ".8b", ".4b",
-                                              V64, v2i32, v8i8, OpNode>;
+                                              V64, v2i32, v8i8, VectorIndexS, OpNode>;
   def v16i8 : BaseSIMDThreeSameVectorIndexS<1, U, size, {0b111, Mixed}, asm, ".4s", ".16b", ".4b",
-                                              V128, v4i32, v16i8, OpNode>;
+                                              V128, v4i32, v16i8, VectorIndexS, OpNode>;
 }
 
-// TODO: The vectors v8i8 and v16i8 should be v8f8 and v16f8
-multiclass SIMDThreeSameVectorFP8DOT4Index<string asm> {
-  def v8f8 : BaseSIMDThreeSameVectorIndexS<0b0, 0b0, 0b00, 0b0000, asm, ".2s", ".8b", ".4b",
-                                           V64, v2f32, v8i8, null_frag>;
-  def v16f8 : BaseSIMDThreeSameVectorIndexS<0b1, 0b0, 0b00, 0b0000, asm, ".4s", ".16b",".4b",
-                                            V128, v4f32, v16i8, null_frag>;
+multiclass SIMD_FP8_Dot4_Index<string asm, SDPatternOperator op> {
+  let Uses = [FPMR, FPCR], mayLoad = 1 in {
+    def v2f32 : BaseSIMDThreeSameVectorIndexS<0b0, 0b0, 0b00, 0b0000, asm, ".2s", ".8b", ".4b",
+                                              V64, v2f32, v8i8, VectorIndexS32b_timm, null_frag>;
+    def v4f32 : BaseSIMDThreeSameVectorIndexS<0b1, 0b0, 0b00, 0b0000, asm, ".4s", ".16b",".4b",
+                                              V128, v4f32, v16i8, VectorIndexS32b_timm, null_frag>;
+  }
+
+  def : Pat<(v2f32 (op (v2f32 V64:$Rd), (v8i8 V64:$Rn), (v16i8 V128:$Rm), VectorIndexS32b_timm:$Idx)),
+            (!cast<Instruction>(NAME # v2f32) $Rd, $Rn, $Rm, $Idx)>;
+
+  def : Pat<(v4f32 (op (v4f32 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm), VectorIndexS32b_timm:$Idx)),
+            (!cast<Instruction>(NAME # v4f32) $Rd, $Rn, $Rm, $Idx)>;
 }
 
 // ARMv8.2-A Fused Multiply Add-Long Instructions (Indexed)
@@ -9176,14 +9187,15 @@ class BaseSIMDThreeSameVectorIndexH<bit Q, bit U, bits<2> sz, bits<4> opc, strin
                                       string dst_kind, string lhs_kind,
                                       string rhs_kind, RegisterOperand RegType,
                                       RegisterOperand RegType_lo, ValueType AccumType,
-                                      ValueType InputType, SDPatternOperator OpNode> :
+                                      ValueType InputType, AsmVectorIndexOpnd VIdx,
+                                      SDPatternOperator OpNode> :
         BaseSIMDIndexedTied<Q, U, 0, sz, opc, RegType, RegType, RegType_lo,
-                            VectorIndexH, asm, "", dst_kind, lhs_kind, rhs_kind,
+                            VIdx, asm, "", dst_kind, lhs_kind, rhs_kind,
           [(set (AccumType RegType:$dst),
                 (AccumType (OpNode (AccumType RegType:$Rd),
                                    (InputType RegType:$Rn),
                                    (InputType (AArch64duplane16 (v8f16 V128_lo:$Rm),
-                                                VectorIndexH:$idx)))))]> {
+                                                VIdx:$idx)))))]> {
   // idx = H:L:M
   bits<3> idx;
   let Inst{11} = idx{2}; // H
@@ -9194,19 +9206,25 @@ class BaseSIMDThreeSameVectorIndexH<bit Q, bit U, bits<2> sz, bits<4> opc, strin
 multiclass SIMDThreeSameVectorFMLIndex<bit U, bits<4> opc, string asm,
                                        SDPatternOperator OpNode> {
   def v4f16 : BaseSIMDThreeSameVectorIndexH<0, U, 0b10, opc, asm, ".2s", ".2h", ".h",
-                                              V64, V128_lo, v2f32, v4f16, OpNode>;
+                                              V64, V128_lo, v2f32, v4f16, VectorIndexH, OpNode>;
   def v8f16 : BaseSIMDThreeSameVectorIndexH<1, U, 0b10, opc, asm, ".4s", ".4h", ".h",
-                                              V128, V128_lo, v4f32, v8f16, OpNode>;
+                                              V128, V128_lo, v4f32, v8f16, VectorIndexH, OpNode>;
 }
 
 //----------------------------------------------------------------------------
 // FP8 Advanced SIMD vector x indexed element
-// TODO: Replace value types v8i8 and v16i8 by v8f8 and v16f8
-multiclass SIMDThreeSameVectorFP8DOT2Index<string asm> {
-  def v4f16 : BaseSIMDThreeSameVectorIndexH<0b0, 0b0, 0b01, 0b0000, asm, ".4h", ".8b", ".2b",
-                                            V64, V128_lo, v4f16, v8i8, null_frag>;
-  def v8f16 : BaseSIMDThreeSameVectorIndexH<0b1, 0b0, 0b01, 0b0000, asm, ".8h", ".16b", ".2b",
-                                            V128, V128_lo, v8f16, v8i16, null_frag>;
+multiclass SIMD_FP8_Dot2_Index<string asm, SDPatternOperator op> {
+  let Uses = [FPMR, FPCR], mayLoad = 1 in {
+    def v4f16 : BaseSIMDThreeSameVectorIndexH<0b0, 0b0, 0b01, 0b0000, asm, ".4h", ".8b", ".2b",
+                                              V64, V128_lo, v4f16, v8i8, VectorIndexH32b_timm, null_frag>;
+    def v8f16 : BaseSIMDThreeSameVectorIndexH<0b1, 0b0, 0b01, 0b0000, asm, ".8h", ".16b", ".2b",
+                                              V128, V128_lo, v8f16, v16i8, VectorIndexH32b_timm, null_frag>;
+  }
+  def : Pat<(v4f16 (op (v4f16 V64:$Rd), (v8i8 V64:$Rn), (v16i8 V128_lo:$Rm), VectorIndexH32b_timm:$Idx)),
+            (!cast<Instruction>(NAME # v4f16) $Rd, $Rn, $Rm, $Idx)>;
+
+  def : Pat<(v8f16 (op (v8f16 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128_lo:$Rm), VectorIndexH32b_timm:$Idx)),
+            (!cast<Instruction>(NAME # v8f16) $Rd, $Rn, $Rm, $Idx)>;
 }
 
 multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index b415d843ac2e2d7..463a8f2f95e33e1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1489,7 +1489,7 @@ class BaseSIMDSUDOTIndex<bit Q, string dst_kind, string lhs_kind,
                          ValueType AccumType, ValueType InputType>
       : BaseSIMDThreeSameVectorIndexS<Q, 0, 0b00, 0b1111, "sudot", dst_kind,
                                         lhs_kind, rhs_kind, RegType, AccumType,
-                                        InputType, null_frag> {
+                                        InputType, VectorIndexS, null_frag> {
   let Pattern = [(set (AccumType RegType:$dst),
                       (AccumType (AArch64usdot (AccumType RegType:$Rd),
                                  (InputType (bitconvert (AccumType
@@ -10346,14 +10346,14 @@ let Uses = [FPMR, FPCR], Predicates = [HasFP8FMA] in {
  defm FMLALLTT : SIMDThreeSameVectorMLAL<0b1, 0b01, "fmlalltt">;
 } // End let Predicates = [HasFP8FMA]
 
-let Uses = [FPMR, FPCR], Predicates = [HasFP8DOT2] in {
- defm FDOTlane : SIMDThreeSameVectorFP8DOT2Index<"fdot">;
- defm FDOT : SIMDThreeSameVectorDOT2<"fdot">;
+let Predicates = [HasFP8DOT2] in {
+ defm FDOTlane : SIMD_FP8_Dot2_Index<"fdot", int_aarch64_neon_fp8_fdot2_lane>;
+ defm FDOT : SIMD_FP8_Dot2<"fdot", int_aarch64_neon_fp8_fdot2>;
 } // End let Predicates = [HasFP8DOT2]
 
-let Uses = [FPMR, FPCR], Predicates = [HasFP8DOT4] in {
- defm FDOTlane : SIMDThreeSameVectorFP8DOT4Index<"fdot">;
- defm FDOT : SIMDThreeSameVectorDOT4<"fdot">;
+let Predicates = [HasFP8DOT4] in {
+ defm FDOTlane : SIMD_FP8_Dot4_Index<"fdot", int_aarch64_neon_fp8_fdot4_lane>;
+ defm FDOT : SIMD_FP8_Dot4<"fdot", int_aarch64_neon_fp8_fdot4>;
 } // End let Predicates = [HasFP8DOT4]
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll b/llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll
new file mode 100644
index 000000000000000..b7a35c5fddf1707
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8dot2,+fp8dot4 < %s | FileCheck %s
+
+define <4 x half> @test_fdot_f16(<4 x half> %vd, <8 x i8> %vn, <8 x i8> %vm) {
+; CHECK-LABEL: test_fdot_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.4h, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %res = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> %vd, <8 x i8> %vn, <8 x i8> %vm)
+  ret <4 x half> %res
+}
+
+define <8 x half> @test_fdotq_f16(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fdotq_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.8h, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %res = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm)
+  ret <8 x half> %res
+}
+
+define <4 x half> @test_fdot_lane_f16(<4 x half> %vd, <8 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fdot_lane_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.4h, v1.8b, v2.2b[0]
+; CHECK-NEXT:    ret
+  %res = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> %vd, <8 x i8> %vn, <16 x i8> %vm, i32 0)
+  ret <4 x half> %res
+}
+
+define <8 x half> @test_fdotq_lane_f16(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fdotq_lane_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.8h, v1.16b, v2.2b[7]
+; CHECK-NEXT:    ret
+  %res = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm, i32 7)
+  ret <8 x half> %res
+}
+
+define <2 x float> @test_fdot_f32(<2 x float> %vd, <8 x i8> %vn, <8 x i8> %vm) {
+; CHECK-LABEL: test_fdot_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.2s, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %res = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> %vd, <8 x i8> %vn, <8 x i8> %vm)
+  ret <2 x float> %res
+}
+
+define <4 x float> @test_fdotq_f32(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fdotq_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %res = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm)
+  ret <4 x float> %res
+}
+
+define <2 x float> @test_fdot_lane_f32(<2 x float> %vd, <8 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fdot_lane_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.2s, v1.8b, v2.4b[0]
+; CHECK-NEXT:    ret
+  %res = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> %vd, <8 x i8> %vn, <16 x i8> %vm, i32 0)
+  ret <2 x float> %res
+}
+
+define <4 x float> @test_fdotq_lane_f32(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fdotq_lane_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.4s, v1.16b, v2.4b[3]
+; CHECK-NEXT:    ret
+  %res = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm, i32 3)
+  ret <4 x float> %res
+}

>From 4b0118447e342661972b1a1b59f1c7d579d91fb2 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 17 Dec 2024 13:23:31 +0000
Subject: [PATCH 14/19] [fixup] Remove unneeded argument (NFC)

---
 clang/lib/CodeGen/CGBuiltin.cpp     | 15 ++++++---------
 clang/lib/CodeGen/CodeGenFunction.h |  3 +--
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 9c1cef83103d814..69ce28ecb642e0c 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6907,8 +6907,7 @@ Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
 
 llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
     unsigned IID, bool ExtendLane, llvm::Type *RetTy,
-    SmallVectorImpl<llvm::Value *> &Ops, unsigned ICEArguments,
-    const CallExpr *E, const char *name) {
+    SmallVectorImpl<llvm::Value *> &Ops, const CallExpr *E, const char *name) {
 
   const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
                              RetTy->getPrimitiveSizeInBits();
@@ -6920,7 +6919,7 @@ llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
                                         Builder.getInt64(0));
   }
   llvm::Value *FPM =
-      EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
   return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
 }
 
@@ -14156,7 +14155,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vdot_f16_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_f16_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2, false, HalfTy,
-                               Ops, ICEArguments, E, "fdot2");
+                               Ops, E, "fdot2");
   case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm:
     ExtendLane = true;
@@ -14164,12 +14163,11 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2_lane,
-                               ExtendLane, HalfTy, Ops, ICEArguments, E,
-                               "fdot2_lane");
+                               ExtendLane, HalfTy, Ops, E, "fdot2_lane");
   case NEON::BI__builtin_neon_vdot_f32_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4, false,
-                               FloatTy, Ops, ICEArguments, E, "fdot4");
+                               FloatTy, Ops, E, "fdot4");
   case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm:
     ExtendLane = true;
@@ -14177,8 +14175,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
-                               ExtendLane, FloatTy, Ops, ICEArguments, E,
-                               "fdot4_lane");
+                               ExtendLane, FloatTy, Ops, E, "fdot4_lane");
   case NEON::BI__builtin_neon_vamin_f16:
   case NEON::BI__builtin_neon_vaminq_f16:
   case NEON::BI__builtin_neon_vamin_f32:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index b5f690b2f6ee0d7..9798a8f55072442 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4692,8 +4692,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   llvm::Value *EmitFP8NeonFDOTCall(unsigned IID, bool ExtendLane,
                                    llvm::Type *RetTy,
                                    SmallVectorImpl<llvm::Value *> &Ops,
-                                   unsigned ICEArguments, const CallExpr *E,
-                                   const char *name);
+                                   const CallExpr *E, const char *name);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
                              const llvm::ElementCount &Count);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);

>From b57c87e10d69f5acf4209003f082a0191f899e2e Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 3 Jan 2025 12:02:55 +0000
Subject: [PATCH 15/19] [fixup] Update intrinsic declarations

---
 clang/include/clang/Basic/arm_neon.td | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 5d1267213f238a8..6952cbd93bc9145 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2148,25 +2148,23 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
 }
 
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot2,neon" in {
-  def VDOT_F16_MF8  : VInst<"vdot_f16_mf8_fpm", "(>F)(>F)..V", "m">;
-  def VDOTQ_F16_MF8 : VInst<"vdotq_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+  def VDOT_F16_MF8 : VInst<"vdot_f16_mf8_fpm", "(>F)(>F)..V", "mQm">;
 
-  def VDOT_LANE_F16_MF8 : VInst<"vdot_lane_f16_mf8_fpm", "(>F)(>F)..IV", "m",   [ImmCheck<3, ImmCheck0_3, 0>]>;
-  def VDOT_LANEQ_F16_MF8 : VInst<"vdot_laneq_f16_mf8_fpm", "(>F)(>F).QIV", "m",   [ImmCheck<3, ImmCheck0_7, 0>]>;
+  def VDOT_LANE_F16_MF8  : VInst<"vdot_lane_f16_mf8_fpm",  "(>F)(>F)..IV", "m", [ImmCheck<3, ImmCheck0_3, 0>]>;
+  def VDOT_LANEQ_F16_MF8 : VInst<"vdot_laneq_f16_mf8_fpm", "(>F)(>F).QIV", "m", [ImmCheck<3, ImmCheck0_7, 0>]>;
 
-  def VDOTQ_LANE_F16_MF8 : VInst<"vdotq_lane_f16_mf8_fpm", "(>F)(>F).qIV", "Qm",   [ImmCheck<3, ImmCheck0_3, 0>]>;
-  def VDOTQ_LANEQ_F16_MF8 : VInst<"vdotq_laneq_f16_mf8_fpm", "(>F)(>F)..IV", "Qm",   [ImmCheck<3, ImmCheck0_7, 0>]>;
+  def VDOTQ_LANE_F16_MF8  : VInst<"vdot_lane_f16_mf8_fpm",  "(>F)(>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
+  def VDOTQ_LANEQ_F16_MF8 : VInst<"vdot_laneq_f16_mf8_fpm", "(>F)(>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_7, 0>]>;
 }
 
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot4,neon" in {
-  def VDOT_F32_MF8  : VInst<"vdot_f32_mf8_fpm", "(>>F)(>>F)..V", "m">;
-  def VDOTQ_F32_MF8 : VInst<"vdotq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+  def VDOT_F32_MF8 : VInst<"vdot_f32_mf8_fpm", "(>>F)(>>F)..V", "mQm">;
 
-  def VDOT_LANE_F32_MF8 : VInst<"vdot_lane_f32_mf8_fpm", "(>>F)(>>F)..IV", "m",   [ImmCheck<3, ImmCheck0_1, 0>]>;
-  def VDOT_LANEQ_F32_MF8 : VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F).QIV", "m",   [ImmCheck<3, ImmCheck0_3, 0>]>;
+  def VDOT_LANE_F32_MF8  : VInst<"vdot_lane_f32_mf8_fpm",  "(>>F)(>>F)..IV", "m", [ImmCheck<3, ImmCheck0_1, 0>]>;
+  def VDOT_LANEQ_F32_MF8 : VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F).QIV", "m", [ImmCheck<3, ImmCheck0_3, 0>]>;
 
-  def VDOTQ_LANE_F32_MF8 : VInst<"vdotq_lane_f32_mf8_fpm", "(>>F)(>>F).qIV", "Qm",   [ImmCheck<3, ImmCheck0_1, 0>]>;
-  def VDOTQ_LANEQ_F32_MF8 : VInst<"vdotq_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm",   [ImmCheck<3, ImmCheck0_3, 0>]>;
+  def VDOTQ_LANE_F32_MF8  : VInst<"vdot_lane_f32_mf8_fpm",  "(>>F)(>>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_1, 0>]>;
+  def VDOTQ_LANEQ_F32_MF8 : VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
 }
 
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {

>From d7896c34fc681c020b39a5a6d7ddfc403d915d20 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 14 Jan 2025 10:35:21 +0000
Subject: [PATCH 16/19] [fixup] Add C++ runs to tests, remove some opt passes

---
 .../fp8-intrinsics/acle_neon_fp8_fdot.c       | 171 +++++++++++++++---
 1 file changed, 141 insertions(+), 30 deletions(-)

diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c
index b273bc2abe87797..4b565adf1786176 100644
--- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c
@@ -1,18 +1,30 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
 
-// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX
 
-// REQUIES: aarch64-registered-target
+// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -O3 -Werror -Wall -S -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
 
 #include <arm_neon.h>
 
 // CHECK-LABEL: define dso_local <4 x half> @test_vdot_f16(
 // CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT:    [[FDOT21_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT21_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
 // CHECK-NEXT:    ret <4 x half> [[FDOT21_I]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <4 x half> @_Z13test_vdot_f1613__Float16x4_tu13__MFloat8x8_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT:    [[FDOT21_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <4 x half> [[FDOT21_I]]
+//
 float16x4_t test_vdot_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
   return vdot_f16_mf8_fpm(vd, vn, vm, fpmr);
 }
@@ -20,10 +32,19 @@ float16x4_t test_vdot_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t
 // CHECK-LABEL: define dso_local <8 x half> @test_vdotq_f16(
 // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT:    [[FDOT21_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT21_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-NEXT:    ret <8 x half> [[FDOT21_I]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z14test_vdotq_f1613__Float16x8_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT:    [[FDOT21_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[FDOT21_I]]
+//
 float16x8_t test_vdotq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
   return vdotq_f16_mf8_fpm(vd, vn, vm, fpmr);
 }
@@ -31,11 +52,23 @@ float16x8_t test_vdotq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm
 // CHECK-LABEL: define dso_local <4 x half> @test_vdot_lane_f16(
 // CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3)
 // CHECK-NEXT:    ret <4 x half> [[FDOT2_LANE1]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <4 x half> @_Z18test_vdot_lane_f1613__Float16x4_tu13__MFloat8x8_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-CXX-NEXT:    [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3)
+// CHECK-CXX-NEXT:    ret <4 x half> [[FDOT2_LANE1]]
+//
 float16x4_t test_vdot_lane_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
   return vdot_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
 }
@@ -43,10 +76,21 @@ float16x4_t test_vdot_lane_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, f
 // CHECK-LABEL: define dso_local <4 x half> @test_vdot_laneq_f16(
 // CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
 // CHECK-NEXT:    ret <4 x half> [[FDOT2_LANE1]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <4 x half> @_Z19test_vdot_laneq_f1613__Float16x4_tu13__MFloat8x8_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-CXX-NEXT:    [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-CXX-NEXT:    ret <4 x half> [[FDOT2_LANE1]]
+//
 float16x4_t test_vdot_laneq_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
   return vdot_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
 }
@@ -54,11 +98,23 @@ float16x4_t test_vdot_laneq_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm,
 // CHECK-LABEL: define dso_local <8 x half> @test_vdotq_lane_f16(
 // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3)
 // CHECK-NEXT:    ret <8 x half> [[FDOT2_LANE1]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z19test_vdotq_lane_f1613__Float16x8_tu14__MFloat8x16_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-CXX-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-CXX-NEXT:    [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3)
+// CHECK-CXX-NEXT:    ret <8 x half> [[FDOT2_LANE1]]
+//
 float16x8_t test_vdotq_lane_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
   return vdotq_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
 }
@@ -66,10 +122,21 @@ float16x8_t test_vdotq_lane_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm,
 // CHECK-LABEL: define dso_local <8 x half> @test_vdotq_laneq_f16(
 // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
 // CHECK-NEXT:    ret <8 x half> [[FDOT2_LANE1]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z20test_vdotq_laneq_f1613__Float16x8_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT:    [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-CXX-NEXT:    [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-CXX-NEXT:    ret <8 x half> [[FDOT2_LANE1]]
+//
 float16x8_t test_vdotq_laneq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
   return vdotq_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
 }
@@ -77,10 +144,17 @@ float16x8_t test_vdotq_laneq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t v
 // CHECK-LABEL: define dso_local <2 x float> @test_vdot_f32(
 // CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT:    [[FDOT4_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
 // CHECK-NEXT:    ret <2 x float> [[FDOT4_I]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z13test_vdot_f3213__Float32x2_tu13__MFloat8x8_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT:    [[FDOT4_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <2 x float> [[FDOT4_I]]
+//
 float32x2_t test_vdot_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
   return vdot_f32_mf8_fpm(vd, vn, vm, fpmr);
 }
@@ -88,10 +162,17 @@ float32x2_t test_vdot_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t
 // CHECK-LABEL: define dso_local <4 x float> @test_vdotq_f32(
 // CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT:    [[FDOT4_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-NEXT:    ret <4 x float> [[FDOT4_I]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z14test_vdotq_f3213__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT:    [[FDOT4_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <4 x float> [[FDOT4_I]]
+//
 float32x4_t test_vdotq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
   return vdotq_f32_mf8_fpm(vd, vn, vm, fpmr);
 }
@@ -99,11 +180,19 @@ float32x4_t test_vdotq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm
 // CHECK-LABEL: define dso_local <2 x float> @test_vdot_lane_f32(
 // CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
 // CHECK-NEXT:    ret <2 x float> [[FDOT4_LANE]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z18test_vdot_lane_f3213__Float32x2_tu13__MFloat8x8_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT:    [[FDOT4_LANE:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
+// CHECK-CXX-NEXT:    ret <2 x float> [[FDOT4_LANE]]
+//
 float32x2_t test_vdot_lane_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
   return vdot_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
 }
@@ -111,10 +200,17 @@ float32x2_t test_vdot_lane_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, f
 // CHECK-LABEL: define dso_local <2 x float> @test_vdot_laneq_f32(
 // CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
 // CHECK-NEXT:    ret <2 x float> [[FDOT4_LANE]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z19test_vdot_laneq_f3213__Float32x2_tu13__MFloat8x8_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT:    [[FDOT4_LANE:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
+// CHECK-CXX-NEXT:    ret <2 x float> [[FDOT4_LANE]]
+//
 float32x2_t test_vdot_laneq_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
   return vdot_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
 }
@@ -122,11 +218,19 @@ float32x2_t test_vdot_laneq_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x16_t vm,
 // CHECK-LABEL: define dso_local <4 x float> @test_vdotq_lane_f32(
 // CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
 // CHECK-NEXT:    ret <4 x float> [[FDOT4_LANE]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z19test_vdotq_lane_f3213__Float32x4_tu14__MFloat8x16_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT:    [[FDOT4_LANE:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
+// CHECK-CXX-NEXT:    ret <4 x float> [[FDOT4_LANE]]
+//
 float32x4_t test_vdotq_lane_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
   return vdotq_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
 }
@@ -134,10 +238,17 @@ float32x4_t test_vdotq_lane_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm,
 // CHECK-LABEL: define dso_local <4 x float> @test_vdotq_laneq_f32(
 // CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
 // CHECK-NEXT:    ret <4 x float> [[FDOT4_LANE]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z20test_vdotq_laneq_f3213__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT:    [[FDOT4_LANE:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
+// CHECK-CXX-NEXT:    ret <4 x float> [[FDOT4_LANE]]
+//
 float32x4_t test_vdotq_laneq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
   return vdotq_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
 }

>From 8a1a735114c5688015053a85ba93603bff763417 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 17 Dec 2024 17:10:38 +0000
Subject: [PATCH 17/19] [AArch64] Implement NEON FP8 fused multiply-add
 intrinsics (non-indexed)

This patch adds the following intrinsics:

    float16x8_t vmlalbq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, fpm_t)
    float16x8_t vmlaltq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, fpm_t)

    float32x4_t vmlallbbq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t)
    float32x4_t vmlallbtq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t)
    float32x4_t vmlalltbq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t)
    float32x4_t vmlallttq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t)
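
As a usage sketch (a hypothetical example, not part of this patch; it
assumes a compiler and `arm_neon.h` with FP8 support and the `fp8fma`
target feature), the new intrinsics follow the usual `_fpm` convention:
the trailing `fpm_t` argument configures the FP8 formats by being
written to the FPMR register before the operation:

    #include <arm_neon.h>

    // Hypothetical caller: widening multiply-add of FP8 elements into
    // an FP16 accumulator. `fpm` is materialized into FPMR, then the
    // corresponding FMLALB instruction performs the fused multiply-add.
    float16x8_t mla_bottom(float16x8_t acc, mfloat8x16_t vn,
                           mfloat8x16_t vm, fpm_t fpm) {
      return vmlalbq_f16_mf8_fpm(acc, vn, vm, fpm);
    }
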
---
 clang/include/clang/Basic/arm_neon.td         |  11 ++
 clang/lib/CodeGen/CGBuiltin.cpp               |  43 +++++--
 clang/lib/CodeGen/CodeGenFunction.h           |   4 +-
 .../fp8-intrinsics/acle_neon_fp8_fmla.c       | 117 ++++++++++++++++++
 .../acle_neon_fp8_fmla.c                      |  22 ++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  17 +++
 .../lib/Target/AArch64/AArch64InstrFormats.td |   9 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  14 +--
 llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll    |  56 +++++++++
 9 files changed, 271 insertions(+), 22 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c
 create mode 100644 clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c
 create mode 100644 llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 6952cbd93bc9145..06c4636916bd1a3 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2167,6 +2167,17 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot4,neon" in {
   def VDOTQ_LANEQ_F32_MF8 : VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
 }
 
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8fma,neon" in {
+  def VMLALB_F16_F8 : VInst<"vmlalbq_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+  def VMLALT_F16_F8 : VInst<"vmlaltq_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+
+  def VMLALLBB_F32_F8 : VInst<"vmlallbbq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+  def VMLALLBT_F32_F8 : VInst<"vmlallbtq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+  def VMLALLTB_F32_F8 : VInst<"vmlalltbq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+  def VMLALLTT_F32_F8 : VInst<"vmlallttq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+}
+
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
   def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
   def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 69ce28ecb642e0c..d6dc01d4beecc2d 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6898,11 +6898,14 @@ Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
     return Builder.CreateCall(F, Ops, name);
 }
 
-Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
+Value *CodeGenFunction::EmitFP8NeonCall(unsigned IID,
+                                        ArrayRef<llvm::Type *> Tys,
                                         SmallVectorImpl<Value *> &Ops,
-                                        Value *FPM, const char *name) {
+                                        const CallExpr *E, const char *name) {
+  llvm::Value *FPM =
+      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
   Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), FPM);
-  return EmitNeonCall(F, Ops, name);
+  return EmitNeonCall(CGM.getIntrinsic(IID, Tys), Ops, name);
 }
 
 llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
@@ -6918,9 +6921,7 @@ llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
     Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
                                         Builder.getInt64(0));
   }
-  llvm::Value *FPM =
-      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
-  return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+  return EmitFP8NeonCall(IID, Tys, Ops, E, name);
 }
 
 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
@@ -6941,9 +6942,7 @@ Value *CodeGenFunction::EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
     Tys[1] = llvm::FixedVectorType::get(Int8Ty, 8);
     Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
   }
-  llvm::Value *FPM =
-      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
-  return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+  return EmitFP8NeonCall(IID, Tys, Ops, E, name);
 }
 
 // Right-shift a vector by a constant.
@@ -14176,6 +14175,32 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
                                ExtendLane, FloatTy, Ops, E, "fdot4_lane");
+
+  case NEON::BI__builtin_neon_vmlalbq_f16_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalb,
+                           {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
+                           "vmlal");
+  case NEON::BI__builtin_neon_vmlaltq_f16_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalt,
+                           {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
+                           "vmlal");
+  case NEON::BI__builtin_neon_vmlallbbq_f32_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlallbb,
+                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+                           "vmlall");
+  case NEON::BI__builtin_neon_vmlallbtq_f32_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlallbt,
+                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+                           "vmlall");
+  case NEON::BI__builtin_neon_vmlalltbq_f32_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalltb,
+                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+                           "vmlall");
+  case NEON::BI__builtin_neon_vmlallttq_f32_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalltt,
+                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+                           "vmlall");
+
   case NEON::BI__builtin_neon_vamin_f16:
   case NEON::BI__builtin_neon_vaminq_f16:
   case NEON::BI__builtin_neon_vamin_f32:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 9798a8f55072442..bc25b01ee727a1f 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4682,9 +4682,9 @@ class CodeGenFunction : public CodeGenTypeCache {
                             SmallVectorImpl<llvm::Value*> &O,
                             const char *name,
                             unsigned shift = 0, bool rightshift = false);
-  llvm::Value *EmitFP8NeonCall(llvm::Function *F,
+  llvm::Value *EmitFP8NeonCall(unsigned IID, ArrayRef<llvm::Type *> Tys,
                                SmallVectorImpl<llvm::Value *> &O,
-                               llvm::Value *FPM, const char *name);
+                               const CallExpr *E, const char *name);
   llvm::Value *EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
                                   llvm::Type *Ty1, bool Extract,
                                   SmallVectorImpl<llvm::Value *> &Ops,
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c
new file mode 100644
index 000000000000000..528bbd694518653
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c
@@ -0,0 +1,117 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -S -o /dev/null %s
+
+// REQUIES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vmlalb(
+// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLAL1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <8 x half> [[VMLAL1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z11test_vmlalb13__Float16x8_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLAL1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VMLAL1_I]]
+//
+float16x8_t test_vmlalb(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlalbq_f16_mf8_fpm(vd, vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vmlalt(
+// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLAL1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <8 x half> [[VMLAL1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z11test_vmlalt13__Float16x8_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLAL1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VMLAL1_I]]
+//
+float16x8_t test_vmlalt(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlaltq_f16_mf8_fpm(vd, vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlallbb(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z13test_vmlallbb13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+float32x4_t test_vmlallbb(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlallbbq_f32_mf8_fpm(vd, vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlallbt(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z13test_vmlallbt13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+float32x4_t test_vmlallbt(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlallbtq_f32_mf8_fpm(vd, vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlalltb(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z13test_vmlalltb13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+float32x4_t test_vmlalltb(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlalltbq_f32_mf8_fpm(vd, vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlalltt(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z13test_vmlalltt13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+float32x4_t test_vmlalltt(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlallttq_f32_mf8_fpm(vd, vn, vm, fpm);
+}
diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c
new file mode 100644
index 000000000000000..fcdd14e583101ed
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +faminmax -target-feature +fp8 -emit-llvm -verify %s -o /dev/null
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+void test_features(float16x8_t a, float32x4_t b, mfloat8x16_t u, fpm_t fpm) {
+
+  (void) vmlalbq_f16_mf8_fpm(a, u, u, fpm);
+  // expected-error at -1 {{'vmlalbq_f16_mf8_fpm' requires target feature 'fp8fma'}}
+  (void) vmlaltq_f16_mf8_fpm(a, u, u, fpm);
+  // expected-error at -1 {{'vmlaltq_f16_mf8_fpm' requires target feature 'fp8fma'}}
+  (void) vmlallbbq_f32_mf8_fpm(b, u, u, fpm);
+  // expected-error at -1 {{'vmlallbbq_f32_mf8_fpm' requires target feature 'fp8fma'}}
+  (void) vmlallbtq_f32_mf8_fpm(b, u, u, fpm);
+  // expected-error at -1 {{'vmlallbtq_f32_mf8_fpm' requires target feature 'fp8fma'}}
+  (void) vmlalltbq_f32_mf8_fpm(b, u, u, fpm);
+  // expected-error at -1 {{'vmlalltbq_f32_mf8_fpm' requires target feature 'fp8fma'}}
+  (void) vmlallttq_f32_mf8_fpm(b, u, u, fpm);
+  // expected-error at -1 {{'vmlallttq_f32_mf8_fpm' requires target feature 'fp8fma'}}
+}
+
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 2c7ef00edfa3017..8374debf245c64f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1047,6 +1047,23 @@ def int_aarch64_st64bv0: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], dat
 
   def int_aarch64_neon_fp8_fdot4 : AdvSIMD_FP8_DOT_Intrinsic;
   def int_aarch64_neon_fp8_fdot4_lane : AdvSIMD_FP8_DOT_LANE_Intrinsic;
+
+
+// Fused multiply-add
+  class AdvSIMD_FP8_FMLA_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMMatchType<0>,
+                             llvm_v16i8_ty,
+                             llvm_v16i8_ty],
+                             [IntrReadMem, IntrInaccessibleMemOnly]>;
+
+  def int_aarch64_neon_fp8_fmlalb : AdvSIMD_FP8_FMLA_Intrinsic;
+  def int_aarch64_neon_fp8_fmlalt : AdvSIMD_FP8_FMLA_Intrinsic;
+
+  def int_aarch64_neon_fp8_fmlallbb : AdvSIMD_FP8_FMLA_Intrinsic;
+  def int_aarch64_neon_fp8_fmlallbt : AdvSIMD_FP8_FMLA_Intrinsic;
+  def int_aarch64_neon_fp8_fmlalltb : AdvSIMD_FP8_FMLA_Intrinsic;
+  def int_aarch64_neon_fp8_fmlalltt : AdvSIMD_FP8_FMLA_Intrinsic;
 }
 
 def llvm_nxv1i1_ty  : LLVMType<nxv1i1>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 78cfdb412416d2e..03bb5775839e067 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6518,14 +6518,15 @@ multiclass SIMDThreeSameVectorFML<bit U, bit b13, bits<3> size, string asm,
                                          v4f32, v8f16, OpNode>;
 }
 
-multiclass SIMDThreeSameVectorMLA<bit Q, string asm>{
+multiclass SIMDThreeSameVectorMLA<bit Q, string asm, SDPatternOperator op> {
+
   def v8f16 : BaseSIMDThreeSameVectorDot<Q, 0b0, 0b11, 0b1111, asm, ".8h", ".16b",
-                                         V128, v8f16, v16i8, null_frag>;
+                                         V128, v8f16, v16i8, op>;
 }
 
-multiclass SIMDThreeSameVectorMLAL<bit Q, bits<2> sz, string asm>{
+multiclass SIMDThreeSameVectorMLAL<bit Q, bits<2> sz, string asm, SDPatternOperator op> {
   def v4f32 : BaseSIMDThreeSameVectorDot<Q, 0b0, sz, 0b1000, asm, ".4s", ".16b",
-                                         V128, v4f32, v16i8, null_frag>;
+                                         V128, v4f32, v16i8, op>;
 }
 
 // FP8 assembly/disassembly classes
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 463a8f2f95e33e1..e0b28906076e193 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10330,7 +10330,7 @@ let Predicates = [HasNEON, HasFAMINMAX] in {
  defm FAMIN : SIMDThreeSameVectorFP<0b1, 0b1, 0b011, "famin", AArch64famin>;
 } // End let Predicates = [HasNEON, HasFAMINMAX]
 
-let Uses = [FPMR, FPCR], Predicates = [HasFP8FMA] in {
+let Predicates = [HasFP8FMA], Uses = [FPMR, FPCR], mayLoad = 1 in {
  defm FMLALBlane : SIMDThreeSameVectorMLAIndex<0b0, "fmlalb">;
  defm FMLALTlane : SIMDThreeSameVectorMLAIndex<0b1, "fmlalt">;
  defm FMLALLBBlane : SIMDThreeSameVectorMLALIndex<0b0, 0b00, "fmlallbb">;
@@ -10338,12 +10338,12 @@ let Uses = [FPMR, FPCR], Predicates = [HasFP8FMA] in {
  defm FMLALLTBlane : SIMDThreeSameVectorMLALIndex<0b1, 0b00, "fmlalltb">;
  defm FMLALLTTlane : SIMDThreeSameVectorMLALIndex<0b1, 0b01, "fmlalltt">;
 
- defm FMLALB : SIMDThreeSameVectorMLA<0b0, "fmlalb">;
- defm FMLALT : SIMDThreeSameVectorMLA<0b1, "fmlalt">;
- defm FMLALLBB : SIMDThreeSameVectorMLAL<0b0, 0b00, "fmlallbb">;
- defm FMLALLBT : SIMDThreeSameVectorMLAL<0b0, 0b01, "fmlallbt">;
- defm FMLALLTB : SIMDThreeSameVectorMLAL<0b1, 0b00, "fmlalltb">;
- defm FMLALLTT : SIMDThreeSameVectorMLAL<0b1, 0b01, "fmlalltt">;
+ defm FMLALB : SIMDThreeSameVectorMLA<0b0, "fmlalb", int_aarch64_neon_fp8_fmlalb>;
+ defm FMLALT : SIMDThreeSameVectorMLA<0b1, "fmlalt", int_aarch64_neon_fp8_fmlalt>;
+ defm FMLALLBB : SIMDThreeSameVectorMLAL<0b0, 0b00, "fmlallbb", int_aarch64_neon_fp8_fmlallbb>;
+ defm FMLALLBT : SIMDThreeSameVectorMLAL<0b0, 0b01, "fmlallbt", int_aarch64_neon_fp8_fmlallbt>;
+ defm FMLALLTB : SIMDThreeSameVectorMLAL<0b1, 0b00, "fmlalltb", int_aarch64_neon_fp8_fmlalltb>;
+ defm FMLALLTT : SIMDThreeSameVectorMLAL<0b1, 0b01, "fmlalltt", int_aarch64_neon_fp8_fmlalltt>;
 } // End let Predicates = [HasFP8FMA]
 
 let Predicates = [HasFP8DOT2] in {
diff --git a/llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll b/llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll
new file mode 100644
index 000000000000000..008069ff63761f0
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8fma < %s | FileCheck %s
+
+define <8 x half> @test_fmlalb(<8 x half> %d, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_fmlalb:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlalb v0.8h, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %r = call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.v8f16(<8 x half> %d, <16 x i8> %a, <16 x i8> %b)
+  ret <8 x half> %r
+}
+
+define <8 x half> @test_fmlalt(<8 x half> %d, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_fmlalt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlalt v0.8h, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %r = call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.v8f16(<8 x half> %d, <16 x i8> %a, <16 x i8> %b)
+  ret <8 x half> %r
+}
+
+define <4 x float> @test_fmlallbb(<4 x float> %d, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_fmlallbb:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlallbb v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %r = call <4 x float> @llvm.aarch64.neon.fp8.fmlallbb.v4f32(<4 x float> %d, <16 x i8> %a, <16 x i8> %b)
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_fmlallbt(<4 x float> %d, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_fmlallbt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlallbt v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %r = call <4 x float> @llvm.aarch64.neon.fp8.fmlallbt.v4f32(<4 x float> %d, <16 x i8> %a, <16 x i8> %b)
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_fmlalltb(<4 x float> %d, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_fmlalltb:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlalltb v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %r = call <4 x float> @llvm.aarch64.neon.fp8.fmlalltb.v4f32(<4 x float> %d, <16 x i8> %a, <16 x i8> %b)
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_fmlalltt(<4 x float> %d, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_fmlalltt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlalltt v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %r = call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.v4f32(<4 x float> %d, <16 x i8> %a, <16 x i8> %b)
+  ret <4 x float> %r
+}

>From 9749148142b6e2ef5a1bc5737678c681757e1b80 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 3 Jan 2025 13:30:54 +0000
Subject: [PATCH 18/19] [fixup] Update intrinsics definitions
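
Drop the explicit "q" from the VInst base names and let the "Q" type
modifier supply it, matching the FP8 dot-product definitions above;
the generated user-facing intrinsics keep their q-suffixed names.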

---
 clang/include/clang/Basic/arm_neon.td | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 06c4636916bd1a3..e0e342a76a9cd7d 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2167,15 +2167,14 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot4,neon" in {
   def VDOTQ_LANEQ_F32_MF8 : VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
 }
 
-
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8fma,neon" in {
-  def VMLALB_F16_F8 : VInst<"vmlalbq_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
-  def VMLALT_F16_F8 : VInst<"vmlaltq_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+  def VMLALB_F16_F8 : VInst<"vmlalb_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+  def VMLALT_F16_F8 : VInst<"vmlalt_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
 
-  def VMLALLBB_F32_F8 : VInst<"vmlallbbq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
-  def VMLALLBT_F32_F8 : VInst<"vmlallbtq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
-  def VMLALLTB_F32_F8 : VInst<"vmlalltbq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
-  def VMLALLTT_F32_F8 : VInst<"vmlallttq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+  def VMLALLBB_F32_F8 : VInst<"vmlallbb_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+  def VMLALLBT_F32_F8 : VInst<"vmlallbt_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+  def VMLALLTB_F32_F8 : VInst<"vmlalltb_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+  def VMLALLTT_F32_F8 : VInst<"vmlalltt_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
 }
 
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {

>From 2b939e3aee18e3368d6e0734123a1f91edfe91cc Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 14 Jan 2025 10:49:31 +0000
Subject: [PATCH 19/19] [fixup] Remove some opt passes from RUN lines
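
Keep only mem2reg so the checked IR stays closer to what Clang emits:
calls are no longer rewritten into tail calls and argument bitcasts
are retained, making the assertions less sensitive to unrelated
instcombine/tailcallelim changes.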

---
 .../fp8-intrinsics/acle_neon_fp8_fmla.c       | 58 ++++++++++---------
 1 file changed, 31 insertions(+), 27 deletions(-)

diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c
index 528bbd694518653..13006ede59f6a96 100644
--- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c
@@ -1,25 +1,27 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
-// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix CHECK-CXX
+// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX
 
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -S -o /dev/null %s
 
-// REQUIES: aarch64-registered-target
+// REQUIRES: aarch64-registered-target
 
 #include <arm_neon.h>
 
 // CHECK-LABEL: define dso_local <8 x half> @test_vmlalb(
 // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VMLAL1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLAL1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-NEXT:    ret <8 x half> [[VMLAL1_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z11test_vmlalb13__Float16x8_tu14__MFloat8x16_tu14__MFloat8x16_tm(
 // CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VMLAL1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLAL1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-CXX-NEXT:    ret <8 x half> [[VMLAL1_I]]
 //
 float16x8_t test_vmlalb(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
@@ -29,15 +31,17 @@ float16x8_t test_vmlalb(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t
 // CHECK-LABEL: define dso_local <8 x half> @test_vmlalt(
 // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VMLAL1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLAL1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-NEXT:    ret <8 x half> [[VMLAL1_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z11test_vmlalt13__Float16x8_tu14__MFloat8x16_tu14__MFloat8x16_tm(
 // CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VMLAL1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLAL1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-CXX-NEXT:    ret <8 x half> [[VMLAL1_I]]
 //
 float16x8_t test_vmlalt(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
@@ -47,15 +51,15 @@ float16x8_t test_vmlalt(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t
 // CHECK-LABEL: define dso_local <4 x float> @test_vmlallbb(
 // CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fmlallbb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-NEXT:    ret <4 x float> [[VMLALL_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z13test_vmlallbb13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
 // CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fmlallbb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_I]]
 //
 float32x4_t test_vmlallbb(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
@@ -65,15 +69,15 @@ float32x4_t test_vmlallbb(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_
 // CHECK-LABEL: define dso_local <4 x float> @test_vmlallbt(
 // CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fmlallbt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-NEXT:    ret <4 x float> [[VMLALL_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z13test_vmlallbt13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
 // CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fmlallbt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_I]]
 //
 float32x4_t test_vmlallbt(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
@@ -83,15 +87,15 @@ float32x4_t test_vmlallbt(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_
 // CHECK-LABEL: define dso_local <4 x float> @test_vmlalltb(
 // CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fmlalltb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-NEXT:    ret <4 x float> [[VMLALL_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z13test_vmlalltb13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
 // CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fmlalltb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_I]]
 //
 float32x4_t test_vmlalltb(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
@@ -101,15 +105,15 @@ float32x4_t test_vmlalltb(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_
 // CHECK-LABEL: define dso_local <4 x float> @test_vmlalltt(
 // CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-NEXT:    ret <4 x float> [[VMLALL_I]]
 //
 // CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z13test_vmlalltt13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
 // CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
 // CHECK-CXX-NEXT:  [[ENTRY:.*:]]
-// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
-// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_I]]
 //
 float32x4_t test_vmlalltt(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {


