[clang] [llvm] [AArch64] Implement NEON FP8 intrinsics for fused multiply-add (indexed) (PR #120403)

Wed Dec 18 03:00:18 PST 2024

https://github.com/momchil-velikov created https://github.com/llvm/llvm-project/pull/120403

This patch adds the following intrinsics:
    
* Floating-point multiply-add long to half-precision (vector, by element)

        float16x8_t vmlalbq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
        float16x8_t vmlalbq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
        float16x8_t vmlaltq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
        float16x8_t vmlaltq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
    
* Floating-point multiply-add long-long to single-precision (vector, by element)

        float32x4_t vmlallbbq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
        float32x4_t vmlallbbq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
        float32x4_t vmlallbtq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
        float32x4_t vmlallbtq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
        float32x4_t vmlalltbq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
        float32x4_t vmlalltbq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
        float32x4_t vmlallttq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
        float32x4_t vmlallttq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)

>From 1a9c550599b155fbfee3bdbdf0f43bf52dd14b11 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 6 Dec 2024 13:09:23 +0000
Subject: [PATCH 01/11] [AArch64] Refactor implementation of FP8 types (NFC)

* The FP8 scalar type (`__mfp8`) was described as a vector type;
  describe it as a scalar type instead.
* The FP8 vector types were described/assumed to have an integer
  element type; the element type ought to be `__mfp8`.
* Add support for the `m` type specifier (denoting `__mfp8`) in
  `DecodeTypeFromStr` and create SVE builtin prototypes using that
  specifier instead of `int8_t`.
---
 clang/include/clang/AST/Type.h                |  5 +++
 .../clang/Basic/AArch64SVEACLETypes.def       | 24 +++++++++---
 clang/lib/AST/ASTContext.cpp                  | 37 +++++++++++++++----
 clang/lib/AST/ItaniumMangle.cpp               |  5 +++
 clang/lib/AST/Type.cpp                        |  4 +-
 clang/lib/CodeGen/CodeGenTypes.cpp            | 13 +++++--
 clang/lib/CodeGen/Targets/AArch64.cpp         |  7 +++-
 clang/utils/TableGen/SveEmitter.cpp           |  4 +-
 8 files changed, 76 insertions(+), 23 deletions(-)

diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 09c98f642852fc..aa313719a65755 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2518,6 +2518,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
   bool isFloat32Type() const;
   bool isDoubleType() const;
   bool isBFloat16Type() const;
+  bool isMFloat8Type() const;
   bool isFloat128Type() const;
   bool isIbm128Type() const;
   bool isRealType() const;         // C99 6.2.5p17 (real floating + integer)
@@ -8532,6 +8533,10 @@ inline bool Type::isBFloat16Type() const {
   return isSpecificBuiltinType(BuiltinType::BFloat16);
 }
 
+inline bool Type::isMFloat8Type() const {
+  return isSpecificBuiltinType(BuiltinType::MFloat8);
+}
+
 inline bool Type::isFloat128Type() const {
   return isSpecificBuiltinType(BuiltinType::Float128);
 }
diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def
index 063cac1f4a58ee..6b704b386536c9 100644
--- a/clang/include/clang/Basic/AArch64SVEACLETypes.def
+++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def
@@ -57,6 +57,11 @@
 //  - IsBF true for vector of brain float elements.
 //===----------------------------------------------------------------------===//
 
+#ifndef SVE_SCALAR_TYPE
+#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \
+  SVE_TYPE(Name, Id, SingletonId)
+#endif
+
 #ifndef SVE_VECTOR_TYPE
 #define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \
   SVE_TYPE(Name, Id, SingletonId)
@@ -72,6 +77,11 @@
   SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, false, true)
 #endif
 
+#ifndef SVE_VECTOR_TYPE_MFLOAT
+#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \
+  SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, false, false)
+#endif
+
 #ifndef SVE_VECTOR_TYPE_FLOAT
 #define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \
   SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, true, false)
@@ -125,8 +135,7 @@ SVE_VECTOR_TYPE_FLOAT("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty
 
 SVE_VECTOR_TYPE_BFLOAT("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, 1)
 
-// This is a 8 bits opaque type.
-SVE_VECTOR_TYPE_INT("__SVMfloat8_t", "__SVMfloat8_t",  SveMFloat8, SveMFloat8Ty, 16, 8, 1, false)
+SVE_VECTOR_TYPE_MFLOAT("__SVMfloat8_t", "__SVMfloat8_t",  SveMFloat8, SveMFloat8Ty, 16, 8, 1)
 
 //
 // x2
@@ -148,7 +157,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, Sv
 
 SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 8, 16, 2)
 
-SVE_VECTOR_TYPE_INT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, SveMFloat8x2Ty, 16, 8, 2, false)
+SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, SveMFloat8x2Ty, 16, 8, 2)
 
 //
 // x3
@@ -170,7 +179,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, Sv
 
 SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 8, 16, 3)
 
-SVE_VECTOR_TYPE_INT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, SveMFloat8x3Ty, 16, 8, 3, false)
+SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, SveMFloat8x3Ty, 16, 8, 3)
 
 //
 // x4
@@ -192,7 +201,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, Sv
 
 SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 8, 16, 4)
 
-SVE_VECTOR_TYPE_INT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, SveMFloat8x4Ty, 16, 8, 4, false)
+SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, SveMFloat8x4Ty, 16, 8, 4)
 
 SVE_PREDICATE_TYPE_ALL("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16, 1)
 SVE_PREDICATE_TYPE_ALL("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, SveBoolx2Ty, 16, 2)
@@ -200,11 +209,13 @@ SVE_PREDICATE_TYPE_ALL("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4T
 
 SVE_OPAQUE_TYPE("__SVCount_t", "__SVCount_t", SveCount, SveCountTy)
 
-AARCH64_VECTOR_TYPE_MFLOAT("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 1, 8, 1)
+SVE_SCALAR_TYPE("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 8)
+
 AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x8_t", "__MFloat8x8_t", MFloat8x8, MFloat8x8Ty, 8, 8, 1)
 AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x16_t", "__MFloat8x16_t", MFloat8x16, MFloat8x16Ty, 16, 8, 1)
 
 #undef SVE_VECTOR_TYPE
+#undef SVE_VECTOR_TYPE_MFLOAT
 #undef SVE_VECTOR_TYPE_BFLOAT
 #undef SVE_VECTOR_TYPE_FLOAT
 #undef SVE_VECTOR_TYPE_INT
@@ -213,4 +224,5 @@ AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x16_t", "__MFloat8x16_t", MFloat8x16, MFloa
 #undef SVE_OPAQUE_TYPE
 #undef AARCH64_VECTOR_TYPE_MFLOAT
 #undef AARCH64_VECTOR_TYPE
+#undef SVE_SCALAR_TYPE
 #undef SVE_TYPE
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 6ec927e13a7552..904df6f6163bc0 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -2275,6 +2275,11 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
     Width = NumEls * ElBits * NF;                                              \
     Align = NumEls * ElBits;                                                   \
     break;
+#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits)              \
+  case BuiltinType::Id:                                                        \
+    Width = Bits;                                                              \
+    Align = Bits;                                                              \
+    break;
 #include "clang/Basic/AArch64SVEACLETypes.def"
 #define PPC_VECTOR_TYPE(Name, Id, Size)                                        \
   case BuiltinType::Id:                                                        \
@@ -4395,15 +4400,18 @@ ASTContext::getBuiltinVectorTypeInfo(const BuiltinType *Ty) const {
                                ElBits, NF)                                     \
   case BuiltinType::Id:                                                        \
     return {BFloat16Ty, llvm::ElementCount::getScalable(NumEls), NF};
+#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls,     \
+                               ElBits, NF)                                     \
+  case BuiltinType::Id:                                                        \
+    return {MFloat8Ty, llvm::ElementCount::getScalable(NumEls), NF};
 #define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \
   case BuiltinType::Id:                                                        \
     return {BoolTy, llvm::ElementCount::getScalable(NumEls), NF};
 #define AARCH64_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \
                                    ElBits, NF)                                 \
   case BuiltinType::Id:                                                        \
-    return {getIntTypeForBitwidth(ElBits, false),                              \
-            llvm::ElementCount::getFixed(NumEls), NF};
-#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId)
+    return {MFloat8Ty, llvm::ElementCount::getFixed(NumEls), NF};
+#define SVE_TYPE(Name, Id, SingletonId)
 #include "clang/Basic/AArch64SVEACLETypes.def"
 
 #define RVV_VECTOR_TYPE_INT(Name, Id, SingletonId, NumEls, ElBits, NF,         \
@@ -4465,11 +4473,16 @@ QualType ASTContext::getScalableVectorType(QualType EltTy, unsigned NumElts,
       EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) {     \
     return SingletonId;                                                        \
   }
+#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls,     \
+                               ElBits, NF)                                     \
+  if (EltTy->isMFloat8Type() && EltTySize == ElBits &&                         \
+      NumElts == (NumEls * NF) && NumFields == 1) {                            \
+    return SingletonId;                                                        \
+  }
 #define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \
   if (EltTy->isBooleanType() && NumElts == (NumEls * NF) && NumFields == 1)    \
     return SingletonId;
-#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId)
-#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId)
+#define SVE_TYPE(Name, Id, SingletonId)
 #include "clang/Basic/AArch64SVEACLETypes.def"
   } else if (Target->hasRISCVVTypes()) {
     uint64_t EltTySize = getTypeSize(EltTy);
@@ -12177,8 +12190,15 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
                                              RequiresICE, false);
     assert(!RequiresICE && "Can't require vector ICE");
 
-    // TODO: No way to make AltiVec vectors in builtins yet.
-    Type = Context.getVectorType(ElementType, NumElements, VectorKind::Generic);
+    if (ElementType == Context.MFloat8Ty) {
+      assert((NumElements == 8 || NumElements == 16) &&
+             "Invalid number of elements");
+      Type = NumElements == 8 ? Context.MFloat8x8Ty : Context.MFloat8x16Ty;
+    } else {
+      // TODO: No way to make AltiVec vectors in builtins yet.
+      Type =
+          Context.getVectorType(ElementType, NumElements, VectorKind::Generic);
+    }
     break;
   }
   case 'E': {
@@ -12234,6 +12254,9 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context,
   case 'p':
     Type = Context.getProcessIDType();
     break;
+  case 'm':
+    Type = Context.MFloat8Ty;
+    break;
   }
 
   // If there are modifiers and if we're allowed to parse them, go for it.
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 47aa9b40dab845..9404f9fd9b151d 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -3438,6 +3438,11 @@ void CXXNameMangler::mangleType(const BuiltinType *T) {
     type_name = MangledName;                                                   \
     Out << (type_name == Name ? "u" : "") << type_name.size() << type_name;    \
     break;
+#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits)              \
+  case BuiltinType::Id:                                                        \
+    type_name = MangledName;                                                   \
+    Out << (type_name == Name ? "u" : "") << type_name.size() << type_name;    \
+    break;
 #include "clang/Basic/AArch64SVEACLETypes.def"
 #define PPC_VECTOR_TYPE(Name, Id, Size)                                        \
   case BuiltinType::Id:                                                        \
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 976361d07b68bf..dc9df9524457c2 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2527,9 +2527,7 @@ bool Type::isSVESizelessBuiltinType() const {
 #define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId)                 \
   case BuiltinType::Id:                                                        \
     return true;
-#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId)                \
-  case BuiltinType::Id:                                                        \
-    return false;
+#define SVE_TYPE(Name, Id, SingletonId)
 #include "clang/Basic/AArch64SVEACLETypes.def"
     default:
       return false;
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index 09191a4901f493..fd3327cf9acd89 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -507,13 +507,15 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
   case BuiltinType::Id:
 #define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId)                \
   case BuiltinType::Id:
-#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId)
+#define SVE_TYPE(Name, Id, SingletonId)
 #include "clang/Basic/AArch64SVEACLETypes.def"
       {
         ASTContext::BuiltinVectorTypeInfo Info =
             Context.getBuiltinVectorTypeInfo(cast<BuiltinType>(Ty));
-        auto VTy =
-            llvm::VectorType::get(ConvertType(Info.ElementType), Info.EC);
+        auto *EltTy = Info.ElementType->isMFloat8Type()
+                          ? llvm::Type::getInt8Ty(getLLVMContext())
+                          : ConvertType(Info.ElementType);
+        auto *VTy = llvm::VectorType::get(EltTy, Info.EC);
         switch (Info.NumVectors) {
         default:
           llvm_unreachable("Expected 1, 2, 3 or 4 vectors!");
@@ -529,6 +531,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
       }
     case BuiltinType::SveCount:
       return llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount");
+    case BuiltinType::MFloat8:
+      return llvm::VectorType::get(llvm::Type::getInt8Ty(getLLVMContext()), 1,
+                                   false);
 #define PPC_VECTOR_TYPE(Name, Id, Size) \
     case BuiltinType::Id: \
       ResultType = \
@@ -650,6 +655,8 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
     // An ext_vector_type of Bool is really a vector of bits.
     llvm::Type *IRElemTy = VT->isExtVectorBoolType()
                                ? llvm::Type::getInt1Ty(getLLVMContext())
+                           : VT->getElementType()->isMFloat8Type()
+                               ? llvm::Type::getInt8Ty(getLLVMContext())
                                : ConvertType(VT->getElementType());
     ResultType = llvm::FixedVectorType::get(IRElemTy, VT->getNumElements());
     break;
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index be33e26f047841..065c92103823e7 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -243,6 +243,7 @@ AArch64ABIInfo::convertFixedToScalableVectorType(const VectorType *VT) const {
 
     case BuiltinType::SChar:
     case BuiltinType::UChar:
+    case BuiltinType::MFloat8:
       return llvm::ScalableVectorType::get(
           llvm::Type::getInt8Ty(getVMContext()), 16);
 
@@ -761,8 +762,10 @@ bool AArch64ABIInfo::passAsPureScalableType(
       getContext().getBuiltinVectorTypeInfo(cast<BuiltinType>(Ty));
   assert(Info.NumVectors > 0 && Info.NumVectors <= 4 &&
          "Expected 1, 2, 3 or 4 vectors!");
-  auto VTy = llvm::ScalableVectorType::get(CGT.ConvertType(Info.ElementType),
-                                           Info.EC.getKnownMinValue());
+  llvm::Type *EltTy = Info.ElementType->isMFloat8Type()
+                          ? llvm::Type::getInt8Ty(getVMContext())
+                          : CGT.ConvertType(Info.ElementType);
+  auto *VTy = llvm::ScalableVectorType::get(EltTy, Info.EC.getKnownMinValue());
 
   if (CoerceToSeq.size() + Info.NumVectors > 12)
     return false;
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index 14e5637f62517e..8eb68151f0a54f 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -448,7 +448,7 @@ std::string SVEType::builtinBaseType() const {
   case TypeKind::PredicatePattern:
     return "i";
   case TypeKind::Fpm:
-    return "Wi";
+    return "UWi";
   case TypeKind::Predicate:
     return "b";
   case TypeKind::BFloat16:
@@ -456,7 +456,7 @@ std::string SVEType::builtinBaseType() const {
     return "y";
   case TypeKind::MFloat8:
     assert(ElementBitwidth == 8 && "Invalid MFloat8!");
-    return "c";
+    return "m";
   case TypeKind::Float:
     switch (ElementBitwidth) {
     case 16:

>From a5493d04b5b4f82e545e352ce4e808474edf55a7 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 6 Dec 2024 15:44:58 +0000
Subject: [PATCH 02/11] [Clang][AArch64] Allow FP8 Neon vector types to be used
 by __builtin_shufflevector

The Neon vector types for FP8 (`__MFloat8x8_t` and `__MFloat8x16_t`) are
implemented as builtin types and need a special case in
`__builtin_shufflevector`.
---
 clang/include/clang/AST/Type.h                |   4 +
 .../clang/Basic/DiagnosticSemaKinds.td        |   5 +
 clang/lib/AST/Type.cpp                        |  13 ++
 clang/lib/Sema/SemaChecking.cpp               |  39 +++++-
 .../AArch64/builtin-shufflevector-fp8.c       | 123 ++++++++++++++++++
 clang/test/Sema/builtin-shufflevector.c       |  30 +++++
 6 files changed, 208 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
 create mode 100644 clang/test/Sema/builtin-shufflevector.c

diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index aa313719a65755..fbc62f61ad5a55 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -2404,6 +2404,10 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase {
   /// SVE vector or predicate, excluding tuple types such as svint32x4_t.
   bool isSveVLSBuiltinType() const;
 
+  /// Determines if this is a *builtin* NEON vector type, i.e. a type that is
+  /// not built with the `neon_vector_type` attribute.
+  bool isNeonVectorBuiltinType() const;
+
   /// Returns the representative type for the element of an SVE builtin type.
   /// This is used to represent fixed-length SVE vectors created with the
   /// 'arm_sve_vector_bits' type attribute as VectorType.
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index 811265151fa0da..fff0231572ed51 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10545,6 +10545,9 @@ def err_vec_builtin_incompatible_vector : Error<
 def err_vsx_builtin_nonconstant_argument : Error<
   "argument %0 to %1 must be a 2-bit unsigned literal (i.e. 0, 1, 2 or 3)">;
 
+def err_shufflevector_incompatible_index_vector : Error<
+  "second argument for __builtin_shufflevector must be integer vector "
+  "with length equal to the length of the first argument">;
 def err_shufflevector_nonconstant_argument : Error<
   "index for __builtin_shufflevector must be a constant integer">;
 def err_shufflevector_argument_too_large : Error<
@@ -10552,6 +10555,8 @@ def err_shufflevector_argument_too_large : Error<
   "of vector elements">;
 def err_shufflevector_minus_one_is_undefined_behavior_constexpr : Error<
   "index for __builtin_shufflevector not within the bounds of the input vectors; index of -1 found at position %0 is not permitted in a constexpr context">;
+def err_shufflevector_unsupported_result_vector_type : Error<
+  "unsupported vector type for the result">;
 
 def err_convertvector_non_vector : Error<
   "first argument to __builtin_convertvector must be a vector">;
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index dc9df9524457c2..1c21fad75253d8 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2576,6 +2576,19 @@ bool Type::isSveVLSBuiltinType() const {
   return false;
 }
 
+bool Type::isNeonVectorBuiltinType() const {
+  if (const BuiltinType *BT = getAs<BuiltinType>()) {
+    switch (BT->getKind()) {
+    case BuiltinType::MFloat8x8:
+    case BuiltinType::MFloat8x16:
+      return true;
+    default:
+      return false;
+    }
+  }
+  return false;
+}
+
 QualType Type::getSizelessVectorEltType(const ASTContext &Ctx) const {
   assert(isSizelessVectorType() && "Must be sizeless vector type");
   // Currently supports SVE and RVV
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index a248a6b53b0d06..02bd09bda08fce 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5127,24 +5127,32 @@ ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) {
     QualType LHSType = TheCall->getArg(0)->getType();
     QualType RHSType = TheCall->getArg(1)->getType();
 
-    if (!LHSType->isVectorType() || !RHSType->isVectorType())
+    if (!LHSType->isVectorType() && !LHSType->isNeonVectorBuiltinType())
       return ExprError(
-          Diag(TheCall->getBeginLoc(), diag::err_vec_builtin_non_vector)
-          << TheCall->getDirectCallee() << /*isMorethantwoArgs*/ false
+          Diag(TheCall->getBeginLoc(), diag::err_builtin_non_vector_type)
+          << "first" << TheCall->getDirectCallee()
+          << /*isMorethantwoArgs*/ false
           << SourceRange(TheCall->getArg(0)->getBeginLoc(),
                          TheCall->getArg(1)->getEndLoc()));
 
-    numElements = LHSType->castAs<VectorType>()->getNumElements();
+    if (auto *Ty = LHSType->getAs<BuiltinType>()) {
+      assert(Ty->getKind() == BuiltinType::MFloat8x8 ||
+             Ty->getKind() == BuiltinType::MFloat8x16);
+      numElements = Ty->getKind() == BuiltinType::MFloat8x8 ? 8 : 16;
+    } else {
+      numElements = LHSType->castAs<VectorType>()->getNumElements();
+    }
+
     unsigned numResElements = TheCall->getNumArgs() - 2;
 
     // Check to see if we have a call with 2 vector arguments, the unary shuffle
     // with mask.  If so, verify that RHS is an integer vector type with the
     // same number of elts as lhs.
     if (TheCall->getNumArgs() == 2) {
-      if (!RHSType->hasIntegerRepresentation() ||
+      if (!RHSType->isVectorType() || !RHSType->hasIntegerRepresentation() ||
           RHSType->castAs<VectorType>()->getNumElements() != numElements)
         return ExprError(Diag(TheCall->getBeginLoc(),
-                              diag::err_vec_builtin_incompatible_vector)
+                              diag::err_shufflevector_incompatible_index_vector)
                          << TheCall->getDirectCallee()
                          << /*isMorethantwoArgs*/ false
                          << SourceRange(TheCall->getArg(1)->getBeginLoc(),
@@ -5157,6 +5165,25 @@ ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) {
                        << SourceRange(TheCall->getArg(0)->getBeginLoc(),
                                       TheCall->getArg(1)->getEndLoc()));
     } else if (numElements != numResElements) {
+      if (auto *Ty = LHSType->getAs<BuiltinType>()) {
+        assert(Ty->getKind() == BuiltinType::MFloat8x8 ||
+               Ty->getKind() == BuiltinType::MFloat8x16);
+        switch (numResElements) {
+        case 8:
+          resType = Context.MFloat8x8Ty;
+          break;
+        case 16:
+          resType = Context.MFloat8x16Ty;
+          break;
+        default:
+          return ExprError(Diag(TheCall->getBeginLoc(),
+                                diag::err_shufflevector_unsupported_result_vector_type)
+                           << TheCall->getDirectCallee()
+                           << /*isMorethantwoArgs*/ false
+                           << SourceRange(TheCall->getArg(0)->getBeginLoc(),
+                                          TheCall->getArg(1)->getEndLoc()));
+        }
+      }
       QualType eltType = LHSType->castAs<VectorType>()->getElementType();
       resType =
           Context.getVectorType(eltType, numResElements, VectorKind::Generic);
diff --git a/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
new file mode 100644
index 00000000000000..45ea8127509537
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
@@ -0,0 +1,123 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-linux -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+typedef __attribute__((neon_vector_type(8))) signed char int8x8_t;
+typedef __attribute__((neon_vector_type(16))) signed char int8x16_t;
+
+typedef __MFloat8x8_t mfloat8x8_t;
+typedef __MFloat8x16_t mfloat8x16_t;
+
+// CHECK-LABEL: define dso_local <8 x i8> @f0(
+// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[X]], <8 x i8> [[X]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT:    ret <8 x i8> [[SHUFFLE]]
+//
+mfloat8x8_t f0(mfloat8x8_t x) {
+  return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @f1(
+// CHECK-SAME: <8 x i8> [[X:%.*]], <8 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[MASK:%.*]] = and <8 x i8> [[P]], splat (i8 7)
+// CHECK-NEXT:    [[SHUF_IDX:%.*]] = extractelement <8 x i8> [[MASK]], i64 0
+// CHECK-NEXT:    [[SHUF_ELT:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX]]
+// CHECK-NEXT:    [[SHUF_INS:%.*]] = insertelement <8 x i8> poison, i8 [[SHUF_ELT]], i64 0
+// CHECK-NEXT:    [[SHUF_IDX1:%.*]] = extractelement <8 x i8> [[MASK]], i64 1
+// CHECK-NEXT:    [[SHUF_ELT2:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX1]]
+// CHECK-NEXT:    [[SHUF_INS3:%.*]] = insertelement <8 x i8> [[SHUF_INS]], i8 [[SHUF_ELT2]], i64 1
+// CHECK-NEXT:    [[SHUF_IDX4:%.*]] = extractelement <8 x i8> [[MASK]], i64 2
+// CHECK-NEXT:    [[SHUF_ELT5:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX4]]
+// CHECK-NEXT:    [[SHUF_INS6:%.*]] = insertelement <8 x i8> [[SHUF_INS3]], i8 [[SHUF_ELT5]], i64 2
+// CHECK-NEXT:    [[SHUF_IDX7:%.*]] = extractelement <8 x i8> [[MASK]], i64 3
+// CHECK-NEXT:    [[SHUF_ELT8:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX7]]
+// CHECK-NEXT:    [[SHUF_INS9:%.*]] = insertelement <8 x i8> [[SHUF_INS6]], i8 [[SHUF_ELT8]], i64 3
+// CHECK-NEXT:    [[SHUF_IDX10:%.*]] = extractelement <8 x i8> [[MASK]], i64 4
+// CHECK-NEXT:    [[SHUF_ELT11:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX10]]
+// CHECK-NEXT:    [[SHUF_INS12:%.*]] = insertelement <8 x i8> [[SHUF_INS9]], i8 [[SHUF_ELT11]], i64 4
+// CHECK-NEXT:    [[SHUF_IDX13:%.*]] = extractelement <8 x i8> [[MASK]], i64 5
+// CHECK-NEXT:    [[SHUF_ELT14:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX13]]
+// CHECK-NEXT:    [[SHUF_INS15:%.*]] = insertelement <8 x i8> [[SHUF_INS12]], i8 [[SHUF_ELT14]], i64 5
+// CHECK-NEXT:    [[SHUF_IDX16:%.*]] = extractelement <8 x i8> [[MASK]], i64 6
+// CHECK-NEXT:    [[SHUF_ELT17:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX16]]
+// CHECK-NEXT:    [[SHUF_INS18:%.*]] = insertelement <8 x i8> [[SHUF_INS15]], i8 [[SHUF_ELT17]], i64 6
+// CHECK-NEXT:    [[SHUF_IDX19:%.*]] = extractelement <8 x i8> [[MASK]], i64 7
+// CHECK-NEXT:    [[SHUF_ELT20:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX19]]
+// CHECK-NEXT:    [[SHUF_INS21:%.*]] = insertelement <8 x i8> [[SHUF_INS18]], i8 [[SHUF_ELT20]], i64 7
+// CHECK-NEXT:    ret <8 x i8> [[SHUF_INS21]]
+//
+mfloat8x8_t f1(mfloat8x8_t x, int8x8_t p) {
+  return __builtin_shufflevector(x, p);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @f3(
+// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[X]], <16 x i8> [[X]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT:    ret <16 x i8> [[SHUFFLE]]
+//
+mfloat8x16_t f3(mfloat8x16_t x) {
+  return __builtin_shufflevector(x, x, 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2,
+                                 1, 0);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @f4(
+// CHECK-SAME: <16 x i8> [[X:%.*]], <16 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[MASK:%.*]] = and <16 x i8> [[P]], splat (i8 15)
+// CHECK-NEXT:    [[SHUF_IDX:%.*]] = extractelement <16 x i8> [[MASK]], i64 0
+// CHECK-NEXT:    [[SHUF_ELT:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX]]
+// CHECK-NEXT:    [[SHUF_INS:%.*]] = insertelement <16 x i8> poison, i8 [[SHUF_ELT]], i64 0
+// CHECK-NEXT:    [[SHUF_IDX1:%.*]] = extractelement <16 x i8> [[MASK]], i64 1
+// CHECK-NEXT:    [[SHUF_ELT2:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX1]]
+// CHECK-NEXT:    [[SHUF_INS3:%.*]] = insertelement <16 x i8> [[SHUF_INS]], i8 [[SHUF_ELT2]], i64 1
+// CHECK-NEXT:    [[SHUF_IDX4:%.*]] = extractelement <16 x i8> [[MASK]], i64 2
+// CHECK-NEXT:    [[SHUF_ELT5:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX4]]
+// CHECK-NEXT:    [[SHUF_INS6:%.*]] = insertelement <16 x i8> [[SHUF_INS3]], i8 [[SHUF_ELT5]], i64 2
+// CHECK-NEXT:    [[SHUF_IDX7:%.*]] = extractelement <16 x i8> [[MASK]], i64 3
+// CHECK-NEXT:    [[SHUF_ELT8:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX7]]
+// CHECK-NEXT:    [[SHUF_INS9:%.*]] = insertelement <16 x i8> [[SHUF_INS6]], i8 [[SHUF_ELT8]], i64 3
+// CHECK-NEXT:    [[SHUF_IDX10:%.*]] = extractelement <16 x i8> [[MASK]], i64 4
+// CHECK-NEXT:    [[SHUF_ELT11:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX10]]
+// CHECK-NEXT:    [[SHUF_INS12:%.*]] = insertelement <16 x i8> [[SHUF_INS9]], i8 [[SHUF_ELT11]], i64 4
+// CHECK-NEXT:    [[SHUF_IDX13:%.*]] = extractelement <16 x i8> [[MASK]], i64 5
+// CHECK-NEXT:    [[SHUF_ELT14:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX13]]
+// CHECK-NEXT:    [[SHUF_INS15:%.*]] = insertelement <16 x i8> [[SHUF_INS12]], i8 [[SHUF_ELT14]], i64 5
+// CHECK-NEXT:    [[SHUF_IDX16:%.*]] = extractelement <16 x i8> [[MASK]], i64 6
+// CHECK-NEXT:    [[SHUF_ELT17:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX16]]
+// CHECK-NEXT:    [[SHUF_INS18:%.*]] = insertelement <16 x i8> [[SHUF_INS15]], i8 [[SHUF_ELT17]], i64 6
+// CHECK-NEXT:    [[SHUF_IDX19:%.*]] = extractelement <16 x i8> [[MASK]], i64 7
+// CHECK-NEXT:    [[SHUF_ELT20:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX19]]
+// CHECK-NEXT:    [[SHUF_INS21:%.*]] = insertelement <16 x i8> [[SHUF_INS18]], i8 [[SHUF_ELT20]], i64 7
+// CHECK-NEXT:    [[SHUF_IDX22:%.*]] = extractelement <16 x i8> [[MASK]], i64 8
+// CHECK-NEXT:    [[SHUF_ELT23:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX22]]
+// CHECK-NEXT:    [[SHUF_INS24:%.*]] = insertelement <16 x i8> [[SHUF_INS21]], i8 [[SHUF_ELT23]], i64 8
+// CHECK-NEXT:    [[SHUF_IDX25:%.*]] = extractelement <16 x i8> [[MASK]], i64 9
+// CHECK-NEXT:    [[SHUF_ELT26:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX25]]
+// CHECK-NEXT:    [[SHUF_INS27:%.*]] = insertelement <16 x i8> [[SHUF_INS24]], i8 [[SHUF_ELT26]], i64 9
+// CHECK-NEXT:    [[SHUF_IDX28:%.*]] = extractelement <16 x i8> [[MASK]], i64 10
+// CHECK-NEXT:    [[SHUF_ELT29:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX28]]
+// CHECK-NEXT:    [[SHUF_INS30:%.*]] = insertelement <16 x i8> [[SHUF_INS27]], i8 [[SHUF_ELT29]], i64 10
+// CHECK-NEXT:    [[SHUF_IDX31:%.*]] = extractelement <16 x i8> [[MASK]], i64 11
+// CHECK-NEXT:    [[SHUF_ELT32:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX31]]
+// CHECK-NEXT:    [[SHUF_INS33:%.*]] = insertelement <16 x i8> [[SHUF_INS30]], i8 [[SHUF_ELT32]], i64 11
+// CHECK-NEXT:    [[SHUF_IDX34:%.*]] = extractelement <16 x i8> [[MASK]], i64 12
+// CHECK-NEXT:    [[SHUF_ELT35:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX34]]
+// CHECK-NEXT:    [[SHUF_INS36:%.*]] = insertelement <16 x i8> [[SHUF_INS33]], i8 [[SHUF_ELT35]], i64 12
+// CHECK-NEXT:    [[SHUF_IDX37:%.*]] = extractelement <16 x i8> [[MASK]], i64 13
+// CHECK-NEXT:    [[SHUF_ELT38:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX37]]
+// CHECK-NEXT:    [[SHUF_INS39:%.*]] = insertelement <16 x i8> [[SHUF_INS36]], i8 [[SHUF_ELT38]], i64 13
+// CHECK-NEXT:    [[SHUF_IDX40:%.*]] = extractelement <16 x i8> [[MASK]], i64 14
+// CHECK-NEXT:    [[SHUF_ELT41:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX40]]
+// CHECK-NEXT:    [[SHUF_INS42:%.*]] = insertelement <16 x i8> [[SHUF_INS39]], i8 [[SHUF_ELT41]], i64 14
+// CHECK-NEXT:    [[SHUF_IDX43:%.*]] = extractelement <16 x i8> [[MASK]], i64 15
+// CHECK-NEXT:    [[SHUF_ELT44:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX43]]
+// CHECK-NEXT:    [[SHUF_INS45:%.*]] = insertelement <16 x i8> [[SHUF_INS42]], i8 [[SHUF_ELT44]], i64 15
+// CHECK-NEXT:    ret <16 x i8> [[SHUF_INS45]]
+//
+mfloat8x16_t f4(mfloat8x16_t x, int8x16_t p) {
+  return __builtin_shufflevector(x, p);
+}
diff --git a/clang/test/Sema/builtin-shufflevector.c b/clang/test/Sema/builtin-shufflevector.c
new file mode 100644
index 00000000000000..c2dabb9d6585a2
--- /dev/null
+++ b/clang/test/Sema/builtin-shufflevector.c
@@ -0,0 +1,30 @@
+// RUN: %clang_cc1 -triple aarch64 -fsyntax-only -verify %s
+
+// REQUIRES: aarch64-registered-target
+
+typedef __attribute__((neon_vector_type(8))) signed char int8x8_t;
+typedef __attribute__((neon_vector_type(16))) signed char int8x16_t;
+
+typedef __MFloat8x8_t mfloat8x8_t;
+typedef __MFloat8x16_t mfloat8x16_t;
+
+int8x8_t non_vector(int x) {
+  return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);
+  // expected-error@-1 {{first argument to '__builtin_shufflevector' must be of vector type}}
+}
+
+mfloat8x8_t unsuported_vector(mfloat8x8_t x) {
+  return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0, 0);
+  // expected-error@-1 {{unsupported vector type for the result}}
+}
+
+int8x8_t non_vector_index(int8x8_t x, int p) {
+  return __builtin_shufflevector(x, p);
+  // expected-error@-1 {{second argument for __builtin_shufflevector must be integer vector with length equal to the length of the first argument}}
+}
+
+int8x8_t bad_vector_index_length(int8x8_t x, int8x16_t p) {
+  return __builtin_shufflevector(x, p);
+  // expected-error@-1 {{second argument for __builtin_shufflevector must be integer vector with length equal to the length of the first argument}}
+}
+

>From c1493619a0e514678fe5ecc42ad5fdde5680efd5 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Mon, 16 Dec 2024 09:58:22 +0000
Subject: [PATCH 03/11] [fixup] Fix formatting (NFC)

---
 clang/lib/Sema/SemaChecking.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 02bd09bda08fce..de57e7d6efaca5 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5176,12 +5176,12 @@ ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) {
           resType = Context.MFloat8x16Ty;
           break;
         default:
-          return ExprError(Diag(TheCall->getBeginLoc(),
-                                diag::err_shufflevector_unsupported_result_vector_type)
-                           << TheCall->getDirectCallee()
-                           << /*isMorethantwoArgs*/ false
-                           << SourceRange(TheCall->getArg(0)->getBeginLoc(),
-                                          TheCall->getArg(1)->getEndLoc()));
+          return ExprError(
+              Diag(TheCall->getBeginLoc(),
+                   diag::err_shufflevector_unsupported_result_vector_type)
+              << TheCall->getDirectCallee() << /*isMorethantwoArgs*/ false
+              << SourceRange(TheCall->getArg(0)->getBeginLoc(),
+                             TheCall->getArg(1)->getEndLoc()));
         }
       }
       QualType eltType = LHSType->castAs<VectorType>()->getElementType();

>From 4c9e30a3af35b7ee94dcd82912756081df343480 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Fri, 6 Dec 2024 19:24:16 +0000
Subject: [PATCH 04/11] [AArch64] Add Neon FP8 conversion intrinsics

---
 clang/include/clang/Basic/arm_neon.td         |  25 +++-
 clang/include/clang/Basic/arm_neon_incl.td    |   2 +
 clang/lib/CodeGen/CGBuiltin.cpp               | 120 +++++++++++++++++-
 clang/lib/CodeGen/CodeGenFunction.h           |   3 +
 clang/utils/TableGen/NeonEmitter.cpp          |  24 +++-
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  22 ++++
 .../lib/Target/AArch64/AArch64InstrFormats.td |  46 +++++--
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  14 +-
 8 files changed, 232 insertions(+), 24 deletions(-)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ef89fa4358dfeb..cc161c51df6608 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2125,6 +2125,29 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in {
   }
 }
 
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,bf16,neon" in {
+  def VBF1CVT_BF16_MF8        : VInst<"vcvt1_bf16_mf8_fpm",      "(QB).V", "m">;
+  def VBF1CVT_LOW_BF16_MF8    : VInst<"vcvt1_low_bf16_mf8_fpm",  "B.V",    "Qm">;
+  def VBF2CVTL_BF16_MF8       : VInst<"vcvt2_bf16_mf8_fpm",      "(QB).V", "m">;
+  def VBF2CVTL_LOW_BF16_MF8   : VInst<"vcvt2_low_bf16_mf8_fpm",  "B.V",    "Qm">;
+  def VBF1CVTL2_HIGH_BF16_MF8 : VInst<"vcvt1_high_bf16_mf8_fpm", "B.V",    "Qm">;
+  def VBF2CVTL2_HIGH_BF16_MF8 : VInst<"vcvt2_high_bf16_mf8_fpm", "B.V",    "Qm">;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
+  def VF1CVT_F16_MF8        : VInst<"vcvt1_f16_mf8_fpm",      "(>QF).V", "m">;
+  def VF1CVT_LOW_F16_MF8    : VInst<"vcvt1_low_f16_mf8_fpm",  "(>F).V",  "Qm">;
+  def VF2CVTL_F16_MF8       : VInst<"vcvt2_f16_mf8_fpm",      "(>QF).V", "m">;
+  def VF2CVTL_LOW_F16_MF8   : VInst<"vcvt2_low_f16_mf8_fpm",  "(>F).V",  "Qm">;
+  def VF1CVTL2_HIGH_F16_MF8 : VInst<"vcvt1_high_f16_mf8_fpm", "(>F).V",  "Qm">;
+  def VF2CVTL2_HIGH_F16_MF8 : VInst<"vcvt2_high_f16_mf8_fpm", "(>F).V",  "Qm">;
+
+  def VCVTN_LOW_F8_F32  : VInst<"vcvt_mf8_f32_fpm",      ".(>>QF)(>>QF)V",  "m">;
+  def VCVTN_HIGH_F8_F32 : VInst<"vcvt_high_mf8_f32_fpm", ".(q)(>>F)(>>F)V", "Qm">;
+  def VCVTN_F8_F16      : VInst<"vcvt_mf8_f16_fpm",      ".(>F)(>F)V",      "m">;
+  def VCVTNQ_F8_F16     : VInst<"vcvtq_mf8_f16_fpm",     ".(>F)(>F)V",      "Qm">;
+}
+
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
   def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
   def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;
@@ -2134,4 +2157,4 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
   // fscale
   def FSCALE_V128 : WInst<"vscale", "..(.S)", "QdQfQh">;
   def FSCALE_V64 : WInst<"vscale", "(.q)(.q)(.qS)", "fh">;
-}
\ No newline at end of file
+}
diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td
index fd800e5a6278e4..91a2bf3020b9a3 100644
--- a/clang/include/clang/Basic/arm_neon_incl.td
+++ b/clang/include/clang/Basic/arm_neon_incl.td
@@ -243,6 +243,7 @@ def OP_UNAVAILABLE : Operation {
 // B: change to BFloat16
 // P: change to polynomial category.
 // p: change polynomial to equivalent integer category. Otherwise nop.
+// V: change to fpm_t
 //
 // >: double element width (vector size unchanged).
 // <: half element width (vector size unchanged).
@@ -301,6 +302,7 @@ class Inst <string n, string p, string t, Operation o, list<ImmCheck> ch = []>{
 class SInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
 class IInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
 class WInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
+class VInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
 
 // The following instruction classes are implemented via operators
 // instead of builtins. As such these declarations are only used for
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 84048a4beac2c5..3f46e7e8c75e93 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6868,6 +6868,13 @@ Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
     return Builder.CreateCall(F, Ops, name);
 }
 
+Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
+                                        SmallVectorImpl<Value *> &Ops,
+                                        Value *FPM, const char *name) {
+  Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), FPM);
+  return EmitNeonCall(F, Ops, name);
+}
+
 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
                                             bool neg) {
   int SV = cast<ConstantInt>(V)->getSExtValue();
@@ -14011,7 +14018,118 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
   }
-
+  case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_cvtl1;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8);
+    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower part of
+    // the vector.
+    if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm) {
+      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+                                               /*isQuad*/ false));
+      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+    } else
+      Tys[1] = Ops[0]->getType();
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1");
+  }
+  case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_cvtl2;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8);
+    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
+    //  part of the vector.
+    if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm) {
+      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+                                               /*isQuad*/ false));
+      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+    } else
+      Tys[1] = Ops[0]->getType();
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2");
+  }
+  case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_cvtl1;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
+    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
+    //  part of the vector.
+    if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm) {
+      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+                                               /*isQuad*/ false));
+      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+    } else
+      Tys[1] = Ops[0]->getType();
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1");
+  }
+  case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_cvtl2;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
+    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
+    //  part of the vector.
+    if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm) {
+      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
+                                               /*isQuad*/ false));
+      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+    } else
+      Tys[1] = Ops[0]->getType();
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2");
+  }
+  case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_fcvtn;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8);
+    Tys[1] = Ops[0]->getType();
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
+  }
+  case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_fcvtn;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8);
+    // Gets the expected type, because arm_neon.h casts float16x4_t to int8x8_t
+    Tys[1] = llvm::FixedVectorType::get(HalfTy, 4);
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
+  }
+  case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_fcvtn;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16);
+    // Gets the expected type, because arm_neon.h casts float16x8_t to int8x16_t
+    Tys[1] = llvm::FixedVectorType::get(HalfTy, 8);
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
+  }
+  case NEON::BI__builtin_neon_vcvt_high_mf8_f32_fpm: {
+    Int = Intrinsic::aarch64_neon_fp8_fcvtn2;
+    llvm::Type *Tys[2];
+    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16);
+    Tys[1] = Ops[1]->getType();
+    Ops[0] = Builder.CreateInsertVector(Tys[0], PoisonValue::get(Tys[0]),
+                                        Ops[0], Builder.getInt64(0));
+    llvm::Value *FPM =
+        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn2");
+  }
   case NEON::BI__builtin_neon_vamin_f16:
   case NEON::BI__builtin_neon_vaminq_f16:
   case NEON::BI__builtin_neon_vamin_f32:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 092d55355a0a17..5924ad218a5292 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4648,6 +4648,9 @@ class CodeGenFunction : public CodeGenTypeCache {
                             SmallVectorImpl<llvm::Value*> &O,
                             const char *name,
                             unsigned shift = 0, bool rightshift = false);
+  llvm::Value *EmitFP8NeonCall(llvm::Function *F,
+                               SmallVectorImpl<llvm::Value *> &O,
+                               llvm::Value *FPM, const char *name);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
                              const llvm::ElementCount &Count);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index d7d649dd2456d5..dd8eb98aff4a75 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -74,6 +74,7 @@ enum ClassKind {
   ClassI,     // generic integer instruction, e.g., "i8" suffix
   ClassS,     // signed/unsigned/poly, e.g., "s8", "u8" or "p8" suffix
   ClassW,     // width-specific instruction, e.g., "8" suffix
+  ClassV,     // void-suffix instruction, no suffix
   ClassB,     // bitcast arguments with enum argument to specify type
   ClassL,     // Logical instructions which are op instructions
               // but we need to not emit any suffix for in our
@@ -144,7 +145,7 @@ class Type {
 private:
   TypeSpec TS;
 
-  enum TypeKind { Void, Float, SInt, UInt, Poly, BFloat16, MFloat8 };
+  enum TypeKind { Void, Float, SInt, UInt, Poly, BFloat16, MFloat8, FPM };
   TypeKind Kind;
   bool Immediate, Constant, Pointer;
   // ScalarForMangling and NoManglingQ are really not suited to live here as
@@ -198,6 +199,7 @@ class Type {
   bool isVoid() const { return Kind == Void; }
   bool isBFloat16() const { return Kind == BFloat16; }
   bool isMFloat8() const { return Kind == MFloat8; }
+  bool isFPM() const { return Kind == FPM; }
   unsigned getNumElements() const { return Bitwidth / ElementBitwidth; }
   unsigned getSizeInBits() const { return Bitwidth; }
   unsigned getElementSizeInBits() const { return ElementBitwidth; }
@@ -600,6 +602,7 @@ class NeonEmitter {
     const Record *SI = R.getClass("SInst");
     const Record *II = R.getClass("IInst");
     const Record *WI = R.getClass("WInst");
+    const Record *VI = R.getClass("VInst");
     const Record *SOpI = R.getClass("SOpInst");
     const Record *IOpI = R.getClass("IOpInst");
     const Record *WOpI = R.getClass("WOpInst");
@@ -609,6 +612,7 @@ class NeonEmitter {
     ClassMap[SI] = ClassS;
     ClassMap[II] = ClassI;
     ClassMap[WI] = ClassW;
+    ClassMap[VI] = ClassV;
     ClassMap[SOpI] = ClassS;
     ClassMap[IOpI] = ClassI;
     ClassMap[WOpI] = ClassW;
@@ -641,6 +645,9 @@ class NeonEmitter {
 std::string Type::str() const {
   if (isVoid())
     return "void";
+  if (isFPM())
+    return "fpm_t";
+
   std::string S;
 
   if (isInteger() && !isSigned())
@@ -699,6 +706,8 @@ std::string Type::builtin_str() const {
   } else if (isMFloat8()) {
     assert(ElementBitwidth == 8 && "MFloat8 can only be 8 bits");
     S += "m";
+  } else if (isFPM()) {
+    S += "UWi";
   } else
     switch (ElementBitwidth) {
     case 16: S += "h"; break;
@@ -888,6 +897,7 @@ void Type::applyTypespec(bool &Quad) {
     case 'm':
       Kind = MFloat8;
       ElementBitwidth = 8;
+      NoManglingQ = true;
       break;
     default:
       llvm_unreachable("Unhandled type code!");
@@ -925,6 +935,13 @@ void Type::applyModifiers(StringRef Mods) {
     case 'P':
       Kind = Poly;
       break;
+    case 'V':
+      Kind = FPM;
+      Bitwidth = ElementBitwidth = 64;
+      NumVectors = 0;
+      Immediate = Constant = Pointer = false;
+      ScalarForMangling = NoManglingQ = true;
+      break;
     case '>':
       assert(ElementBitwidth < 128);
       ElementBitwidth *= 2;
@@ -1000,6 +1017,9 @@ std::string Intrinsic::getInstTypeCode(Type T, ClassKind CK) const {
   if (CK == ClassB && TargetGuard == "neon")
     return "";
 
+  if (this->CK == ClassV)
+    return "";
+
   if (T.isBFloat16())
     return "bf16";
 
@@ -1349,7 +1369,7 @@ void Intrinsic::emitBodyAsBuiltinCall() {
   if (!protoHasScalar())
     LocalCK = ClassB;
 
-  if (!getReturnType().isVoid() && !SRet)
+  if (!getReturnType().isVoid() && !SRet && !getReturnType().isMFloat8())
     S += "(" + RetVar.getType().str() + ") ";
 
   S += "__builtin_neon_" + mangleName(std::string(N), LocalCK) + "(";
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index eeecc5bb75cc1e..2dcacce389ef81 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1004,6 +1004,28 @@ def int_aarch64_st64b: Intrinsic<[], !listconcat([llvm_ptr_ty], data512)>;
 def int_aarch64_st64bv: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], data512)>;
 def int_aarch64_st64bv0: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], data512)>;
 
+  //
+  // Neon FP8 intrinsics
+  //
+
+  // Conversions
+  class AdvSIMD_FP8_1VectorArg_Long_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrReadMem, IntrInaccessibleMemOnly]>;
+
+  def int_aarch64_neon_fp8_cvtl1   : AdvSIMD_FP8_1VectorArg_Long_Intrinsic;
+  def int_aarch64_neon_fp8_cvtl2   : AdvSIMD_FP8_1VectorArg_Long_Intrinsic;
+
+  def int_aarch64_neon_fp8_fcvtn
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [llvm_anyvector_ty,
+                             LLVMMatchType<1>],
+                            [IntrReadMem, IntrInaccessibleMemOnly]>;
+  def int_aarch64_neon_fp8_fcvtn2
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMMatchType<0>,
+                             llvm_anyvector_ty,
+                             LLVMMatchType<1>],
+                            [IntrReadMem, IntrInaccessibleMemOnly]>;
 }
 
 def llvm_nxv1i1_ty  : LLVMType<nxv1i1>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 56ff7b0d3a280d..df224b580c6e5f 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6551,17 +6551,30 @@ class BaseSIMDThreeVectors<bit Q, bit U, bits<2> size, bits<4> op,
 
 
 // FCVTN (FP16 to FP8)
-multiclass SIMDThreeSameSizeVectorCvt<string asm> {
-   def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b01, 0b1110, V64, V64, asm, ".8b",".4h">;
-   def v16f8 : BaseSIMDThreeVectors<0b1, 0b0, 0b01, 0b1110,  V128, V128, asm, ".16b", ".8h">;
+multiclass SIMD_FP8_CVTN_F16<string asm, SDPatternOperator Op> {
+  let Uses = [FPMR, FPCR], mayLoad = 1 in {
+    def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b01, 0b1110, V64, V64, asm, ".8b",".4h">;
+    def v16f8 : BaseSIMDThreeVectors<0b1, 0b0, 0b01, 0b1110,  V128, V128, asm, ".16b", ".8h">;
+  }
+  def : Pat<(v8i8 (Op (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
+            (!cast<Instruction>(NAME # v8f8) V64:$Rn, V64:$Rm)>;
+  def : Pat<(v16i8 (Op (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
+            (!cast<Instruction>(NAME # v16f8) V128:$Rn, V128:$Rm)>;
 }
 
-// TODO : Create v16f8 value type
 // FCVTN, FCVTN2 (FP32 to FP8)
-multiclass SIMDThreeVectorCvt<string asm> {
-   def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b00, 0b1110, V64, V128, asm, ".8b", ".4s">;
-   def 2v16f8 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1110, asm#2, ".16b", ".4s",
-                                           V128, v16i8, v4f32, null_frag>;
+multiclass SIMD_FP8_CVTN_F32<string asm, SDPatternOperator Op> {
+  let Uses = [FPMR, FPCR], mayLoad = 1 in {
+    def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b00, 0b1110, V64, V128, asm, ".8b", ".4s">;
+    def 2v16f8 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1110, asm#2, ".16b", ".4s",
+                                            V128, v16i8, v4f32, null_frag>;
+  }
+
+  def : Pat<(v8i8 (Op (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
+            (!cast<Instruction>(NAME # v8f8) V128:$Rn, V128:$Rm)>;
+
+  def : Pat<(v16i8 (!cast<SDPatternOperator>(Op # 2) (v16i8 V128:$_Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm))),
+            (!cast<Instruction>(NAME # 2v16f8) V128:$_Rd, V128:$Rn, V128:$Rm)>;
 }
 
 // TODO: Create a new Value Type v8f8 and v16f8
@@ -7025,11 +7038,18 @@ multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
 //----------------------------------------------------------------------------
 // FP8 Advanced SIMD two-register miscellaneous
 //----------------------------------------------------------------------------
-multiclass SIMDMixedTwoVectorFP8<bits<2>sz, string asm> {
-  def v8f16 : BaseSIMDMixedTwoVector<0b0, 0b1, sz, 0b10111, V64, V128,
-                                     asm, ".8h", ".8b", []>;
-  def 2v8f16 : BaseSIMDMixedTwoVector<0b1, 0b1, sz, 0b10111, V128, V128,
-                                     asm#2, ".8h", ".16b", []>;
+multiclass SIMD_FP8_CVTL<bits<2>sz, string asm, ValueType dty, SDPatternOperator Op> {
+  let Uses=[FPMR, FPCR], mayLoad = 1 in {
+    def NAME : BaseSIMDMixedTwoVector<0b0, 0b1, sz, 0b10111, V64, V128,
+                                      asm, ".8h", ".8b", []>;
+    def NAME#2 : BaseSIMDMixedTwoVector<0b1, 0b1, sz, 0b10111, V128, V128,
+                                        asm#2, ".8h", ".16b", []>;
+  }
+  def : Pat<(dty (Op (v8i8 V64:$Rn))),
+            (!cast<Instruction>(NAME) V64:$Rn)>;
+
+  def : Pat<(dty (Op (v16i8 V128:$Rn))),
+            (!cast<Instruction>(NAME#2) V128:$Rn)>;
 }
 
 class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index d015cc15581ad0..18ffff396fa5da 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10329,13 +10329,13 @@ let Predicates = [HasD128] in {
 // 2023 Architecture Extensions:
 //===----------------------------===//
 
-let Uses = [FPMR, FPCR], Predicates = [HasFP8] in {
-  defm F1CVTL  : SIMDMixedTwoVectorFP8<0b00, "f1cvtl">;
-  defm F2CVTL  : SIMDMixedTwoVectorFP8<0b01, "f2cvtl">;
-  defm BF1CVTL : SIMDMixedTwoVectorFP8<0b10, "bf1cvtl">;
-  defm BF2CVTL : SIMDMixedTwoVectorFP8<0b11, "bf2cvtl">;
-  defm FCVTN_F16_F8 : SIMDThreeSameSizeVectorCvt<"fcvtn">;
-  defm FCVTN_F32_F8 : SIMDThreeVectorCvt<"fcvtn">;
+let Predicates = [HasFP8] in {
+  defm F1CVTL  : SIMD_FP8_CVTL<0b00, "f1cvtl", v8f16, int_aarch64_neon_fp8_cvtl1>;
+  defm F2CVTL  : SIMD_FP8_CVTL<0b01, "f2cvtl", v8f16, int_aarch64_neon_fp8_cvtl2>;
+  defm BF1CVTL : SIMD_FP8_CVTL<0b10, "bf1cvtl", v8bf16, int_aarch64_neon_fp8_cvtl1>;
+  defm BF2CVTL : SIMD_FP8_CVTL<0b11, "bf2cvtl", v8bf16, int_aarch64_neon_fp8_cvtl2>;
+  defm FCVTN_F16 : SIMD_FP8_CVTN_F16<"fcvtn", int_aarch64_neon_fp8_fcvtn>;
+  defm FCVTN_F32 : SIMD_FP8_CVTN_F32<"fcvtn", int_aarch64_neon_fp8_fcvtn>;
   defm FSCALE : SIMDThreeVectorFscale<0b1, 0b1, 0b111, "fscale", int_aarch64_neon_fp8_fscale>;
 } // End let Predicates = [HasFP8]
 

>From f9ddbe61469e4ea9f3e72884636881c10419f822 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 10 Dec 2024 18:14:02 +0000
Subject: [PATCH 05/11] [fixup] Add tests, fix calling the wrong LLVM intrinsic

---
 clang/lib/CodeGen/CGBuiltin.cpp               |   4 +-
 .../fp8-intrinsics/acle_neon_fp8_cvt.c        | 308 ++++++++++++++++++
 llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll     | 112 +++++++
 3 files changed, 422 insertions(+), 2 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
 create mode 100644 llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 3f46e7e8c75e93..9eac01e0a77697 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -14062,7 +14062,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
     // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
     //  part of the vector.
-    if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm) {
+    if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm) {
       Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
                                                /*isQuad*/ false));
       Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
@@ -14080,7 +14080,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
     // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
     //  part of the vector.
-    if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm) {
+    if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm) {
       Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
                                                /*isQuad*/ false));
       Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
new file mode 100644
index 00000000000000..7543938f487103
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c
@@ -0,0 +1,308 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -S -O3 -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_bf16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt1_bf16_mf8_fpmu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
+//
+bfloat16x8_t test_vcvt1_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+  return vcvt1_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_low_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt1_low_bf16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
+//
+bfloat16x8_t test_vcvt1_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt1_low_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_bf16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt2_bf16_mf8_fpmu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
+//
+bfloat16x8_t test_vcvt2_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+  return vcvt2_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_low_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt2_low_bf16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
+//
+bfloat16x8_t test_vcvt2_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt2_low_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_high_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt1_high_bf16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT1_I]]
+//
+bfloat16x8_t test_vcvt1_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt1_high_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_high_bf16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt2_high_bf16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x bfloat> [[VBFCVT2_I]]
+//
+bfloat16x8_t test_vcvt2_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt2_high_bf16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_f16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x half> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt1_f16_mf8_fpmu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT1_I]]
+//
+float16x8_t test_vcvt1_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+  return vcvt1_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_low_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT:    ret <8 x half> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt1_low_f16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT1_I]]
+//
+float16x8_t test_vcvt1_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt1_low_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_f16_mf8_fpm(
+// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x half> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt2_f16_mf8_fpmu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT2_I]]
+//
+float16x8_t test_vcvt2_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) {
+  return vcvt2_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_low_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-NEXT:    ret <8 x half> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt2_low_f16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT2_I]]
+//
+float16x8_t test_vcvt2_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt2_low_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_high_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x half> [[VBFCVT1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt1_high_f16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT1_I]]
+//
+float16x8_t test_vcvt1_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt1_high_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_high_f16_mf8_fpm(
+// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-NEXT:    ret <8 x half> [[VBFCVT2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt2_high_f16_mf8_fpmu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VBFCVT2_I]]
+//
+float16x8_t test_vcvt2_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) {
+  return vcvt2_high_f16_mf8_fpm(op, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f32_fpm(
+// CHECK-SAME: <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VFCVTN_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-NEXT:    ret <8 x i8> [[VFCVTN_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f32_fpm13__Float32x4_tS_m(
+// CHECK-CXX-SAME: <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VFCVTN_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-CXX-NEXT:    ret <8 x i8> [[VFCVTN_I]]
+//
+mfloat8x8_t test_vcvt_mf8_f32_fpm(float32x4_t vn, float32x4_t vm, fpm_t fpm) {
+  return vcvt_mf8_f32_fpm(vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcvt_high_mf8_f32_fpm(
+// CHECK-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VD]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-NEXT:    ret <16 x i8> [[VFCVTN2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z26test_vcvt_high_mf8_f32_fpmu13__MFloat8x8_t13__Float32x4_tS_m(
+// CHECK-CXX-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VD]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]])
+// CHECK-CXX-NEXT:    ret <16 x i8> [[VFCVTN2_I]]
+//
+mfloat8x16_t test_vcvt_high_mf8_f32_fpm(mfloat8x8_t vd, float32x4_t vn,
+                                    float32x4_t vm, fpm_t fpm) {
+  return vcvt_high_mf8_f32_fpm(vd, vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f16_fpm(
+// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VFCVTN2_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]])
+// CHECK-NEXT:    ret <8 x i8> [[VFCVTN2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f16_fpm13__Float16x4_tS_m(
+// CHECK-CXX-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VFCVTN2_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]])
+// CHECK-CXX-NEXT:    ret <8 x i8> [[VFCVTN2_I]]
+//
+mfloat8x8_t test_vcvt_mf8_f16_fpm(float16x4_t vn, float16x4_t vm, fpm_t fpm) {
+  return vcvt_mf8_f16_fpm(vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @test_vcvtq_mf8_f16_fpm(
+// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]])
+// CHECK-NEXT:    ret <16 x i8> [[VFCVTN2_I]]
+//
+// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z22test_vcvtq_mf8_f16_fpm13__Float16x8_tS_m(
+// CHECK-CXX-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]])
+// CHECK-CXX-NEXT:    ret <16 x i8> [[VFCVTN2_I]]
+//
+mfloat8x16_t test_vcvtq_mf8_f16_fpm(float16x8_t vn, float16x8_t vm, fpm_t fpm) {
+  return vcvtq_mf8_f16_fpm(vn, vm, fpm);
+}
diff --git a/llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll b/llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll
new file mode 100644
index 00000000000000..6070380d24234b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8 < %s | FileCheck %s
+
+define <8 x bfloat> @test_vbfcvtl1_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl1_low:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bf1cvtl v0.8h, v0.8b
+; CHECK-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> %vn)
+  ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_vbfcvtl1_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl1_high:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bf1cvtl2 v0.8h, v0.16b
+; CHECK-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> %vn)
+  ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_vbfcvtl2_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl2_low:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bf2cvtl v0.8h, v0.8b
+; CHECK-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> %vn)
+  ret <8 x bfloat> %res
+}
+
+define <8 x bfloat> @test_vbfcvtl2_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vbfcvtl2_high:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    bf2cvtl2 v0.8h, v0.16b
+; CHECK-NEXT:    ret
+  %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> %vn)
+  ret <8 x bfloat> %res
+}
+
+
+define <8 x half> @test_vfcvtl1_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl1_low:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    f1cvtl v0.8h, v0.8b
+; CHECK-NEXT:    ret
+   %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> %vn)
+  ret <8 x half> %res
+}
+
+define <8 x half> @test_vfcvtl1_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl1_high:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    f1cvtl2 v0.8h, v0.16b
+; CHECK-NEXT:    ret
+  %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> %vn)
+  ret <8 x half> %res
+}
+
+define <8 x half> @test_vfcvtl2_low(<8 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl2_low:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    f2cvtl v0.8h, v0.8b
+; CHECK-NEXT:    ret
+  %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> %vn)
+  ret <8 x half> %res
+}
+
+define <8 x half> @test_vfcvtl2_high(<16 x i8> %vn) {
+; CHECK-LABEL: test_vfcvtl2_high:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    f2cvtl2 v0.8h, v0.16b
+; CHECK-NEXT:    ret
+  %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> %vn)
+  ret <8 x half> %res
+}
+
+define <8 x i8> @test_vcvtn_low_f8_f32(<4 x float> %vn, <4 x float> %vm) {
+; CHECK-LABEL: test_vcvtn_low_f8_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtn v0.8b, v0.4s, v1.4s
+; CHECK-NEXT:    ret
+  %res = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> %vn, <4 x float> %vm)
+  ret <8 x i8> %res
+}
+
+define <16 x i8> @test_vcvtn_high_f8_f32(<16 x i8> %vd, <4 x float> %vn, <4 x float> %vm) {
+; CHECK-LABEL: test_vcvtn_high_f8_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtn2 v0.16b, v1.4s, v2.4s
+; CHECK-NEXT:    ret
+  %res = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> %vd, <4 x float> %vn, <4 x float> %vm)
+  ret <16 x i8> %res
+}
+
+
+define <8 x i8> @test_vcvtn_f8_f16(<4 x half> %vn, <4 x half> %vm) {
+; CHECK-LABEL: test_vcvtn_f8_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtn v0.8b, v0.4h, v1.4h
+; CHECK-NEXT:    ret
+  %res = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> %vn, <4 x half> %vm)
+  ret <8 x i8> %res
+}
+
+define <16 x i8> @test_vcvtn2_f8_f16(<8 x half> %vn, <8 x half> %vm) {
+; CHECK-LABEL: test_vcvtn2_f8_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fcvtn v0.16b, v0.8h, v1.8h
+; CHECK-NEXT:    ret
+  %res = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> %vn, <8 x half> %vm)
+  ret <16 x i8> %res
+}

>From 66e907df46a1e577029ff790bcae89baa69c0c31 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 17 Dec 2024 10:49:19 +0000
Subject: [PATCH 06/11] [fixup] Refactor much of common code into a helper
 function (NFC)

---
 clang/lib/CodeGen/CGBuiltin.cpp     | 171 +++++++++++-----------------
 clang/lib/CodeGen/CodeGenFunction.h |   4 +
 2 files changed, 69 insertions(+), 106 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 9eac01e0a77697..2ec771cd5eb6f4 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6881,6 +6881,23 @@ Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
   return ConstantInt::get(Ty, neg ? -SV : SV);
 }
 
+Value *CodeGenFunction::EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
+                                           llvm::Type *Ty1, bool Extract,
+                                           SmallVectorImpl<llvm::Value *> &Ops,
+                                           const CallExpr *E,
+                                           const char *name) {
+  llvm::Type *Tys[] = {Ty0, Ty1};
+  if (Extract) {
+    // Op[0] is mfloat8x16_t, but the intrinsic converts only the lower part of
+    // the vector.
+    Tys[1] = llvm::FixedVectorType::get(Int8Ty, 8);
+    Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
+  }
+  llvm::Value *FPM =
+      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
+  return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+}
+
 // Right-shift a vector by a constant.
 Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift,
                                           llvm::Type *Ty, bool usgn,
@@ -12804,6 +12821,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return V;
 
   unsigned Int;
+  bool ExtractLow = false;
   switch (BuiltinID) {
   default: return nullptr;
   case NEON::BI__builtin_neon_vbsl_v:
@@ -14018,117 +14036,58 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2;
     return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2");
   }
-  case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm:
   case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm:
-  case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_cvtl1;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8);
-    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower part of
-    // the vector.
-    if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm) {
-      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
-                                               /*isQuad*/ false));
-      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
-    } else
-      Tys[1] = Ops[0]->getType();
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1");
-  }
-  case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm:
+    ExtractLow = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
+                              llvm::FixedVectorType::get(BFloatTy, 8),
+                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
   case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm:
-  case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_cvtl2;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8);
-    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
-    //  part of the vector.
-    if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm) {
-      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
-                                               /*isQuad*/ false));
-      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
-    } else
-      Tys[1] = Ops[0]->getType();
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2");
-  }
-  case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm:
+    ExtractLow = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
+                              llvm::FixedVectorType::get(BFloatTy, 8),
+                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
   case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm:
-  case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_cvtl1;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
-    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
-    //  part of the vector.
-    if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm) {
-      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
-                                               /*isQuad*/ false));
-      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
-    } else
-      Tys[1] = Ops[0]->getType();
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1");
-  }
-  case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm:
+    ExtractLow = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1,
+                              llvm::FixedVectorType::get(HalfTy, 8),
+                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1");
   case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm:
-  case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_cvtl2;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(HalfTy, 8);
-    // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower
-    //  part of the vector.
-    if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm) {
-      Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false,
-                                               /*isQuad*/ false));
-      Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
-    } else
-      Tys[1] = Ops[0]->getType();
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2");
-  }
-  case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_fcvtn;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8);
-    Tys[1] = Ops[0]->getType();
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
-  }
-  case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_fcvtn;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8);
-    // Gets the expected type, because arm_neon.h casts float16x4_t to int8x8_t
-    Tys[1] = llvm::FixedVectorType::get(HalfTy, 4);
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
-  }
-  case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_fcvtn;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16);
-    // Gets the expected type, because arm_neon.h casts float16x8_t to int8x16_t
-    Tys[1] = llvm::FixedVectorType::get(HalfTy, 8);
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn");
-  }
+    ExtractLow = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2,
+                              llvm::FixedVectorType::get(HalfTy, 8),
+                              Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2");
+  case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
+                              llvm::FixedVectorType::get(Int8Ty, 8),
+                              Ops[0]->getType(), false, Ops, E, "vfcvtn");
+  case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
+                              llvm::FixedVectorType::get(Int8Ty, 8),
+                              llvm::FixedVectorType::get(HalfTy, 4), false, Ops,
+                              E, "vfcvtn");
+  case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm:
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn,
+                              llvm::FixedVectorType::get(Int8Ty, 16),
+                              llvm::FixedVectorType::get(HalfTy, 8), false, Ops,
+                              E, "vfcvtn");
   case NEON::BI__builtin_neon_vcvt_high_mf8_f32_fpm: {
-    Int = Intrinsic::aarch64_neon_fp8_fcvtn2;
-    llvm::Type *Tys[2];
-    Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16);
-    Tys[1] = Ops[1]->getType();
-    Ops[0] = Builder.CreateInsertVector(Tys[0], PoisonValue::get(Tys[0]),
-                                        Ops[0], Builder.getInt64(0));
-    llvm::Value *FPM =
-        EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
-    return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn2");
+    llvm::Type *Ty = llvm::FixedVectorType::get(Int8Ty, 16);
+    Ops[0] = Builder.CreateInsertVector(Ty, PoisonValue::get(Ty), Ops[0],
+                                        Builder.getInt64(0));
+    return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn2,
+                              Ty, Ops[1]->getType(), false, Ops, E, "vfcvtn2");
   }
   case NEON::BI__builtin_neon_vamin_f16:
   case NEON::BI__builtin_neon_vaminq_f16:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 5924ad218a5292..816e612759d458 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4651,6 +4651,10 @@ class CodeGenFunction : public CodeGenTypeCache {
   llvm::Value *EmitFP8NeonCall(llvm::Function *F,
                                SmallVectorImpl<llvm::Value *> &O,
                                llvm::Value *FPM, const char *name);
+  llvm::Value *EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
+                                  llvm::Type *Ty1, bool Extract,
+                                  SmallVectorImpl<llvm::Value *> &Ops,
+                                  const CallExpr *E, const char *name);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
                              const llvm::ElementCount &Count);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);

>From 1836ab53c954e4d4f291615bb241158ef37d2667 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 17 Dec 2024 11:28:49 +0000
Subject: [PATCH 07/11] [fixup] Add target features test, remove redundant bf16
 guard

---
 clang/include/clang/Basic/arm_neon.td         |  2 +-
 .../acle_neon_fp8_cvt.c                       | 43 +++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index cc161c51df6608..a68d9c8bc86325 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2125,7 +2125,7 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in {
   }
 }
 
-let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,bf16,neon" in {
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
   def VBF1CVT_BF16_MF8        : VInst<"vcvt1_bf16_mf8_fpm",      "(QB).V", "m">;
   def VBF1CVT_LOW_BF16_MF8    : VInst<"vcvt1_low_bf16_mf8_fpm",  "B.V",    "Qm">;
   def VBF2CVTL_BF16_MF8       : VInst<"vcvt2_bf16_mf8_fpm",      "(QB).V", "m">;
diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c
new file mode 100644
index 00000000000000..2c7004c7968a46
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c
@@ -0,0 +1,43 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +faminmax -emit-llvm -verify %s -o /dev/null
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+void test_features(float16x4_t vd4, float16x8_t vd8, float32x4_t va4,
+                   mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) {
+  (void) vcvt1_bf16_mf8_fpm(v8, fpm);
+  // expected-error@-1 {{'vcvt1_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt1_low_bf16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt1_low_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_bf16_mf8_fpm(v8, fpm);
+  // expected-error@-1 {{'vcvt2_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_low_bf16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt2_low_bf16_mf8_fpm' requires target feature 'fp8'}}
+
+  (void) vcvt1_high_bf16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt1_high_bf16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_high_bf16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt2_high_bf16_mf8_fpm' requires target feature 'fp8'}}
+
+  (void) vcvt1_f16_mf8_fpm(v8, fpm);
+  // expected-error@-1 {{'vcvt1_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt1_low_f16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt1_low_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_f16_mf8_fpm(v8, fpm);
+  // expected-error@-1 {{'vcvt2_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_low_f16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt2_low_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt1_high_f16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt1_high_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt2_high_f16_mf8_fpm(v16, fpm);
+  // expected-error@-1 {{'vcvt2_high_f16_mf8_fpm' requires target feature 'fp8'}}
+  (void) vcvt_mf8_f32_fpm(va4, va4, fpm);
+  // expected-error@-1 {{'vcvt_mf8_f32_fpm' requires target feature 'fp8'}}
+  (void) vcvt_high_mf8_f32_fpm(v8, va4, va4, fpm);
+  // expected-error@-1 {{'vcvt_high_mf8_f32_fpm' requires target feature 'fp8'}}
+  (void) vcvt_mf8_f16_fpm(vd4, vd4, fpm);
+  // expected-error@-1 {{'vcvt_mf8_f16_fpm' requires target feature 'fp8'}}
+  (void) vcvtq_mf8_f16_fpm(vd8, vd8, fpm);
+  // expected-error@-1 {{'vcvtq_mf8_f16_fpm' requires target feature 'fp8'}}
+}

>From c9b7e94cc8f5a7024d862a876b733ff7f1946bb8 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 17 Dec 2024 11:42:42 +0000
Subject: [PATCH 08/11] [AArch64] Add FP8 Neon intrinsics for dot-product

This patch adds the following intrinsics:

float16x4_t vdot_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpm)
float16x8_t vdotq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm)

float16x4_t vdot_lane_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x4_t vdot_laneq_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vdotq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vdotq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
---
 clang/include/clang/Basic/arm_neon.td         |  22 +++
 clang/include/clang/Basic/arm_neon_incl.td    |   2 +-
 clang/lib/CodeGen/CGBuiltin.cpp               |  47 ++++++
 clang/lib/CodeGen/CodeGenFunction.h           |   5 +
 .../fp8-intrinsics/acle_neon_fp8_fdot.c       | 143 ++++++++++++++++++
 .../acle_neon_fp8_fdot.c                      |  54 +++++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  21 +++
 .../lib/Target/AArch64/AArch64InstrFormats.td |  82 ++++++----
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  14 +-
 llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll    |  74 +++++++++
 10 files changed, 424 insertions(+), 40 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c
 create mode 100644 clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c
 create mode 100644 llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index a68d9c8bc86325..fec91ac754ae6c 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2148,6 +2148,28 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
   def VCVTNQ_F8_F16     : VInst<"vcvtq_mf8_f16_fpm",     ".(>F)(>F)V",      "Qm">;
 }
 
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot2,neon" in {
+  def VDOT_F16_MF8  : VInst<"vdot_f16_mf8_fpm", "(>F)(>F)..V", "m">;
+  def VDOTQ_F16_MF8 : VInst<"vdotq_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+
+  def VDOT_LANE_F16_MF8 : VInst<"vdot_lane_f16_mf8_fpm", "(>F)(>F)..IV", "m",   [ImmCheck<3, ImmCheck0_3, 0>]>;
+  def VDOT_LANEQ_F16_MF8 : VInst<"vdot_laneq_f16_mf8_fpm", "(>F)(>F).QIV", "m",   [ImmCheck<3, ImmCheck0_7, 0>]>;
+
+  def VDOTQ_LANE_F16_MF8 : VInst<"vdotq_lane_f16_mf8_fpm", "(>F)(>F).qIV", "Qm",   [ImmCheck<3, ImmCheck0_3, 0>]>;
+  def VDOTQ_LANEQ_F16_MF8 : VInst<"vdotq_laneq_f16_mf8_fpm", "(>F)(>F)..IV", "Qm",   [ImmCheck<3, ImmCheck0_7, 0>]>;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot4,neon" in {
+  def VDOT_F32_MF8  : VInst<"vdot_f32_mf8_fpm", "(>>F)(>>F)..V", "m">;
+  def VDOTQ_F32_MF8 : VInst<"vdotq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+
+  def VDOT_LANE_F32_MF8 : VInst<"vdot_lane_f32_mf8_fpm", "(>>F)(>>F)..IV", "m",   [ImmCheck<3, ImmCheck0_1, 0>]>;
+  def VDOT_LANEQ_F32_MF8 : VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F).QIV", "m",   [ImmCheck<3, ImmCheck0_3, 0>]>;
+
+  def VDOTQ_LANE_F32_MF8 : VInst<"vdotq_lane_f32_mf8_fpm", "(>>F)(>>F).qIV", "Qm",   [ImmCheck<3, ImmCheck0_1, 0>]>;
+  def VDOTQ_LANEQ_F32_MF8 : VInst<"vdotq_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm",   [ImmCheck<3, ImmCheck0_3, 0>]>;
+}
+
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
   def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
   def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;
diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td
index 91a2bf3020b9a3..b9b9d509c22512 100644
--- a/clang/include/clang/Basic/arm_neon_incl.td
+++ b/clang/include/clang/Basic/arm_neon_incl.td
@@ -302,7 +302,7 @@ class Inst <string n, string p, string t, Operation o, list<ImmCheck> ch = []>{
 class SInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
 class IInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
 class WInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
-class VInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {}
+class VInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {}
 
 // The following instruction classes are implemented via operators
 // instead of builtins. As such these declarations are only used for
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 2ec771cd5eb6f4..1d0b454ecb62f2 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6875,6 +6875,25 @@ Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
   return EmitNeonCall(F, Ops, name);
 }
 
+llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
+    unsigned IID, bool ExtendLane, llvm::Type *RetTy,
+    SmallVectorImpl<llvm::Value *> &Ops, unsigned ICEArguments,
+    const CallExpr *E, const char *name) {
+
+  const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
+                             RetTy->getPrimitiveSizeInBits();
+  llvm::Type *Tys[] = {llvm::FixedVectorType::get(RetTy, ElemCount),
+                       Ops[1]->getType()};
+  if (ExtendLane) {
+    auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
+    Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
+                                        Builder.getInt64(0));
+  }
+  llvm::Value *FPM =
+      EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+  return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+}
+
 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
                                             bool neg) {
   int SV = cast<ConstantInt>(V)->getSExtValue();
@@ -12822,6 +12841,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
 
   unsigned Int;
   bool ExtractLow = false;
+  bool ExtendLane = false;
   switch (BuiltinID) {
   default: return nullptr;
   case NEON::BI__builtin_neon_vbsl_v:
@@ -14089,6 +14109,33 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn2,
                               Ty, Ops[1]->getType(), false, Ops, E, "vfcvtn2");
   }
+
+  case NEON::BI__builtin_neon_vdot_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vdotq_f16_mf8_fpm:
+    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2, false, HalfTy,
+                               Ops, ICEArguments, E, "fdot2");
+  case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm:
+    ExtendLane = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm:
+  case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm:
+    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2_lane,
+                               ExtendLane, HalfTy, Ops, ICEArguments, E,
+                               "fdot2_lane");
+  case NEON::BI__builtin_neon_vdot_f32_mf8_fpm:
+  case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm:
+    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4, false,
+                               FloatTy, Ops, ICEArguments, E, "fdot4");
+  case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm:
+  case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm:
+    ExtendLane = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm:
+  case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm:
+    return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
+                               ExtendLane, FloatTy, Ops, ICEArguments, E,
+                               "fdot4_lane");
   case NEON::BI__builtin_neon_vamin_f16:
   case NEON::BI__builtin_neon_vaminq_f16:
   case NEON::BI__builtin_neon_vamin_f32:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 816e612759d458..09336e54a3be85 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4655,6 +4655,11 @@ class CodeGenFunction : public CodeGenTypeCache {
                                   llvm::Type *Ty1, bool Extract,
                                   SmallVectorImpl<llvm::Value *> &Ops,
                                   const CallExpr *E, const char *name);
+  llvm::Value *EmitFP8NeonFDOTCall(unsigned IID, bool ExtendLane,
+                                   llvm::Type *RetTy,
+                                   SmallVectorImpl<llvm::Value *> &Ops,
+                                   unsigned ICEArguments, const CallExpr *E,
+                                   const char *name);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
                              const llvm::ElementCount &Count);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c
new file mode 100644
index 00000000000000..b273bc2abe8779
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c
@@ -0,0 +1,143 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+
+// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <4 x half> @test_vdot_f16(
+// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT21_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
+// CHECK-NEXT:    ret <4 x half> [[FDOT21_I]]
+//
+float16x4_t test_vdot_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
+  return vdot_f16_mf8_fpm(vd, vn, vm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_f16(
+// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT21_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <8 x half> [[FDOT21_I]]
+//
+float16x8_t test_vdotq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
+  return vdotq_f16_mf8_fpm(vd, vn, vm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_vdot_lane_f16(
+// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
+// CHECK-NEXT:    ret <4 x half> [[FDOT2_LANE1]]
+//
+float16x4_t test_vdot_lane_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
+  return vdot_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <4 x half> @test_vdot_laneq_f16(
+// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT:    ret <4 x half> [[FDOT2_LANE1]]
+//
+float16x4_t test_vdot_laneq_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
+  return vdot_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_lane_f16(
+// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
+// CHECK-NEXT:    ret <8 x half> [[FDOT2_LANE1]]
+//
+float16x8_t test_vdotq_lane_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
+  return vdotq_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_laneq_f16(
+// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT:    ret <8 x half> [[FDOT2_LANE1]]
+//
+float16x8_t test_vdotq_laneq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
+  return vdotq_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <2 x float> @test_vdot_f32(
+// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
+// CHECK-NEXT:    ret <2 x float> [[FDOT4_I]]
+//
+float32x2_t test_vdot_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
+  return vdot_f32_mf8_fpm(vd, vn, vm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_f32(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <4 x float> [[FDOT4_I]]
+//
+float32x4_t test_vdotq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
+  return vdotq_f32_mf8_fpm(vd, vn, vm, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <2 x float> @test_vdot_lane_f32(
+// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT:    ret <2 x float> [[FDOT4_LANE]]
+//
+float32x2_t test_vdot_lane_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
+  return vdot_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <2 x float> @test_vdot_laneq_f32(
+// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
+// CHECK-NEXT:    ret <2 x float> [[FDOT4_LANE]]
+//
+float32x2_t test_vdot_laneq_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
+  return vdot_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_lane_f32(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT:    ret <4 x float> [[FDOT4_LANE]]
+//
+float32x4_t test_vdotq_lane_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
+  return vdotq_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_laneq_f32(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT:    [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
+// CHECK-NEXT:    ret <4 x float> [[FDOT4_LANE]]
+//
+float32x4_t test_vdotq_laneq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
+  return vdotq_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
+}
diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c
new file mode 100644
index 00000000000000..8bfe3ac26ab2c3
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c
@@ -0,0 +1,54 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +faminmax -target-feature +fp8 -emit-llvm -verify %s -o /dev/null
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+void test_features(float16x4_t vd4, float16x8_t vd8, float32x4_t va4, float32x2_t va2,
+                   mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) {
+  (void) vdot_f16_mf8_fpm(vd4, v8, v8, fpm);
+// expected-error@-1 {{'vdot_f16_mf8_fpm' requires target feature 'fp8dot2'}}
+  (void) vdotq_f16_mf8_fpm(vd8, v16, v16, fpm);
+// expected-error@-1 {{'vdotq_f16_mf8_fpm' requires target feature 'fp8dot2'}}
+  (void) vdot_lane_f16_mf8_fpm(vd4, v8, v8, 3, fpm);
+// expected-error@-1 {{'__builtin_neon_vdot_lane_f16_mf8_fpm' needs target feature fp8dot2,neon}}
+  (void) vdot_laneq_f16_mf8_fpm(vd4, v8, v16, 7, fpm);
+// expected-error@-1 {{'__builtin_neon_vdot_laneq_f16_mf8_fpm' needs target feature fp8dot2,neon}}
+  (void) vdotq_lane_f16_mf8_fpm(vd8, v16, v8, 3, fpm);
+// expected-error@-1 {{'__builtin_neon_vdotq_lane_f16_mf8_fpm' needs target feature fp8dot2,neon}}
+  (void) vdotq_laneq_f16_mf8_fpm(vd8, v16, v16, 7, fpm);
+// expected-error@-1 {{'__builtin_neon_vdotq_laneq_f16_mf8_fpm' needs target feature fp8dot2,neon}}
+
+  (void) vdot_f32_mf8_fpm(va2, v8, v8, fpm);
+// expected-error@-1 {{'vdot_f32_mf8_fpm' requires target feature 'fp8dot4'}}
+  (void) vdotq_f32_mf8_fpm(va4, v16, v16, fpm);
+// expected-error@-1 {{'vdotq_f32_mf8_fpm' requires target feature 'fp8dot4'}}
+  (void) vdot_lane_f32_mf8_fpm(va2, v8, v8, 1, fpm);
+// expected-error@-1 {{'__builtin_neon_vdot_lane_f32_mf8_fpm' needs target feature fp8dot4,neon}}
+  (void) vdot_laneq_f32_mf8_fpm(va2, v8, v16, 3, fpm);
+// expected-error@-1 {{'__builtin_neon_vdot_laneq_f32_mf8_fpm' needs target feature fp8dot4,neon}}
+  (void) vdotq_lane_f32_mf8_fpm(va4, v16, v8, 1, fpm);
+// expected-error@-1 {{'__builtin_neon_vdotq_lane_f32_mf8_fpm' needs target feature fp8dot4,neon}}
+  (void) vdotq_laneq_f32_mf8_fpm(va4, v16, v16, 3, fpm);
+// expected-error@-1 {{'__builtin_neon_vdotq_laneq_f32_mf8_fpm' needs target feature fp8dot4,neon}}
+}
+
+void test_imm(float16x4_t vd4, float16x8_t vd8, float32x2_t va2, float32x4_t va4,
+              mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) {
+  (void) vdot_lane_f16_mf8_fpm(vd4, v8, v8, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
+  (void) vdot_laneq_f16_mf8_fpm(vd4, v8, v16, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
+  (void) vdotq_lane_f16_mf8_fpm(vd8, v16, v8, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
+  (void) vdotq_laneq_f16_mf8_fpm(vd8, v16, v16, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
+  (void) vdot_lane_f32_mf8_fpm(va2, v8, v8, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
+  (void) vdot_laneq_f32_mf8_fpm(va2, v8, v16, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
+  (void) vdotq_lane_f32_mf8_fpm(va4, v16, v8, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
+  (void) vdotq_laneq_f32_mf8_fpm(va4, v16, v16, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 2dcacce389ef81..552d3010333c17 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1026,6 +1026,27 @@ def int_aarch64_st64bv0: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], dat
                              llvm_anyvector_ty,
                              LLVMMatchType<1>],
                             [IntrReadMem, IntrInaccessibleMemOnly]>;
+
+  // Dot-product
+  class AdvSIMD_FP8_DOT_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMMatchType<0>,
+                             llvm_anyvector_ty,
+                             LLVMMatchType<1>],
+                             [IntrReadMem, IntrInaccessibleMemOnly]>;
+  class AdvSIMD_FP8_DOT_LANE_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMMatchType<0>,
+                             llvm_anyvector_ty,
+                             llvm_v16i8_ty,
+                             llvm_i32_ty],
+                             [IntrReadMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<3>>]>;
+
+  def int_aarch64_neon_fp8_fdot2 : AdvSIMD_FP8_DOT_Intrinsic;
+  def int_aarch64_neon_fp8_fdot2_lane : AdvSIMD_FP8_DOT_LANE_Intrinsic;
+
+  def int_aarch64_neon_fp8_fdot4 : AdvSIMD_FP8_DOT_Intrinsic;
+  def int_aarch64_neon_fp8_fdot4_lane : AdvSIMD_FP8_DOT_LANE_Intrinsic;
 }
 
 def llvm_nxv1i1_ty  : LLVMType<nxv1i1>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index df224b580c6e5f..484caeff05572e 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6577,19 +6577,22 @@ multiclass SIMD_FP8_CVTN_F32<string asm, SDPatternOperator Op> {
             (!cast<Instruction>(NAME # 2v16f8) V128:$_Rd, V128:$Rn, V128:$Rm)>;
 }
 
-// TODO: Create a new Value Type v8f8 and v16f8
-multiclass SIMDThreeSameVectorDOT2<string asm> {
-   def v4f16 : BaseSIMDThreeSameVectorDot<0b0, 0b0, 0b01, 0b1111, asm, ".4h", ".8b",
-                                          V64, v4f16, v8i8, null_frag>;
-   def v8f16 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b01, 0b1111, asm, ".8h", ".16b",
-                                          V128, v8f16, v16i8, null_frag>;
+multiclass SIMD_FP8_Dot2<string asm, SDPatternOperator op> {
+  let Uses = [FPMR, FPCR], mayLoad = 1 in {
+    def v4f16 : BaseSIMDThreeSameVectorDot<0b0, 0b0, 0b01, 0b1111, asm, ".4h", ".8b",
+                                           V64, v4f16, v8i8, op>;
+    def v8f16 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b01, 0b1111, asm, ".8h", ".16b",
+                                           V128, v8f16, v16i8, op>;
+  }
 }
 
-multiclass SIMDThreeSameVectorDOT4<string asm> {
-   def v2f32 : BaseSIMDThreeSameVectorDot<0b0, 0b0, 0b00, 0b1111, asm, ".2s", ".8b",
-                                          V64, v2f32, v8i8, null_frag>;
-   def v4f32 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1111, asm, ".4s", ".16b",
-                                          V128, v4f32, v16i8, null_frag>;
+multiclass SIMD_FP8_Dot4<string asm, SDPatternOperator op> {
+  let Uses = [FPMR, FPCR], mayLoad = 1 in {
+    def v2f32 : BaseSIMDThreeSameVectorDot<0b0, 0b0, 0b00, 0b1111, asm, ".2s", ".8b",
+                                           V64, v2f32, v8i8, op>;
+    def v4f32 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1111, asm, ".4s", ".16b",
+                                           V128, v4f32, v16i8, op>;
+  }
 }
 
 let mayRaiseFPException = 1, Uses = [FPCR] in
@@ -9133,15 +9136,16 @@ class BaseSIMDThreeSameVectorIndexS<bit Q, bit U, bits<2> size, bits<4> opc, str
                                     string dst_kind, string lhs_kind, string rhs_kind,
                                     RegisterOperand RegType,
                                     ValueType AccumType, ValueType InputType,
+                                    AsmVectorIndexOpnd VIdx,
                                     SDPatternOperator OpNode> :
         BaseSIMDIndexedTied<Q, U, 0b0, size, opc, RegType, RegType, V128,
-                            VectorIndexS, asm, "", dst_kind, lhs_kind, rhs_kind,
+                            VIdx, asm, "", dst_kind, lhs_kind, rhs_kind,
         [(set (AccumType RegType:$dst),
               (AccumType (OpNode (AccumType RegType:$Rd),
                                  (InputType RegType:$Rn),
                                  (InputType (bitconvert (AccumType
                                     (AArch64duplane32 (v4i32 V128:$Rm),
-                                        VectorIndexS:$idx)))))))]> {
+                                        VIdx:$idx)))))))]> {
   bits<2> idx;
   let Inst{21}    = idx{0};  // L
   let Inst{11}    = idx{1};  // H
@@ -9150,17 +9154,24 @@ class BaseSIMDThreeSameVectorIndexS<bit Q, bit U, bits<2> size, bits<4> opc, str
 multiclass SIMDThreeSameVectorDotIndex<bit U, bit Mixed, bits<2> size, string asm,
                                        SDPatternOperator OpNode> {
   def v8i8  : BaseSIMDThreeSameVectorIndexS<0, U, size, {0b111, Mixed}, asm, ".2s", ".8b", ".4b",
-                                              V64, v2i32, v8i8, OpNode>;
+                                              V64, v2i32, v8i8, VectorIndexS, OpNode>;
   def v16i8 : BaseSIMDThreeSameVectorIndexS<1, U, size, {0b111, Mixed}, asm, ".4s", ".16b", ".4b",
-                                              V128, v4i32, v16i8, OpNode>;
+                                              V128, v4i32, v16i8, VectorIndexS, OpNode>;
 }
 
-// TODO: The vectors v8i8 and v16i8 should be v8f8 and v16f8
-multiclass SIMDThreeSameVectorFP8DOT4Index<string asm> {
-  def v8f8 : BaseSIMDThreeSameVectorIndexS<0b0, 0b0, 0b00, 0b0000, asm, ".2s", ".8b", ".4b",
-                                           V64, v2f32, v8i8, null_frag>;
-  def v16f8 : BaseSIMDThreeSameVectorIndexS<0b1, 0b0, 0b00, 0b0000, asm, ".4s", ".16b",".4b",
-                                            V128, v4f32, v16i8, null_frag>;
+multiclass SIMD_FP8_Dot4_Index<string asm, SDPatternOperator op> {
+  let Uses = [FPMR, FPCR], mayLoad = 1 in {
+    def v2f32 : BaseSIMDThreeSameVectorIndexS<0b0, 0b0, 0b00, 0b0000, asm, ".2s", ".8b", ".4b",
+                                              V64, v2f32, v8i8, VectorIndexS32b_timm, null_frag>;
+    def v4f32 : BaseSIMDThreeSameVectorIndexS<0b1, 0b0, 0b00, 0b0000, asm, ".4s", ".16b",".4b",
+                                              V128, v4f32, v16i8, VectorIndexS32b_timm, null_frag>;
+  }
+
+  def : Pat<(v2f32 (op (v2f32 V64:$Rd), (v8i8 V64:$Rn), (v16i8 V128:$Rm), VectorIndexS32b_timm:$Idx)),
+            (!cast<Instruction>(NAME # v2f32) $Rd, $Rn, $Rm, $Idx)>;
+
+  def : Pat<(v4f32 (op (v4f32 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm), VectorIndexS32b_timm:$Idx)),
+            (!cast<Instruction>(NAME # v4f32) $Rd, $Rn, $Rm, $Idx)>;
 }
 
 // ARMv8.2-A Fused Multiply Add-Long Instructions (Indexed)
@@ -9169,14 +9180,15 @@ class BaseSIMDThreeSameVectorIndexH<bit Q, bit U, bits<2> sz, bits<4> opc, strin
                                       string dst_kind, string lhs_kind,
                                       string rhs_kind, RegisterOperand RegType,
                                       RegisterOperand RegType_lo, ValueType AccumType,
-                                      ValueType InputType, SDPatternOperator OpNode> :
+                                      ValueType InputType, AsmVectorIndexOpnd VIdx,
+                                      SDPatternOperator OpNode> :
         BaseSIMDIndexedTied<Q, U, 0, sz, opc, RegType, RegType, RegType_lo,
-                            VectorIndexH, asm, "", dst_kind, lhs_kind, rhs_kind,
+                            VIdx, asm, "", dst_kind, lhs_kind, rhs_kind,
           [(set (AccumType RegType:$dst),
                 (AccumType (OpNode (AccumType RegType:$Rd),
                                    (InputType RegType:$Rn),
                                    (InputType (AArch64duplane16 (v8f16 V128_lo:$Rm),
-                                                VectorIndexH:$idx)))))]> {
+                                                VIdx:$idx)))))]> {
   // idx = H:L:M
   bits<3> idx;
   let Inst{11} = idx{2}; // H
@@ -9187,19 +9199,25 @@ class BaseSIMDThreeSameVectorIndexH<bit Q, bit U, bits<2> sz, bits<4> opc, strin
 multiclass SIMDThreeSameVectorFMLIndex<bit U, bits<4> opc, string asm,
                                        SDPatternOperator OpNode> {
   def v4f16 : BaseSIMDThreeSameVectorIndexH<0, U, 0b10, opc, asm, ".2s", ".2h", ".h",
-                                              V64, V128_lo, v2f32, v4f16, OpNode>;
+                                              V64, V128_lo, v2f32, v4f16, VectorIndexH, OpNode>;
   def v8f16 : BaseSIMDThreeSameVectorIndexH<1, U, 0b10, opc, asm, ".4s", ".4h", ".h",
-                                              V128, V128_lo, v4f32, v8f16, OpNode>;
+                                              V128, V128_lo, v4f32, v8f16, VectorIndexH, OpNode>;
 }
 
 //----------------------------------------------------------------------------
 // FP8 Advanced SIMD vector x indexed element
-// TODO: Replace value types v8i8 and v16i8 by v8f8 and v16f8
-multiclass SIMDThreeSameVectorFP8DOT2Index<string asm> {
-  def v4f16 : BaseSIMDThreeSameVectorIndexH<0b0, 0b0, 0b01, 0b0000, asm, ".4h", ".8b", ".2b",
-                                            V64, V128_lo, v4f16, v8i8, null_frag>;
-  def v8f16 : BaseSIMDThreeSameVectorIndexH<0b1, 0b0, 0b01, 0b0000, asm, ".8h", ".16b", ".2b",
-                                            V128, V128_lo, v8f16, v8i16, null_frag>;
+multiclass SIMD_FP8_Dot2_Index<string asm, SDPatternOperator op> {
+  let Uses = [FPMR, FPCR], mayLoad = 1 in { 
+    def v4f16 : BaseSIMDThreeSameVectorIndexH<0b0, 0b0, 0b01, 0b0000, asm, ".4h", ".8b", ".2b",
+                                              V64, V128_lo, v4f16, v8i8, VectorIndexH32b_timm, null_frag>;
+    def v8f16 : BaseSIMDThreeSameVectorIndexH<0b1, 0b0, 0b01, 0b0000, asm, ".8h", ".16b", ".2b",
+                                              V128, V128_lo, v8f16, v16i8, VectorIndexH32b_timm, null_frag>;
+  }
+  def : Pat<(v4f16 (op (v4f16 V64:$Rd), (v8i8 V64:$Rn), (v16i8 V128_lo:$Rm), VectorIndexH32b_timm:$Idx)),
+            (!cast<Instruction>(NAME # v4f16) $Rd, $Rn, $Rm, $Idx)>;
+
+  def : Pat<(v8f16 (op (v8f16 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128_lo:$Rm), VectorIndexH32b_timm:$Idx)),
+            (!cast<Instruction>(NAME # v8f16) $Rd, $Rn, $Rm, $Idx)>;
 }
 
 multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 18ffff396fa5da..ae3f0e820a24e5 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1489,7 +1489,7 @@ class BaseSIMDSUDOTIndex<bit Q, string dst_kind, string lhs_kind,
                          ValueType AccumType, ValueType InputType>
       : BaseSIMDThreeSameVectorIndexS<Q, 0, 0b00, 0b1111, "sudot", dst_kind,
                                         lhs_kind, rhs_kind, RegType, AccumType,
-                                        InputType, null_frag> {
+                                        InputType, VectorIndexS, null_frag> {
   let Pattern = [(set (AccumType RegType:$dst),
                       (AccumType (AArch64usdot (AccumType RegType:$Rd),
                                  (InputType (bitconvert (AccumType
@@ -10374,14 +10374,14 @@ let Uses = [FPMR, FPCR], Predicates = [HasFP8FMA] in {
  defm FMLALLTT : SIMDThreeSameVectorMLAL<0b1, 0b01, "fmlalltt">;
 } // End let Predicates = [HasFP8FMA]
 
-let Uses = [FPMR, FPCR], Predicates = [HasFP8DOT2] in {
- defm FDOTlane : SIMDThreeSameVectorFP8DOT2Index<"fdot">;
- defm FDOT : SIMDThreeSameVectorDOT2<"fdot">;
+let Predicates = [HasFP8DOT2] in {
+ defm FDOTlane : SIMD_FP8_Dot2_Index<"fdot", int_aarch64_neon_fp8_fdot2_lane>;
+ defm FDOT : SIMD_FP8_Dot2<"fdot", int_aarch64_neon_fp8_fdot2>;
 } // End let Predicates = [HasFP8DOT2]
 
-let Uses = [FPMR, FPCR], Predicates = [HasFP8DOT4] in {
- defm FDOTlane : SIMDThreeSameVectorFP8DOT4Index<"fdot">;
- defm FDOT : SIMDThreeSameVectorDOT4<"fdot">;
+let Predicates = [HasFP8DOT4] in {
+ defm FDOTlane : SIMD_FP8_Dot4_Index<"fdot", int_aarch64_neon_fp8_fdot4_lane>;
+ defm FDOT : SIMD_FP8_Dot4<"fdot", int_aarch64_neon_fp8_fdot4>;
 } // End let Predicates = [HasFP8DOT4]
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll b/llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll
new file mode 100644
index 00000000000000..b7a35c5fddf170
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8dot2,+fp8dot4 < %s | FileCheck %s
+
+define <4 x half> @test_fdot_f16(<4 x half> %vd, <8 x i8> %vn, <8 x i8> %vm) {
+; CHECK-LABEL: test_fdot_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.4h, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %res = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> %vd, <8 x i8> %vn, <8 x i8> %vm)
+  ret <4 x half> %res
+}
+
+define <8 x half> @test_fdotq_f16(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fdotq_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.8h, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %res = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm)
+  ret <8 x half> %res
+}
+
+define <4 x half> @test_fdot_lane_f16(<4 x half> %vd, <8 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fdot_lane_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.4h, v1.8b, v2.2b[0]
+; CHECK-NEXT:    ret
+  %res = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> %vd, <8 x i8> %vn, <16 x i8> %vm, i32 0)
+  ret <4 x half> %res
+}
+
+define <8 x half> @test_fdotq_lane_f16(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fdotq_lane_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.8h, v1.16b, v2.2b[7]
+; CHECK-NEXT:    ret
+  %res = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm, i32 7)
+  ret <8 x half> %res
+}
+
+define <2 x float> @test_fdot_f32(<2 x float> %vd, <8 x i8> %vn, <8 x i8> %vm) {
+; CHECK-LABEL: test_fdot_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.2s, v1.8b, v2.8b
+; CHECK-NEXT:    ret
+  %res = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> %vd, <8 x i8> %vn, <8 x i8> %vm)
+  ret <2 x float> %res
+}
+
+define <4 x float> @test_fdotq_f32(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fdotq_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %res = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm)
+  ret <4 x float> %res
+}
+
+define <2 x float> @test_fdot_lane_f32(<2 x float> %vd, <8 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fdot_lane_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.2s, v1.8b, v2.4b[0]
+; CHECK-NEXT:    ret
+  %res = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> %vd, <8 x i8> %vn, <16 x i8> %vm, i32 0)
+  ret <2 x float> %res
+}
+
+define <4 x float> @test_fdotq_lane_f32(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fdotq_lane_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fdot v0.4s, v1.16b, v2.4b[3]
+; CHECK-NEXT:    ret
+  %res = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm, i32 3)
+  ret <4 x float> %res
+}

>From cde75eedb6ac96d9719e5c4b90be2c91f3cf14ef Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 17 Dec 2024 13:23:31 +0000
Subject: [PATCH 09/11] [fixup] Remove not needed argument (NFC)

---
 clang/lib/CodeGen/CGBuiltin.cpp     | 15 ++++++---------
 clang/lib/CodeGen/CodeGenFunction.h |  3 +--
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 1d0b454ecb62f2..33a297e210340d 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6877,8 +6877,7 @@ Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
 
 llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
     unsigned IID, bool ExtendLane, llvm::Type *RetTy,
-    SmallVectorImpl<llvm::Value *> &Ops, unsigned ICEArguments,
-    const CallExpr *E, const char *name) {
+    SmallVectorImpl<llvm::Value *> &Ops, const CallExpr *E, const char *name) {
 
   const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
                              RetTy->getPrimitiveSizeInBits();
@@ -6890,7 +6889,7 @@ llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
                                         Builder.getInt64(0));
   }
   llvm::Value *FPM =
-      EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E);
+      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
   return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
 }
 
@@ -14113,7 +14112,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vdot_f16_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_f16_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2, false, HalfTy,
-                               Ops, ICEArguments, E, "fdot2");
+                               Ops, E, "fdot2");
   case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm:
     ExtendLane = true;
@@ -14121,12 +14120,11 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2_lane,
-                               ExtendLane, HalfTy, Ops, ICEArguments, E,
-                               "fdot2_lane");
+                               ExtendLane, HalfTy, Ops, E, "fdot2_lane");
   case NEON::BI__builtin_neon_vdot_f32_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4, false,
-                               FloatTy, Ops, ICEArguments, E, "fdot4");
+                               FloatTy, Ops, E, "fdot4");
   case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm:
     ExtendLane = true;
@@ -14134,8 +14132,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
-                               ExtendLane, FloatTy, Ops, ICEArguments, E,
-                               "fdot4_lane");
+                               ExtendLane, FloatTy, Ops, E, "fdot4_lane");
   case NEON::BI__builtin_neon_vamin_f16:
   case NEON::BI__builtin_neon_vaminq_f16:
   case NEON::BI__builtin_neon_vamin_f32:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 09336e54a3be85..df37d75d5811d7 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4658,8 +4658,7 @@ class CodeGenFunction : public CodeGenTypeCache {
   llvm::Value *EmitFP8NeonFDOTCall(unsigned IID, bool ExtendLane,
                                    llvm::Type *RetTy,
                                    SmallVectorImpl<llvm::Value *> &Ops,
-                                   unsigned ICEArguments, const CallExpr *E,
-                                   const char *name);
+                                   const CallExpr *E, const char *name);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx,
                              const llvm::ElementCount &Count);
   llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);

>From 5e1b6df74e64111594161d96c6b3f67f0cd40ad8 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Tue, 17 Dec 2024 17:10:38 +0000
Subject: [PATCH 10/11] [AArch64] Implement NEON FP8 fused multiply-add
 intrinsics (non-indexed)

This patch adds the following intrinsics:

    float16x8_t vmlalbq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, fpm_t)
    float16x8_t vmlaltq_f16_mf8_fpm(float16x8_t, mfloat8x16_t, mfloat8x16_t, fpm_t)

    float32x4_t vmlallbbq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t)
    float32x4_t vmlallbtq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t)
    float32x4_t vmlalltbq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t)
    float32x4_t vmlallttq_f32_mf8_fpm(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t)
---
 clang/include/clang/Basic/arm_neon.td         |  11 ++
 clang/lib/CodeGen/CGBuiltin.cpp               |  43 +++++--
 clang/lib/CodeGen/CodeGenFunction.h           |   4 +-
 .../fp8-intrinsics/acle_neon_fp8_fmla.c       | 117 ++++++++++++++++++
 .../acle_neon_fp8_fmla.c                      |  22 ++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  17 +++
 .../lib/Target/AArch64/AArch64InstrFormats.td |   9 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  14 +--
 llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll    |  56 +++++++++
 9 files changed, 271 insertions(+), 22 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c
 create mode 100644 clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c
 create mode 100644 llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index fec91ac754ae6c..79de4a098a9dce 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2170,6 +2170,17 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot4,neon" in {
   def VDOTQ_LANEQ_F32_MF8 : VInst<"vdotq_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm",   [ImmCheck<3, ImmCheck0_3, 0>]>;
 }
 
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8fma,neon" in {
+  def VMLALB_F16_F8 : VInst<"vmlalbq_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+  def VMLALT_F16_F8 : VInst<"vmlaltq_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+
+  def VMLALLBB_F32_F8 : VInst<"vmlallbbq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+  def VMLALLBT_F32_F8 : VInst<"vmlallbtq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+  def VMLALLTB_F32_F8 : VInst<"vmlalltbq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+  def VMLALLTT_F32_F8 : VInst<"vmlallttq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+}
+
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
   def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
   def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 33a297e210340d..87ee60d9729d04 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6868,11 +6868,14 @@ Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value*> &Ops,
     return Builder.CreateCall(F, Ops, name);
 }
 
-Value *CodeGenFunction::EmitFP8NeonCall(Function *F,
+Value *CodeGenFunction::EmitFP8NeonCall(unsigned IID,
+                                        ArrayRef<llvm::Type *> Tys,
                                         SmallVectorImpl<Value *> &Ops,
-                                        Value *FPM, const char *name) {
+                                        const CallExpr *E, const char *name) {
+  llvm::Value *FPM =
+      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
   Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), FPM);
-  return EmitNeonCall(F, Ops, name);
+  return EmitNeonCall(CGM.getIntrinsic(IID, Tys), Ops, name);
 }
 
 llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
@@ -6888,9 +6891,7 @@ llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
     Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
                                         Builder.getInt64(0));
   }
-  llvm::Value *FPM =
-      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
-  return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+  return EmitFP8NeonCall(IID, Tys, Ops, E, name);
 }
 
 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
@@ -6911,9 +6912,7 @@ Value *CodeGenFunction::EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
     Tys[1] = llvm::FixedVectorType::get(Int8Ty, 8);
     Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
   }
-  llvm::Value *FPM =
-      EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E);
-  return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name);
+  return EmitFP8NeonCall(IID, Tys, Ops, E, name);
 }
 
 // Right-shift a vector by a constant.
@@ -14133,6 +14132,32 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
   case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
                                ExtendLane, FloatTy, Ops, E, "fdot4_lane");
+
+  case NEON::BI__builtin_neon_vmlalbq_f16_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalb,
+                           {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
+                           "vmlal");
+  case NEON::BI__builtin_neon_vmlaltq_f16_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalt,
+                           {llvm::FixedVectorType::get(HalfTy, 8)}, Ops, E,
+                           "vmlal");
+  case NEON::BI__builtin_neon_vmlallbbq_f32_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlallbb,
+                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+                           "vmlall");
+  case NEON::BI__builtin_neon_vmlallbtq_f32_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlallbt,
+                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+                           "vmlall");
+  case NEON::BI__builtin_neon_vmlalltbq_f32_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalltb,
+                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+                           "vmlall");
+  case NEON::BI__builtin_neon_vmlallttq_f32_mf8_fpm:
+    return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalltt,
+                           {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
+                           "vmlall");
+
   case NEON::BI__builtin_neon_vamin_f16:
   case NEON::BI__builtin_neon_vaminq_f16:
   case NEON::BI__builtin_neon_vamin_f32:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index df37d75d5811d7..15bea6f95953ef 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4648,9 +4648,9 @@ class CodeGenFunction : public CodeGenTypeCache {
                             SmallVectorImpl<llvm::Value*> &O,
                             const char *name,
                             unsigned shift = 0, bool rightshift = false);
-  llvm::Value *EmitFP8NeonCall(llvm::Function *F,
+  llvm::Value *EmitFP8NeonCall(unsigned IID, ArrayRef<llvm::Type *> Tys,
                                SmallVectorImpl<llvm::Value *> &O,
-                               llvm::Value *FPM, const char *name);
+                               const CallExpr *E, const char *name);
   llvm::Value *EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0,
                                   llvm::Type *Ty1, bool Extract,
                                   SmallVectorImpl<llvm::Value *> &Ops,
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c
new file mode 100644
index 00000000000000..528bbd69451865
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c
@@ -0,0 +1,117 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1        -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix CHECK-CXX
+
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8fma -disable-O0-optnone -Werror -Wall -S -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vmlalb(
+// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLAL1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <8 x half> [[VMLAL1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z11test_vmlalb13__Float16x8_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLAL1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VMLAL1_I]]
+//
+float16x8_t test_vmlalb(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlalbq_f16_mf8_fpm(vd, vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vmlalt(
+// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLAL1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <8 x half> [[VMLAL1_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z11test_vmlalt13__Float16x8_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLAL1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <8 x half> [[VMLAL1_I]]
+//
+float16x8_t test_vmlalt(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlaltq_f16_mf8_fpm(vd, vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlallbb(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z13test_vmlallbb13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+float32x4_t test_vmlallbb(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlallbbq_f32_mf8_fpm(vd, vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlallbt(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z13test_vmlallbt13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+float32x4_t test_vmlallbt(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlallbtq_f32_mf8_fpm(vd, vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlalltb(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z13test_vmlalltb13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltb.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+float32x4_t test_vmlalltb(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlalltbq_f32_mf8_fpm(vd, vn, vm, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlalltt(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z13test_vmlalltt13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_I]]
+//
+float32x4_t test_vmlalltt(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlallttq_f32_mf8_fpm(vd, vn, vm, fpm);
+}
diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c
new file mode 100644
index 00000000000000..fcdd14e583101e
--- /dev/null
+++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +faminmax -target-feature +fp8 -emit-llvm -verify %s -o /dev/null
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+void test_features(float16x8_t a, float32x4_t b, mfloat8x16_t u, fpm_t fpm) {
+
+  (void) vmlalbq_f16_mf8_fpm(a, u, u, fpm);
+  // expected-error@-1 {{'vmlalbq_f16_mf8_fpm' requires target feature 'fp8fma'}}
+  (void) vmlaltq_f16_mf8_fpm(a, u, u, fpm);
+  // expected-error@-1 {{'vmlaltq_f16_mf8_fpm' requires target feature 'fp8fma'}}
+  (void) vmlallbbq_f32_mf8_fpm(b, u, u, fpm);
+  // expected-error@-1 {{'vmlallbbq_f32_mf8_fpm' requires target feature 'fp8fma'}}
+  (void) vmlallbtq_f32_mf8_fpm(b, u, u, fpm);
+  // expected-error@-1 {{'vmlallbtq_f32_mf8_fpm' requires target feature 'fp8fma'}}
+  (void) vmlalltbq_f32_mf8_fpm(b, u, u, fpm);
+  // expected-error@-1 {{'vmlalltbq_f32_mf8_fpm' requires target feature 'fp8fma'}}
+  (void) vmlallttq_f32_mf8_fpm(b, u, u, fpm);
+  // expected-error@-1 {{'vmlallttq_f32_mf8_fpm' requires target feature 'fp8fma'}}
+}
+
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 552d3010333c17..39ee36ed7ed09f 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1047,6 +1047,23 @@ def int_aarch64_st64bv0: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], dat
 
   def int_aarch64_neon_fp8_fdot4 : AdvSIMD_FP8_DOT_Intrinsic;
   def int_aarch64_neon_fp8_fdot4_lane : AdvSIMD_FP8_DOT_LANE_Intrinsic;
+
+
+// Fused multiply-add
+  class AdvSIMD_FP8_FMLA_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                            [LLVMMatchType<0>,
+                             llvm_v16i8_ty,
+                             llvm_v16i8_ty],
+                             [IntrReadMem, IntrInaccessibleMemOnly]>;
+
+  def int_aarch64_neon_fp8_fmlalb : AdvSIMD_FP8_FMLA_Intrinsic;
+  def int_aarch64_neon_fp8_fmlalt : AdvSIMD_FP8_FMLA_Intrinsic;
+
+  def int_aarch64_neon_fp8_fmlallbb : AdvSIMD_FP8_FMLA_Intrinsic;
+  def int_aarch64_neon_fp8_fmlallbt : AdvSIMD_FP8_FMLA_Intrinsic;
+  def int_aarch64_neon_fp8_fmlalltb : AdvSIMD_FP8_FMLA_Intrinsic;
+  def int_aarch64_neon_fp8_fmlalltt : AdvSIMD_FP8_FMLA_Intrinsic;
 }
 
 def llvm_nxv1i1_ty  : LLVMType<nxv1i1>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 484caeff05572e..5f0a8f21919222 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -6511,14 +6511,15 @@ multiclass SIMDThreeSameVectorFML<bit U, bit b13, bits<3> size, string asm,
                                          v4f32, v8f16, OpNode>;
 }
 
-multiclass SIMDThreeSameVectorMLA<bit Q, string asm>{
+multiclass SIMDThreeSameVectorMLA<bit Q, string asm, SDPatternOperator op> {
+
   def v8f16 : BaseSIMDThreeSameVectorDot<Q, 0b0, 0b11, 0b1111, asm, ".8h", ".16b",
-                                         V128, v8f16, v16i8, null_frag>;
+                                         V128, v8f16, v16i8, op>;
 }
 
-multiclass SIMDThreeSameVectorMLAL<bit Q, bits<2> sz, string asm>{
+multiclass SIMDThreeSameVectorMLAL<bit Q, bits<2> sz, string asm, SDPatternOperator op> {
   def v4f32 : BaseSIMDThreeSameVectorDot<Q, 0b0, sz, 0b1000, asm, ".4s", ".16b",
-                                         V128, v4f32, v16i8, null_frag>;
+                                         V128, v4f32, v16i8, op>;
 }
 
 // FP8 assembly/disassembly classes
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index ae3f0e820a24e5..1a550f4be6db5a 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10358,7 +10358,7 @@ let Predicates = [HasNEON, HasFAMINMAX] in {
  defm FAMIN : SIMDThreeSameVectorFP<0b1, 0b1, 0b011, "famin", AArch64famin>;
 } // End let Predicates = [HasNEON, HasFAMINMAX]
 
-let Uses = [FPMR, FPCR], Predicates = [HasFP8FMA] in {
+let Predicates = [HasFP8FMA], Uses = [FPMR, FPCR], mayLoad = 1 in {
  defm FMLALBlane : SIMDThreeSameVectorMLAIndex<0b0, "fmlalb">;
  defm FMLALTlane : SIMDThreeSameVectorMLAIndex<0b1, "fmlalt">;
  defm FMLALLBBlane : SIMDThreeSameVectorMLALIndex<0b0, 0b00, "fmlallbb">;
@@ -10366,12 +10366,12 @@ let Uses = [FPMR, FPCR], Predicates = [HasFP8FMA] in {
  defm FMLALLTBlane : SIMDThreeSameVectorMLALIndex<0b1, 0b00, "fmlalltb">;
  defm FMLALLTTlane : SIMDThreeSameVectorMLALIndex<0b1, 0b01, "fmlalltt">;
 
- defm FMLALB : SIMDThreeSameVectorMLA<0b0, "fmlalb">;
- defm FMLALT : SIMDThreeSameVectorMLA<0b1, "fmlalt">;
- defm FMLALLBB : SIMDThreeSameVectorMLAL<0b0, 0b00, "fmlallbb">;
- defm FMLALLBT : SIMDThreeSameVectorMLAL<0b0, 0b01, "fmlallbt">;
- defm FMLALLTB : SIMDThreeSameVectorMLAL<0b1, 0b00, "fmlalltb">;
- defm FMLALLTT : SIMDThreeSameVectorMLAL<0b1, 0b01, "fmlalltt">;
+ defm FMLALB : SIMDThreeSameVectorMLA<0b0, "fmlalb", int_aarch64_neon_fp8_fmlalb>;
+ defm FMLALT : SIMDThreeSameVectorMLA<0b1, "fmlalt", int_aarch64_neon_fp8_fmlalt>;
+ defm FMLALLBB : SIMDThreeSameVectorMLAL<0b0, 0b00, "fmlallbb", int_aarch64_neon_fp8_fmlallbb>;
+ defm FMLALLBT : SIMDThreeSameVectorMLAL<0b0, 0b01, "fmlallbt", int_aarch64_neon_fp8_fmlallbt>;
+ defm FMLALLTB : SIMDThreeSameVectorMLAL<0b1, 0b00, "fmlalltb", int_aarch64_neon_fp8_fmlalltb>;
+ defm FMLALLTT : SIMDThreeSameVectorMLAL<0b1, 0b01, "fmlalltt", int_aarch64_neon_fp8_fmlalltt>;
 } // End let Predicates = [HasFP8FMA]
 
 let Predicates = [HasFP8DOT2] in {
diff --git a/llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll b/llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll
new file mode 100644
index 00000000000000..008069ff63761f
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll
@@ -0,0 +1,56 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8fma < %s | FileCheck %s
+
+define <8 x half> @test_fmlalb(<8 x half> %d, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_fmlalb:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlalb v0.8h, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %r = call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.v8f16(<8 x half> %d, <16 x i8> %a, <16 x i8> %b)
+  ret <8 x half> %r
+}
+
+define <8 x half> @test_fmlalt(<8 x half> %d, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_fmlalt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlalt v0.8h, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %r = call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.v8f16(<8 x half> %d, <16 x i8> %a, <16 x i8> %b)
+  ret <8 x half> %r
+}
+
+define <4 x float> @test_fmlallbb(<4 x float> %d, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_fmlallbb:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlallbb v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %r = call <4 x float> @llvm.aarch64.neon.fp8.fmlallbb.v4f32(<4 x float> %d, <16 x i8> %a, <16 x i8> %b)
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_fmlallbt(<4 x float> %d, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_fmlallbt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlallbt v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %r = call <4 x float> @llvm.aarch64.neon.fp8.fmlallbt.v4f32(<4 x float> %d, <16 x i8> %a, <16 x i8> %b)
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_fmlalltb(<4 x float> %d, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_fmlalltb:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlalltb v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %r = call <4 x float> @llvm.aarch64.neon.fp8.fmlalltb.v4f32(<4 x float> %d, <16 x i8> %a, <16 x i8> %b)
+  ret <4 x float> %r
+}
+
+define <4 x float> @test_fmlalltt(<4 x float> %d, <16 x i8> %a, <16 x i8> %b) {
+; CHECK-LABEL: test_fmlalltt:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlalltt v0.4s, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %r = call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.v4f32(<4 x float> %d, <16 x i8> %a, <16 x i8> %b)
+  ret <4 x float> %r
+}

>From 159b7a74d751197fc986bbeaf1522f01f1ab2c11 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov at arm.com>
Date: Wed, 18 Dec 2024 10:52:34 +0000
Subject: [PATCH 11/11] [AArch64] Implement NEON FP8 intrinsics for fused
 multiply-add (indexed)

This patch adds the following intrinsics:

* Floating-point multiply-add long to half-precision (vector, by element)
    float16x8_t vmlalbq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
    float16x8_t vmlalbq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
    float16x8_t vmlaltq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
    float16x8_t vmlaltq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)

* Floating-point multiply-add long-long to single-precision (vector, by element)

    float32x4_t vmlallbbq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
    float32x4_t vmlallbbq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
    float32x4_t vmlallbtq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
    float32x4_t vmlallbtq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
    float32x4_t vmlalltbq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
    float32x4_t vmlalltbq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
    float32x4_t vmlallttq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
    float32x4_t vmlallttq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
---
 clang/include/clang/Basic/arm_neon.td         |  14 ++
 clang/lib/CodeGen/CGBuiltin.cpp               |  66 ++++-
 clang/lib/CodeGen/CodeGenFunction.h           |   6 +-
 .../fp8-intrinsics/acle_neon_fp8_fmla.c       | 228 ++++++++++++++++++
 .../acle_neon_fp8_fmla.c                      |  29 ++-
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  16 ++
 .../lib/Target/AArch64/AArch64InstrFormats.td |  24 +-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  16 +-
 llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll    |  54 +++++
 9 files changed, 429 insertions(+), 24 deletions(-)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 79de4a098a9dce..d513325e36ee2b 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2179,6 +2179,20 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8fma,neon" in {
   def VMLALLBT_F32_F8 : VInst<"vmlallbtq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
   def VMLALLTB_F32_F8 : VInst<"vmlalltbq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
   def VMLALLTT_F32_F8 : VInst<"vmlallttq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+
+  def VMLALB_F16_F8_LANE  : VInst<"vmlalbq_lane_f16_mf8_fpm",  "(>F)(>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_7,  0>]>;
+  def VMLALB_F16_F8_LANEQ : VInst<"vmlalbq_laneq_f16_mf8_fpm", "(>F)(>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_15, 0>]>;
+  def VMLALT_F16_F8_LANE  : VInst<"vmlaltq_lane_f16_mf8_fpm",  "(>F)(>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_7,  0>]>;
+  def VMLALT_F16_F8_LANEQ : VInst<"vmlaltq_laneq_f16_mf8_fpm", "(>F)(>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_15, 0>]>;
+
+  def VMLALLBB_F32_F8_LANE  : VInst<"vmlallbbq_lane_f32_mf8_fpm",  "(>>F)(>>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_7,  0>]>;
+  def VMLALLBB_F32_F8_LANEQ : VInst<"vmlallbbq_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_15, 0>]>;
+  def VMLALLBT_F32_F8_LANE  : VInst<"vmlallbtq_lane_f32_mf8_fpm",  "(>>F)(>>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_7,  0>]>;
+  def VMLALLBT_F32_F8_LANEQ : VInst<"vmlallbtq_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_15, 0>]>;
+  def VMLALLTB_F32_F8_LANE  : VInst<"vmlalltbq_lane_f32_mf8_fpm",  "(>>F)(>>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_7,  0>]>;
+  def VMLALLTB_F32_F8_LANEQ : VInst<"vmlalltbq_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_15, 0>]>;
+  def VMLALLTT_F32_F8_LANE  : VInst<"vmlallttq_lane_f32_mf8_fpm",  "(>>F)(>>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_7,  0>]>;
+  def VMLALLTT_F32_F8_LANEQ : VInst<"vmlallttq_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_15, 0>]>;
 }
 
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 87ee60d9729d04..c66da8d551cd37 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -6879,14 +6879,14 @@ Value *CodeGenFunction::EmitFP8NeonCall(unsigned IID,
 }
 
 llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
-    unsigned IID, bool ExtendLane, llvm::Type *RetTy,
+    unsigned IID, bool ExtendLaneArg, llvm::Type *RetTy,
     SmallVectorImpl<llvm::Value *> &Ops, const CallExpr *E, const char *name) {
 
   const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
                              RetTy->getPrimitiveSizeInBits();
   llvm::Type *Tys[] = {llvm::FixedVectorType::get(RetTy, ElemCount),
                        Ops[1]->getType()};
-  if (ExtendLane) {
+  if (ExtendLaneArg) {
     auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
     Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
                                         Builder.getInt64(0));
@@ -6894,6 +6894,21 @@ llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall(
   return EmitFP8NeonCall(IID, Tys, Ops, E, name);
 }
 
+llvm::Value *CodeGenFunction::EmitFP8NeonFMLACall(
+    unsigned IID, bool ExtendLaneArg, llvm::Type *RetTy,
+    SmallVectorImpl<llvm::Value *> &Ops, const CallExpr *E, const char *name) {
+
+  if (ExtendLaneArg) {
+    auto *VT = llvm::FixedVectorType::get(Int8Ty, 16);
+    Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2],
+                                        Builder.getInt64(0));
+  }
+  const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() /
+                             RetTy->getPrimitiveSizeInBits();
+  return EmitFP8NeonCall(IID, {llvm::FixedVectorType::get(RetTy, ElemCount)},
+                         Ops, E, name);
+}
+
 Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty,
                                             bool neg) {
   int SV = cast<ConstantInt>(V)->getSExtValue();
@@ -12839,7 +12854,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
 
   unsigned Int;
   bool ExtractLow = false;
-  bool ExtendLane = false;
+  bool ExtendLaneArg = false;
   switch (BuiltinID) {
   default: return nullptr;
   case NEON::BI__builtin_neon_vbsl_v:
@@ -14114,24 +14129,24 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
                                Ops, E, "fdot2");
   case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm:
-    ExtendLane = true;
+    ExtendLaneArg = true;
     LLVM_FALLTHROUGH;
   case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2_lane,
-                               ExtendLane, HalfTy, Ops, E, "fdot2_lane");
+                               ExtendLaneArg, HalfTy, Ops, E, "fdot2_lane");
   case NEON::BI__builtin_neon_vdot_f32_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4, false,
                                FloatTy, Ops, E, "fdot4");
   case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm:
-    ExtendLane = true;
+    ExtendLaneArg = true;
     LLVM_FALLTHROUGH;
   case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm:
   case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm:
     return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane,
-                               ExtendLane, FloatTy, Ops, E, "fdot4_lane");
+                               ExtendLaneArg, FloatTy, Ops, E, "fdot4_lane");
 
   case NEON::BI__builtin_neon_vmlalbq_f16_mf8_fpm:
     return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalb,
@@ -14157,7 +14172,42 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
     return EmitFP8NeonCall(Intrinsic::aarch64_neon_fp8_fmlalltt,
                            {llvm::FixedVectorType::get(FloatTy, 4)}, Ops, E,
                            "vmlall");
-
+  case NEON::BI__builtin_neon_vmlalbq_lane_f16_mf8_fpm:
+    ExtendLaneArg = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vmlalbq_laneq_f16_mf8_fpm:
+    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalb_lane,
+                               ExtendLaneArg, HalfTy, Ops, E, "vmlal_lane");
+  case NEON::BI__builtin_neon_vmlaltq_lane_f16_mf8_fpm:
+    ExtendLaneArg = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vmlaltq_laneq_f16_mf8_fpm:
+    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalt_lane,
+                               ExtendLaneArg, HalfTy, Ops, E, "vmlal_lane");
+  case NEON::BI__builtin_neon_vmlallbbq_lane_f32_mf8_fpm:
+    ExtendLaneArg = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vmlallbbq_laneq_f32_mf8_fpm:
+    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlallbb_lane,
+                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
+  case NEON::BI__builtin_neon_vmlallbtq_lane_f32_mf8_fpm:
+    ExtendLaneArg = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vmlallbtq_laneq_f32_mf8_fpm:
+    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlallbt_lane,
+                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
+  case NEON::BI__builtin_neon_vmlalltbq_lane_f32_mf8_fpm:
+    ExtendLaneArg = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vmlalltbq_laneq_f32_mf8_fpm:
+    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalltb_lane,
+                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
+  case NEON::BI__builtin_neon_vmlallttq_lane_f32_mf8_fpm:
+    ExtendLaneArg = true;
+    LLVM_FALLTHROUGH;
+  case NEON::BI__builtin_neon_vmlallttq_laneq_f32_mf8_fpm:
+    return EmitFP8NeonFMLACall(Intrinsic::aarch64_neon_fp8_fmlalltt_lane,
+                               ExtendLaneArg, FloatTy, Ops, E, "vmlall_lane");
   case NEON::BI__builtin_neon_vamin_f16:
   case NEON::BI__builtin_neon_vaminq_f16:
   case NEON::BI__builtin_neon_vamin_f32:
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 15bea6f95953ef..f8cc1f3ed9f950 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4655,7 +4655,11 @@ class CodeGenFunction : public CodeGenTypeCache {
                                   llvm::Type *Ty1, bool Extract,
                                   SmallVectorImpl<llvm::Value *> &Ops,
                                   const CallExpr *E, const char *name);
-  llvm::Value *EmitFP8NeonFDOTCall(unsigned IID, bool ExtendLane,
+  llvm::Value *EmitFP8NeonFDOTCall(unsigned IID, bool ExtendLaneArg,
+                                   llvm::Type *RetTy,
+                                   SmallVectorImpl<llvm::Value *> &Ops,
+                                   const CallExpr *E, const char *name);
+  llvm::Value *EmitFP8NeonFMLACall(unsigned IID, bool ExtendLaneArg,
                                    llvm::Type *RetTy,
                                    SmallVectorImpl<llvm::Value *> &Ops,
                                    const CallExpr *E, const char *name);
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c
index 528bbd69451865..851833d9a0018d 100644
--- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fmla.c
@@ -115,3 +115,231 @@ float32x4_t test_vmlalltb(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_
 float32x4_t test_vmlalltt(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
   return vmlallttq_f32_mf8_fpm(vd, vn, vm, fpm);
 }
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vmlalb_lane(
+// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLAL_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.lane.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 0)
+// CHECK-NEXT:    ret <8 x half> [[VMLAL_LANE1]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z16test_vmlalb_lane13__Float16x8_tu14__MFloat8x16_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLAL_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.lane.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 0)
+// CHECK-CXX-NEXT:    ret <8 x half> [[VMLAL_LANE1]]
+//
+float16x8_t test_vmlalb_lane(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpm) {
+  return vmlalbq_lane_f16_mf8_fpm(vd, vn, vm, 0, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vmlalb_laneq(
+// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLAL_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.lane.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 0)
+// CHECK-NEXT:    ret <8 x half> [[VMLAL_LANE1]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z17test_vmlalb_laneq13__Float16x8_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLAL_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.lane.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 0)
+// CHECK-CXX-NEXT:    ret <8 x half> [[VMLAL_LANE1]]
+//
+float16x8_t test_vmlalb_laneq(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlalbq_laneq_f16_mf8_fpm(vd, vn, vm, 0, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vmlalt_lane(
+// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLAL_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.lane.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 7)
+// CHECK-NEXT:    ret <8 x half> [[VMLAL_LANE1]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z16test_vmlalt_lane13__Float16x8_tu14__MFloat8x16_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLAL_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.lane.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 7)
+// CHECK-CXX-NEXT:    ret <8 x half> [[VMLAL_LANE1]]
+//
+float16x8_t test_vmlalt_lane(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpm) {
+  return vmlaltq_lane_f16_mf8_fpm(vd, vn, vm, 7, fpm);
+}
+
+// CHECK-LABEL: define dso_local <8 x half> @test_vmlalt_laneq(
+// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLAL_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.lane.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 15)
+// CHECK-NEXT:    ret <8 x half> [[VMLAL_LANE1]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z17test_vmlalt_laneq13__Float16x8_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLAL_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.lane.v8f16(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 15)
+// CHECK-CXX-NEXT:    ret <8 x half> [[VMLAL_LANE1]]
+//
+float16x8_t test_vmlalt_laneq(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlaltq_laneq_f16_mf8_fpm(vd, vn, vm, 15, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlallbb_lane(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbb.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 0)
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z18test_vmlallbb_lane13__Float32x4_tu14__MFloat8x16_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbb.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 0)
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+float32x4_t test_vmlallbb_lane(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpm) {
+  return vmlallbbq_lane_f32_mf8_fpm(vd, vn, vm, 0, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlallbb_laneq(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbb.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 0)
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z19test_vmlallbb_laneq13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbb.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 0)
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+float32x4_t test_vmlallbb_laneq(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlallbbq_laneq_f32_mf8_fpm(vd, vn, vm, 0, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlallbt_lane(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbt.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z18test_vmlallbt_lane13__Float32x4_tu14__MFloat8x16_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbt.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+float32x4_t test_vmlallbt_lane(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpm) {
+  return vmlallbtq_lane_f32_mf8_fpm(vd, vn, vm, 3, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlallbt_laneq(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbt.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z19test_vmlallbt_laneq13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbt.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+float32x4_t test_vmlallbt_laneq(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlallbtq_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlalltb_lane(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltb.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 7)
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z18test_vmlalltb_lane13__Float32x4_tu14__MFloat8x16_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltb.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 7)
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+float32x4_t test_vmlalltb_lane(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpm) {
+  return vmlalltbq_lane_f32_mf8_fpm(vd, vn, vm, 7, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlalltb_laneq(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltb.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z19test_vmlalltb_laneq13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltb.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+float32x4_t test_vmlalltb_laneq(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlalltbq_laneq_f32_mf8_fpm(vd, vn, vm, 7, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlalltt_lane(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 7)
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z18test_vmlalltt_lane13__Float32x4_tu14__MFloat8x16_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 7)
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+float32x4_t test_vmlalltt_lane(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpm) {
+  return vmlallttq_lane_f32_mf8_fpm(vd, vn, vm, 7, fpm);
+}
+
+// CHECK-LABEL: define dso_local <4 x float> @test_vmlalltt_laneq(
+// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 15)
+// CHECK-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z19test_vmlalltt_laneq13__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT:  [[ENTRY:.*:]]
+// CHECK-CXX-NEXT:    tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT:    [[VMLALL_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.lane.v4f32(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 15)
+// CHECK-CXX-NEXT:    ret <4 x float> [[VMLALL_LANE]]
+//
+float32x4_t test_vmlalltt_laneq(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm) {
+  return vmlallttq_laneq_f32_mf8_fpm(vd, vn, vm, 15, fpm);
+}
diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c
index fcdd14e583101e..4a507b08040fff 100644
--- a/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c
+++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fmla.c
@@ -5,7 +5,6 @@
 #include <arm_neon.h>
 
 void test_features(float16x8_t a, float32x4_t b, mfloat8x16_t u, fpm_t fpm) {
-
   (void) vmlalbq_f16_mf8_fpm(a, u, u, fpm);
   // expected-error@-1 {{'vmlalbq_f16_mf8_fpm' requires target feature 'fp8fma'}}
   (void) vmlaltq_f16_mf8_fpm(a, u, u, fpm);
@@ -20,3 +19,31 @@ void test_features(float16x8_t a, float32x4_t b, mfloat8x16_t u, fpm_t fpm) {
   // expected-error@-1 {{'vmlallttq_f32_mf8_fpm' requires target feature 'fp8fma'}}
 }
 
+void test_imm(float16x8_t d, float32x4_t c, mfloat8x16_t a, mfloat8x8_t b, fpm_t fpm) { // every call passes lane -1, which must be diagnosed
+  (void) vmlalbq_lane_f16_mf8_fpm(d, a, b, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
+  (void) vmlalbq_laneq_f16_mf8_fpm(d, a, a, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 15]}}
+  (void) vmlaltq_lane_f16_mf8_fpm(d, a, b, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
+  (void) vmlaltq_laneq_f16_mf8_fpm(d, a, a, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 15]}}
+  // Same out-of-range lane check for the variants that accumulate into float32x4_t.
+  (void) vmlallbbq_lane_f32_mf8_fpm(c, a, b, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
+  (void) vmlallbbq_laneq_f32_mf8_fpm(c, a, a, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 15]}}
+  (void) vmlallbtq_lane_f32_mf8_fpm(c, a, b, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
+  (void) vmlallbtq_laneq_f32_mf8_fpm(c, a, a, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 15]}}
+  (void) vmlalltbq_lane_f32_mf8_fpm(c, a, b, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
+  (void) vmlalltbq_laneq_f32_mf8_fpm(c, a, a, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 15]}}
+  (void) vmlallttq_lane_f32_mf8_fpm(c, a, b, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
+  (void) vmlallttq_laneq_f32_mf8_fpm(c, a, a, -1, fpm);
+  // expected-error@-1 {{argument value -1 is outside the valid range [0, 15]}}
+}
+
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 39ee36ed7ed09f..545336dcc37bb0 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1057,6 +1057,14 @@ def int_aarch64_st64bv0: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], dat
                              llvm_v16i8_ty],
                              [IntrReadMem, IntrInaccessibleMemOnly]>;
 
+  class AdvSIMD_FP8_FMLA_LANE_Intrinsic // signature shared by the fp8 fmlal*/fmlall* "_lane" intrinsics
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty], // result: overloaded vector type
+                            [LLVMMatchType<0>,   // operand 0: same type as the result
+                             llvm_v16i8_ty,      // operand 1: <16 x i8>
+                             llvm_v16i8_ty,      // operand 2: <16 x i8>
+                             llvm_i32_ty],       // operand 3: i32, must be an immediate (see ImmArg below)
+                             [IntrReadMem, IntrInaccessibleMemOnly, ImmArg<ArgIndex<3>>]>;
+
   def int_aarch64_neon_fp8_fmlalb : AdvSIMD_FP8_FMLA_Intrinsic;
   def int_aarch64_neon_fp8_fmlalt : AdvSIMD_FP8_FMLA_Intrinsic;
 
@@ -1064,6 +1072,14 @@ def int_aarch64_st64bv0: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], dat
   def int_aarch64_neon_fp8_fmlallbt : AdvSIMD_FP8_FMLA_Intrinsic;
   def int_aarch64_neon_fp8_fmlalltb : AdvSIMD_FP8_FMLA_Intrinsic;
   def int_aarch64_neon_fp8_fmlalltt : AdvSIMD_FP8_FMLA_Intrinsic;
+
+  def int_aarch64_neon_fp8_fmlalb_lane : AdvSIMD_FP8_FMLA_LANE_Intrinsic; // fmlalb/fmlalt lane forms: selected with v8f16 results by the AArch64 patterns
+  def int_aarch64_neon_fp8_fmlalt_lane : AdvSIMD_FP8_FMLA_LANE_Intrinsic;
+
+  def int_aarch64_neon_fp8_fmlallbb_lane : AdvSIMD_FP8_FMLA_LANE_Intrinsic; // fmlall{bb,bt,tb,tt} lane forms: selected with v4f32 results
+  def int_aarch64_neon_fp8_fmlallbt_lane : AdvSIMD_FP8_FMLA_LANE_Intrinsic;
+  def int_aarch64_neon_fp8_fmlalltb_lane : AdvSIMD_FP8_FMLA_LANE_Intrinsic;
+  def int_aarch64_neon_fp8_fmlalltt_lane : AdvSIMD_FP8_FMLA_LANE_Intrinsic;
 }
 
 def llvm_nxv1i1_ty  : LLVMType<nxv1i1>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 5f0a8f21919222..76b1c007084b26 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -9100,7 +9100,7 @@ class BaseSIMDThreeSameVectorIndexB<bit Q, bit U, bits<2> sz, bits<4> opc,
                                     RegisterOperand RegType,
                                     RegisterOperand RegType_lo>
   : BaseSIMDIndexedTied<Q, U, 0b0, sz, opc,
-                        RegType, RegType, RegType_lo, VectorIndexB,
+                        RegType, RegType, RegType_lo, VectorIndexB32b_timm,
                         asm, "", dst_kind, ".16b", ".b", []> {
 
   // idx = H:L:M
@@ -9109,14 +9109,24 @@ class BaseSIMDThreeSameVectorIndexB<bit Q, bit U, bits<2> sz, bits<4> opc,
   let Inst{21-19} = idx{2-0};
 }
 
-multiclass SIMDThreeSameVectorMLAIndex<bit Q, string asm> {
-  def v8f16 : BaseSIMDThreeSameVectorIndexB<Q, 0b0, 0b11, 0b0000, asm, ".8h",
-                                            V128, V128_0to7>;
+multiclass SIMDThreeSameVectorMLAIndex<bit Q, string asm, SDPatternOperator op> { // op: the operator matched by the pattern below
+  let Uses = [FPMR, FPCR], mayLoad = 1 in { // scoped to the instruction def; the Pat below sits outside this let
+    def v8f16 : BaseSIMDThreeSameVectorIndexB<Q, 0b0, 0b11, 0b0000, asm, ".8h",
+                                              V128, V128_0to7>;
+  }
+  // Select `op` (v8f16 accumulator, two v16i8 sources, immediate index) to the v8f16 instruction defined above.
+  def : Pat<(v8f16 (op (v8f16 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128_0to7:$Rm), VectorIndexB32b_timm:$Idx)),
+            (!cast<Instruction>(NAME # v8f16) $Rd, $Rn, $Rm, $Idx)>;
 }
 
-multiclass SIMDThreeSameVectorMLALIndex<bit Q, bits<2> sz, string asm> {
-  def v4f32 : BaseSIMDThreeSameVectorIndexB<Q, 0b1, sz, 0b1000, asm, ".4s",
-                                            V128, V128_0to7>;
+multiclass SIMDThreeSameVectorMLALIndex<bit Q, bits<2> sz, string asm, SDPatternOperator op> { // op: the operator matched by the pattern below
+  let Uses = [FPMR, FPCR], mayLoad = 1 in { // scoped to the instruction def; the Pat below sits outside this let
+    def v4f32 : BaseSIMDThreeSameVectorIndexB<Q, 0b1, sz, 0b1000, asm, ".4s",
+                                              V128, V128_0to7>;
+  }
+  // Select `op` (v4f32 accumulator, two v16i8 sources, immediate index) to the v4f32 instruction defined above.
+  def : Pat<(v4f32 (op (v4f32 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128_0to7:$Rm), VectorIndexB32b_timm:$Idx)),
+            (!cast<Instruction>(NAME # v4f32) $Rd, $Rn, $Rm, $Idx)>;
 }
 
 //----------------------------------------------------------------------------
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1a550f4be6db5a..1e405612b6a613 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -10358,14 +10358,16 @@ let Predicates = [HasNEON, HasFAMINMAX] in {
  defm FAMIN : SIMDThreeSameVectorFP<0b1, 0b1, 0b011, "famin", AArch64famin>;
 } // End let Predicates = [HasNEON, HasFAMINMAX]
 
-let Predicates = [HasFP8FMA], Uses = [FPMR, FPCR], mayLoad = 1 in {
- defm FMLALBlane : SIMDThreeSameVectorMLAIndex<0b0, "fmlalb">;
- defm FMLALTlane : SIMDThreeSameVectorMLAIndex<0b1, "fmlalt">;
- defm FMLALLBBlane : SIMDThreeSameVectorMLALIndex<0b0, 0b00, "fmlallbb">;
- defm FMLALLBTlane : SIMDThreeSameVectorMLALIndex<0b0, 0b01, "fmlallbt">;
- defm FMLALLTBlane : SIMDThreeSameVectorMLALIndex<0b1, 0b00, "fmlalltb">;
- defm FMLALLTTlane : SIMDThreeSameVectorMLALIndex<0b1, 0b01, "fmlalltt">;
+let Predicates = [HasFP8FMA] in { // Uses = [FPMR, FPCR] and mayLoad are set on the instruction defs inside these multiclasses
+ defm FMLALBlane : SIMDThreeSameVectorMLAIndex<0b0, "fmlalb", int_aarch64_neon_fp8_fmlalb_lane>;
+ defm FMLALTlane : SIMDThreeSameVectorMLAIndex<0b1, "fmlalt", int_aarch64_neon_fp8_fmlalt_lane>;
+ defm FMLALLBBlane : SIMDThreeSameVectorMLALIndex<0b0, 0b00, "fmlallbb", int_aarch64_neon_fp8_fmlallbb_lane>;
+ defm FMLALLBTlane : SIMDThreeSameVectorMLALIndex<0b0, 0b01, "fmlallbt", int_aarch64_neon_fp8_fmlallbt_lane>;
+ defm FMLALLTBlane : SIMDThreeSameVectorMLALIndex<0b1, 0b00, "fmlalltb", int_aarch64_neon_fp8_fmlalltb_lane>;
+ defm FMLALLTTlane : SIMDThreeSameVectorMLALIndex<0b1, 0b01, "fmlalltt", int_aarch64_neon_fp8_fmlalltt_lane>;
+}
 
+let Predicates = [HasFP8FMA], Uses = [FPMR, FPCR], mayLoad = 1 in {
  defm FMLALB : SIMDThreeSameVectorMLA<0b0, "fmlalb", int_aarch64_neon_fp8_fmlalb>;
  defm FMLALT : SIMDThreeSameVectorMLA<0b1, "fmlalt", int_aarch64_neon_fp8_fmlalt>;
  defm FMLALLBB : SIMDThreeSameVectorMLAL<0b0, 0b00, "fmlallbb", int_aarch64_neon_fp8_fmlallbb>;
diff --git a/llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll b/llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll
index 008069ff63761f..60957a7c0f2f41 100644
--- a/llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll
+++ b/llvm/test/CodeGen/AArch64/fp8-neon-fmla.ll
@@ -54,3 +54,57 @@ define <4 x float> @test_fmlalltt(<4 x float> %d, <16 x i8> %a, <16 x i8> %b) {
   %r = call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.v4f32(<4 x float> %d, <16 x i8> %a, <16 x i8> %b)
   ret <4 x float> %r
 }
+
+define <8 x half> @test_fmlalb_lane(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fmlalb_lane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlalb v0.8h, v1.16b, v2.b[0]
+; CHECK-NEXT:    ret
+  %res = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalb.lane(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm, i32 0)
+  ret <8 x half> %res
+}
+
+define <8 x half> @test_fmlalt_lane(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fmlalt_lane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlalt v0.8h, v1.16b, v2.b[4]
+; CHECK-NEXT:    ret
+  %res = tail call <8 x half> @llvm.aarch64.neon.fp8.fmlalt.lane(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm, i32 4)
+  ret <8 x half> %res
+}
+
+define <4 x float> @test_fmlallbb_lane(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fmlallbb_lane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlallbb v0.4s, v1.16b, v2.b[7]
+; CHECK-NEXT:    ret
+  %res = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbb.lane(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm, i32 7)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_fmlallbt_lane(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fmlallbt_lane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlallbt v0.4s, v1.16b, v2.b[10]
+; CHECK-NEXT:    ret
+  %res = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlallbt.lane(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm, i32 10)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_fmlalltb_lane(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fmlalltb_lane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlalltb v0.4s, v1.16b, v2.b[13]
+; CHECK-NEXT:    ret
+  %res = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltb.lane(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm, i32 13)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_fmlalltt_lane(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm) {
+; CHECK-LABEL: test_fmlalltt_lane:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    fmlalltt v0.4s, v1.16b, v2.b[15]
+; CHECK-NEXT:    ret
+  %res = tail call <4 x float> @llvm.aarch64.neon.fp8.fmlalltt.lane(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm, i32 15)
+  ret <4 x float> %res
+}