[clang] [llvm] [X86][AMX] Support AMX-FP8 (PR #113850)

Feng Zou via cfe-commits cfe-commits at lists.llvm.org
Tue Oct 29 19:33:16 PDT 2024


https://github.com/fzou1 updated https://github.com/llvm/llvm-project/pull/113850

>From fd570cb8d41f5f94b61d515985245fc81aab633e Mon Sep 17 00:00:00 2001
From: Feng Zou <feng.zou at intel.com>
Date: Thu, 24 Oct 2024 21:56:48 +0800
Subject: [PATCH 1/5] Support AMX-FP8

Ref.: https://cdrdv2.intel.com/v1/dl/getContent/671368
---
 clang/docs/ReleaseNotes.rst                   |  1 +
 clang/include/clang/Basic/BuiltinsX86_64.def  |  6 +++
 clang/include/clang/Driver/Options.td         |  2 +
 clang/lib/Basic/Targets/X86.cpp               |  6 +++
 clang/lib/Basic/Targets/X86.h                 |  1 +
 clang/lib/Headers/CMakeLists.txt              |  1 +
 clang/lib/Headers/amxfp8intrin.h              | 24 ++++++++++++
 clang/lib/Headers/immintrin.h                 |  4 ++
 clang/lib/Sema/SemaX86.cpp                    |  4 ++
 clang/test/CodeGen/X86/amx_fp8.c              | 27 +++++++++++++
 clang/test/CodeGen/X86/amx_fp8_errors.c       | 10 +++++
 clang/test/CodeGen/X86/amx_fp8_inline_asm.c   | 32 +++++++++++++++
 llvm/include/llvm/IR/IntrinsicsX86.td         | 17 ++++++++
 .../llvm/TargetParser/X86TargetParser.def     |  1 +
 llvm/lib/Target/X86/X86.td                    |  3 ++
 llvm/lib/Target/X86/X86ISelLowering.cpp       | 23 +++++++++++
 llvm/lib/Target/X86/X86InstrAMX.td            | 39 +++++++++++++++++++
 llvm/lib/Target/X86/X86InstrPredicates.td     |  1 +
 llvm/lib/TargetParser/Host.cpp                |  4 ++
 llvm/lib/TargetParser/X86TargetParser.cpp     |  1 +
 llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll   | 20 ++++++++++
 .../Disassembler/X86/AMX/x86-64-amx-fp8.txt   | 34 ++++++++++++++++
 llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s     | 33 ++++++++++++++++
 llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s   | 33 ++++++++++++++++
 24 files changed, 327 insertions(+)
 create mode 100644 clang/lib/Headers/amxfp8intrin.h
 create mode 100644 clang/test/CodeGen/X86/amx_fp8.c
 create mode 100644 clang/test/CodeGen/X86/amx_fp8_errors.c
 create mode 100644 clang/test/CodeGen/X86/amx_fp8_inline_asm.c
 create mode 100644 llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll
 create mode 100644 llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s
 create mode 100644 llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 6a95337815174b..da0ab888ce200d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -642,6 +642,7 @@ X86 Support
 
 - Supported intrinsics for ``MOVRS AND AVX10.2``.
   * Supported intrinsics of ``_mm(256|512)_(mask(z))_loadrs_epi(8|16|32|64)``.
+- Support ISA of ``AMX-FP8``.
 
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index e1e613560167ac..68904ae8abcd15 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -155,6 +155,12 @@ TARGET_BUILTIN(__builtin_ia32_cmpccxadd64, "SLLiv*SLLiSLLiIi", "n", "cmpccxadd")
 // AMX_FP16 FP16
 TARGET_BUILTIN(__builtin_ia32_tdpfp16ps, "vIUcIUcIUc", "n", "amx-fp16")
 
+// AMX FP8
+TARGET_BUILTIN(__builtin_ia32_tdpbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdpbhf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdphbf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+TARGET_BUILTIN(__builtin_ia32_tdphf8ps, "vIUcUIcUIc", "n", "amx-fp8")
+
 // RAO-INT
 TARGET_BUILTIN(__builtin_ia32_aadd64, "vv*SOi", "n", "raoint")
 TARGET_BUILTIN(__builtin_ia32_aand64, "vv*SOi", "n", "raoint")
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 5df6ddd5e6a0c5..bbada0834526d7 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -6290,6 +6290,8 @@ def mamx_fp16 : Flag<["-"], "mamx-fp16">, Group<m_x86_Features_Group>;
 def mno_amx_fp16 : Flag<["-"], "mno-amx-fp16">, Group<m_x86_Features_Group>;
 def mamx_int8 : Flag<["-"], "mamx-int8">, Group<m_x86_Features_Group>;
 def mno_amx_int8 : Flag<["-"], "mno-amx-int8">, Group<m_x86_Features_Group>;
+def mamx_fp8 : Flag<["-"], "mamx-fp8">, Group<m_x86_Features_Group>;
+def mno_amx_fp8 : Flag<["-"], "mno-amx-fp8">, Group<m_x86_Features_Group>;
 def mamx_tile : Flag<["-"], "mamx-tile">, Group<m_x86_Features_Group>;
 def mno_amx_tile : Flag<["-"], "mno-amx-tile">, Group<m_x86_Features_Group>;
 def mcmpccxadd : Flag<["-"], "mcmpccxadd">, Group<m_x86_Features_Group>;
diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index d067ec218b5270..b95261c39a5993 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -420,6 +420,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
       HasAMXTILE = true;
     } else if (Feature == "+amx-complex") {
       HasAMXCOMPLEX = true;
+    } else if (Feature == "+amx-fp8") {
+      HasAMXFP8 = true;
     } else if (Feature == "+cmpccxadd") {
       HasCMPCCXADD = true;
     } else if (Feature == "+raoint") {
@@ -939,6 +941,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
     Builder.defineMacro("__AMX_FP16__");
   if (HasAMXCOMPLEX)
     Builder.defineMacro("__AMX_COMPLEX__");
+  if (HasAMXFP8)
+    Builder.defineMacro("__AMX_FP8__");
   if (HasCMPCCXADD)
     Builder.defineMacro("__CMPCCXADD__");
   if (HasRAOINT)
@@ -1069,6 +1073,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
       .Case("amx-fp16", true)
       .Case("amx-int8", true)
       .Case("amx-tile", true)
+      .Case("amx-fp8", true)
       .Case("avx", true)
       .Case("avx10.1-256", true)
       .Case("avx10.1-512", true)
@@ -1187,6 +1192,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
       .Case("amx-fp16", HasAMXFP16)
       .Case("amx-int8", HasAMXINT8)
       .Case("amx-tile", HasAMXTILE)
+      .Case("amx-fp8", HasAMXFP8)
       .Case("avx", SSELevel >= AVX)
       .Case("avx10.1-256", HasAVX10_1)
       .Case("avx10.1-512", HasAVX10_1_512)
diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h
index e8aad3ec5a74b1..a1b2a0cec209ab 100644
--- a/clang/lib/Basic/Targets/X86.h
+++ b/clang/lib/Basic/Targets/X86.h
@@ -157,6 +157,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
   bool HasAMXINT8 = false;
   bool HasAMXBF16 = false;
   bool HasAMXCOMPLEX = false;
+  bool HasAMXFP8 = false;
   bool HasSERIALIZE = false;
   bool HasTSXLDTRK = false;
   bool HasUSERMSR = false;
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index e97953d87a2ff9..142cd01ac5aec0 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -149,6 +149,7 @@ set(x86_files
   amxcomplexintrin.h
   amxfp16intrin.h
   amxintrin.h
+  amxfp8intrin.h
   avx10_2_512bf16intrin.h
   avx10_2_512convertintrin.h
   avx10_2_512minmaxintrin.h
diff --git a/clang/lib/Headers/amxfp8intrin.h b/clang/lib/Headers/amxfp8intrin.h
new file mode 100644
index 00000000000000..d187b5f0421bbb
--- /dev/null
+++ b/clang/lib/Headers/amxfp8intrin.h
@@ -0,0 +1,24 @@
+/*===---------- amxfp8intrin.h - AMX intrinsics -*- C++ -*------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxfp8intrin.h> directly; include <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMXFP8INTRIN_H
+#define __AMXFP8INTRIN_H
+#ifdef __x86_64__
+
+#define _tile_dpbf8ps __builtin_ia32_tdpbf8ps
+#define _tile_dpbhf8ps __builtin_ia32_tdpbhf8ps
+#define _tile_dphbf8ps __builtin_ia32_tdphbf8ps
+#define _tile_dphf8ps __builtin_ia32_tdphf8ps
+
+#endif /* __x86_64__ */
+#endif /* __AMXFP8INTRIN_H */
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index 5f296d0a3324d0..5529f99ab0c6b6 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -648,6 +648,10 @@ _storebe_i64(void * __P, long long __D) {
 #include <amxcomplexintrin.h>
 #endif
 
+#if !defined(__SCE__) || __has_feature(modules) || defined(__AMX_FP8__)
+#include <amxfp8intrin.h>
+#endif
+
 #if !defined(__SCE__) || __has_feature(modules) ||                             \
     defined(__AVX512VP2INTERSECT__)
 #include <avx512vp2intersectintrin.h>
diff --git a/clang/lib/Sema/SemaX86.cpp b/clang/lib/Sema/SemaX86.cpp
index 6a4d78f0ca9084..0e43b030e70d41 100644
--- a/clang/lib/Sema/SemaX86.cpp
+++ b/clang/lib/Sema/SemaX86.cpp
@@ -640,6 +640,10 @@ bool SemaX86::CheckBuiltinTileArguments(unsigned BuiltinID, CallExpr *TheCall) {
   case X86::BI__builtin_ia32_tdpfp16ps:
   case X86::BI__builtin_ia32_tcmmimfp16ps:
   case X86::BI__builtin_ia32_tcmmrlfp16ps:
+  case X86::BI__builtin_ia32_tdpbf8ps:
+  case X86::BI__builtin_ia32_tdpbhf8ps:
+  case X86::BI__builtin_ia32_tdphbf8ps:
+  case X86::BI__builtin_ia32_tdphf8ps:
     return CheckBuiltinTileRangeAndDuplicate(TheCall, {0, 1, 2});
   }
 }
diff --git a/clang/test/CodeGen/X86/amx_fp8.c b/clang/test/CodeGen/X86/amx_fp8.c
new file mode 100644
index 00000000000000..9c79514f891299
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_fp8.c
@@ -0,0 +1,27 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown  -target-feature +amx-fp8  \
+// RUN: -emit-llvm -o - -Werror -pedantic | FileCheck %s
+#include <immintrin.h>
+
+void test_amx(void *data) {
+  //CHECK-LABEL: @test_amx
+  //CHECK: call void @llvm.x86.tdpbf8ps(i8 1, i8 2, i8 3)
+  _tile_dpbf8ps(1, 2, 3);
+}
+
+void test_amx2(void *data) {
+  //CHECK-LABEL: @test_amx2
+  //CHECK: call void @llvm.x86.tdpbhf8ps(i8 1, i8 2, i8 3)
+  _tile_dpbhf8ps(1, 2, 3);
+}
+
+void test_amx3(void *data) {
+  //CHECK-LABEL: @test_amx3
+  //CHECK: call void @llvm.x86.tdphbf8ps(i8 1, i8 2, i8 3)
+  _tile_dphbf8ps(1, 2, 3);
+}
+
+void test_amx4(void *data) {
+  //CHECK-LABEL: @test_amx4
+  //CHECK: call void @llvm.x86.tdphf8ps(i8 1, i8 2, i8 3)
+  _tile_dphf8ps(1, 2, 3);
+}
diff --git a/clang/test/CodeGen/X86/amx_fp8_errors.c b/clang/test/CodeGen/X86/amx_fp8_errors.c
new file mode 100644
index 00000000000000..77cbd34905b8ba
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_fp8_errors.c
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown -target-feature +amx-tile -target-feature +amx-fp8 -verify
+
+#include <immintrin.h>
+
+void test_amx(void *data) {
+  _tile_dpbf8ps(4, 3, 3); // expected-error {{tile arguments must refer to different tiles}}
+  _tile_dpbhf8ps(4, 3, 3); // expected-error {{tile arguments must refer to different tiles}}
+  _tile_dphbf8ps(4, 3, 3); // expected-error {{tile arguments must refer to different tiles}}
+  _tile_dphf8ps(4, 3, 3); // expected-error {{tile arguments must refer to different tiles}}
+}
diff --git a/clang/test/CodeGen/X86/amx_fp8_inline_asm.c b/clang/test/CodeGen/X86/amx_fp8_inline_asm.c
new file mode 100644
index 00000000000000..49331bd9d368ab
--- /dev/null
+++ b/clang/test/CodeGen/X86/amx_fp8_inline_asm.c
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 %s -ffreestanding -triple=x86_64-unknown-unknown  -target-feature +amx-fp8 -emit-llvm -o - -Wall -Werror -pedantic | FileCheck %s
+
+void f_tilemul(short a)
+{
+  //CHECK:  call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0   \0A\09tileloadd 0(%rdx,%r14,4), %tmm6   \0A\09tdpbf8ps %tmm6, %tmm0, %tmm7    \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"()
+  __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0   \n\t"
+                    "tileloadd 0(%%rdx,%%r14,4), %%tmm6   \n\t"
+                    "tdpbf8ps %%tmm6, %%tmm0, %%tmm7    \n\t"
+                    "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t"
+          ::: "memory", "tmm0", "tmm6", "tmm7");
+
+  //CHECK:  call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0   \0A\09tileloadd 0(%rdx,%r14,4), %tmm6   \0A\09tdpbhf8ps %tmm6, %tmm0, %tmm7    \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"()
+  __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0   \n\t"
+                    "tileloadd 0(%%rdx,%%r14,4), %%tmm6   \n\t"
+                    "tdpbhf8ps %%tmm6, %%tmm0, %%tmm7    \n\t"
+                    "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t"
+          ::: "memory", "tmm0", "tmm6", "tmm7");
+
+  //CHECK:  call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0   \0A\09tileloadd 0(%rdx,%r14,4), %tmm6   \0A\09tdphbf8ps %tmm6, %tmm0, %tmm7    \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"()
+  __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0   \n\t"
+                    "tileloadd 0(%%rdx,%%r14,4), %%tmm6   \n\t"
+                    "tdphbf8ps %%tmm6, %%tmm0, %%tmm7    \n\t"
+                    "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t"
+          ::: "memory", "tmm0", "tmm6", "tmm7");
+
+  //CHECK:  call void asm sideeffect "tileloadd 0(%rsi,%r13,4), %tmm0   \0A\09tileloadd 0(%rdx,%r14,4), %tmm6   \0A\09tdphf8ps %tmm6, %tmm0, %tmm7    \0A\09tilestored %tmm7, 0(%r12,%r15,4) \0A\09", "~{memory},~{tmm0},~{tmm6},~{tmm7},~{dirflag},~{fpsr},~{flags}"()
+  __asm__ volatile ("tileloadd 0(%%rsi,%%r13,4), %%tmm0   \n\t"
+                    "tileloadd 0(%%rdx,%%r14,4), %%tmm6   \n\t"
+                    "tdphf8ps %%tmm6, %%tmm0, %%tmm7    \n\t"
+                    "tilestored %%tmm7, 0(%%r12,%%r15,4) \n\t"
+          ::: "memory", "tmm0", "tmm6", "tmm7");
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td
index d0083017fb9383..6530051e0d3e9f 100644
--- a/llvm/include/llvm/IR/IntrinsicsX86.td
+++ b/llvm/include/llvm/IR/IntrinsicsX86.td
@@ -5994,6 +5994,23 @@ let TargetPrefix = "x86" in {
                         [llvm_i16_ty, llvm_i16_ty, llvm_i16_ty,
                          llvm_x86amx_ty, llvm_x86amx_ty,
                          llvm_x86amx_ty], []>;
+
+  def int_x86_tdpbf8ps : ClangBuiltin<"__builtin_ia32_tdpbf8ps">,
+              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
+                        [ImmArg<ArgIndex<0>>,
+                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+  def int_x86_tdpbhf8ps : ClangBuiltin<"__builtin_ia32_tdpbhf8ps">,
+              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
+                        [ImmArg<ArgIndex<0>>,
+                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+  def int_x86_tdphbf8ps : ClangBuiltin<"__builtin_ia32_tdphbf8ps">,
+              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
+                        [ImmArg<ArgIndex<0>>,
+                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+  def int_x86_tdphf8ps : ClangBuiltin<"__builtin_ia32_tdphf8ps">,
+              Intrinsic<[], [llvm_i8_ty, llvm_i8_ty, llvm_i8_ty],
+                        [ImmArg<ArgIndex<0>>,
+                        ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index 073e19f8187c65..19e8e0013ef6a0 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -264,6 +264,7 @@ X86_FEATURE_COMPAT(AVX10_2_512,     "avx10.2-512",            0)
 //FIXME: make MOVRS _COMPAT defined when gcc landed relate patch.
 X86_FEATURE       (MOVRS,           "movrs")
 X86_FEATURE       (ZU,              "zu")
+X86_FEATURE       (AMX_FP8,         "amx-fp8")
 // These features aren't really CPU features, but the frontend can set them.
 X86_FEATURE       (RETPOLINE_EXTERNAL_THUNK,    "retpoline-external-thunk")
 X86_FEATURE       (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 6bedf9e1d13ac3..79c60402a49f0a 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -270,6 +270,9 @@ def FeatureAMXFP16     : SubtargetFeature<"amx-fp16", "HasAMXFP16", "true",
 def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true",
                                          "Support AMX-COMPLEX instructions",
                                          [FeatureAMXTILE]>;
+def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true",
+                                         "Support AMX-FP8 instructions",
+                                         [FeatureAMXTILE]>;
 def FeatureCMPCCXADD : SubtargetFeature<"cmpccxadd", "HasCMPCCXADD", "true",
                                         "Support CMPCCXADD instructions">;
 def FeatureRAOINT : SubtargetFeature<"raoint", "HasRAOINT", "true",
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a6d77873ec2901..5a7313ac3e1234 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37503,6 +37503,29 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MI.eraseFromParent(); // The pseudo is gone now.
     return BB;
   }
+  case X86::PTDPBF8PS:
+  case X86::PTDPBHF8PS:
+  case X86::PTDPHBF8PS:
+  case X86::PTDPHF8PS: {
+    const DebugLoc &DL = MI.getDebugLoc();
+    unsigned Opc;
+    switch(MI.getOpcode()) {
+      default: llvm_unreachable("Unexpected instruction!");
+      case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
+      case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
+      case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
+      case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
+    }
+
+    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
+    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
+    MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
+    MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
+
+    MI.eraseFromParent();
+    return BB;
+  }
   }
 }
 
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index 99deacc811a170..d0c91ab7b5e696 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -267,3 +267,42 @@ let Predicates = [HasAMXCOMPLEX, In64BitMode] in {
     }
   } // SchedRW = [WriteSystem]
 }
+
+// AMX-FP8
+let Predicates = [HasAMXFP8, In64BitMode] in {
+  let SchedRW = [WriteSystem] in {
+    let Constraints = "$src1 = $dst" in {
+      class AMX_FP8_BASE<bits<8> Opcode, string Opstr> :
+        I<Opcode, MRMSrcReg4VOp3, (outs TILE:$dst),
+        (ins TILE:$src1, TILE:$src2, TILE:$src3),
+        !strconcat(Opstr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+        []>, VEX, VVVV;
+    }
+
+    def TDPBF8PS : AMX_FP8_BASE<0xfd, "tdpbf8ps">, T_MAP5, PS;
+    def TDPBHF8PS : AMX_FP8_BASE<0xfd, "tdpbhf8ps">, T_MAP5, XD;
+    def TDPHBF8PS : AMX_FP8_BASE<0xfd, "tdphbf8ps">, T_MAP5, XS;
+    def TDPHF8PS : AMX_FP8_BASE<0xfd, "tdphf8ps">, T_MAP5, PD;
+
+    let usesCustomInserter = 1 in {
+      // Pseudo instructions, using immediates instead of tile registers.
+      // To be translated to the actual instructions in X86ISelLowering.cpp
+      def PTDPBF8PS : PseudoI<(outs), (ins u8imm:$src1,
+                                       u8imm:$src2, u8imm:$src3),
+                                       [(int_x86_tdpbf8ps timm:$src1,
+                                         timm:$src2, timm:$src3)]>;
+      def PTDPBHF8PS : PseudoI<(outs), (ins u8imm:$src1,
+                                        u8imm:$src2, u8imm:$src3),
+                                        [(int_x86_tdpbhf8ps timm:$src1,
+                                          timm:$src2, timm:$src3)]>;
+      def PTDPHBF8PS : PseudoI<(outs), (ins u8imm:$src1,
+                                        u8imm:$src2, u8imm:$src3),
+                                        [(int_x86_tdphbf8ps timm:$src1,
+                                          timm:$src2, timm:$src3)]>;
+      def PTDPHF8PS : PseudoI<(outs), (ins u8imm:$src1,
+                                       u8imm:$src2, u8imm:$src3),
+                                       [(int_x86_tdphf8ps timm:$src1,
+                                         timm:$src2, timm:$src3)]>;
+    }
+  }
+}
diff --git a/llvm/lib/Target/X86/X86InstrPredicates.td b/llvm/lib/Target/X86/X86InstrPredicates.td
index 7fb566fba51818..5b659d3b072dca 100644
--- a/llvm/lib/Target/X86/X86InstrPredicates.td
+++ b/llvm/lib/Target/X86/X86InstrPredicates.td
@@ -183,6 +183,7 @@ def HasAMXTILE   : Predicate<"Subtarget->hasAMXTILE()">;
 def HasAMXBF16   : Predicate<"Subtarget->hasAMXBF16()">;
 def HasAMXINT8   : Predicate<"Subtarget->hasAMXINT8()">;
 def HasAMXCOMPLEX : Predicate<"Subtarget->hasAMXCOMPLEX()">;
+def HasAMXFP8    : Predicate<"Subtarget->hasAMXFP8()">;
 def HasUINTR     : Predicate<"Subtarget->hasUINTR()">;
 def HasUSERMSR   : Predicate<"Subtarget->hasUSERMSR()">;
 def HasCRC32     : Predicate<"Subtarget->hasCRC32()">;
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 5c4e3a9dc52b0f..78991b3936505c 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1876,6 +1876,10 @@ const StringMap<bool> sys::getHostCPUFeatures() {
       MaxLevel >= 0x19 && !getX86CpuIDAndInfo(0x19, &EAX, &EBX, &ECX, &EDX);
   Features["widekl"] = HasLeaf7 && HasLeaf19 && ((EBX >> 2) & 1);
 
+  bool HasLeaf1E =
+      MaxLevel >= 0x1e && !getX86CpuIDAndInfo(0x1e, &EAX, &EBX, &ECX, &EDX);
+  Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave;
+
   bool HasLeaf24 =
       MaxLevel >= 0x24 && !getX86CpuIDAndInfo(0x24, &EAX, &EBX, &ECX, &EDX);
 
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index 586df5748aa822..7d60b81d4bb1c3 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -598,6 +598,7 @@ constexpr FeatureBitset ImpliedFeaturesAMX_BF16 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_FP16 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesAMX_COMPLEX = FeatureAMX_TILE;
+constexpr FeatureBitset ImpliedFeaturesAMX_FP8 = FeatureAMX_TILE;
 constexpr FeatureBitset ImpliedFeaturesHRESET = {};
 
 constexpr FeatureBitset ImpliedFeaturesPREFETCHI = {};
diff --git a/llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll b/llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll
new file mode 100644
index 00000000000000..f5d3f6ec9ec298
--- /dev/null
+++ b/llvm/test/CodeGen/X86/amx_fp8_intrinsics.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+amx-fp8 | FileCheck %s
+
+; CHECK-LABEL: test_amx:
+; CHECK:       # %bb.0:
+; CHECK:    tdpbf8ps        %tmm3, %tmm2, %tmm1
+; CHECK:    tdpbhf8ps        %tmm3, %tmm2, %tmm1
+; CHECK:    tdphbf8ps        %tmm3, %tmm2, %tmm1
+; CHECK:    tdphf8ps        %tmm3, %tmm2, %tmm1
+
+define void @test_amx(){
+call void @llvm.x86.tdpbf8ps(i8 1, i8 2, i8 3)
+call void @llvm.x86.tdpbhf8ps(i8 1, i8 2, i8 3)
+call void @llvm.x86.tdphbf8ps(i8 1, i8 2, i8 3)
+call void @llvm.x86.tdphf8ps(i8 1, i8 2, i8 3)
+ret void
+}
+declare void @llvm.x86.tdpbf8ps(i8 %tile0, i8 %tile1, i8 %tile2)
+declare void @llvm.x86.tdpbhf8ps(i8 %tile0, i8 %tile1, i8 %tile2)
+declare void @llvm.x86.tdphbf8ps(i8 %tile0, i8 %tile1, i8 %tile2)
+declare void @llvm.x86.tdphf8ps(i8 %tile0, i8 %tile1, i8 %tile2)
diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt
new file mode 100644
index 00000000000000..e714a52d2c31a7
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt
@@ -0,0 +1,34 @@
+# RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT
+# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL
+
+# ATT:   tdpbf8ps %tmm4, %tmm5, %tmm6
+# INTEL: tdpbf8ps tmm6, tmm5, tmm4
+0xc4,0xe5,0x58,0xfd,0xf5
+
+# ATT:   tdpbf8ps %tmm1, %tmm2, %tmm3
+# INTEL: tdpbf8ps tmm3, tmm2, tmm1
+0xc4,0xe5,0x70,0xfd,0xda
+
+# ATT:   tdpbhf8ps %tmm4, %tmm5, %tmm6
+# INTEL: tdpbhf8ps tmm6, tmm5, tmm4
+0xc4,0xe5,0x5b,0xfd,0xf5
+
+# ATT:   tdpbhf8ps %tmm1, %tmm2, %tmm3
+# INTEL: tdpbhf8ps tmm3, tmm2, tmm1
+0xc4,0xe5,0x73,0xfd,0xda
+
+# ATT:   tdphbf8ps %tmm4, %tmm5, %tmm6
+# INTEL: tdphbf8ps tmm6, tmm5, tmm4
+0xc4,0xe5,0x5a,0xfd,0xf5
+
+# ATT:   tdphbf8ps %tmm1, %tmm2, %tmm3
+# INTEL: tdphbf8ps tmm3, tmm2, tmm1
+0xc4,0xe5,0x72,0xfd,0xda
+
+# ATT:   tdphf8ps %tmm4, %tmm5, %tmm6
+# INTEL: tdphf8ps tmm6, tmm5, tmm4
+0xc4,0xe5,0x59,0xfd,0xf5
+
+# ATT:   tdphf8ps %tmm1, %tmm2, %tmm3
+# INTEL: tdphf8ps tmm3, tmm2, tmm1
+0xc4,0xe5,0x71,0xfd,0xda
diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s b/llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s
new file mode 100644
index 00000000000000..904539ec4917fe
--- /dev/null
+++ b/llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s
@@ -0,0 +1,33 @@
+// RUN: llvm-mc -triple x86_64 --show-encoding %s | FileCheck %s
+
+// CHECK: tdpbf8ps %tmm4, %tmm5, %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x58,0xfd,0xf5]
+          tdpbf8ps %tmm4, %tmm5, %tmm6
+
+// CHECK: tdpbf8ps %tmm1, %tmm2, %tmm3
+// CHECK: encoding: [0xc4,0xe5,0x70,0xfd,0xda]
+          tdpbf8ps %tmm1, %tmm2, %tmm3
+
+// CHECK: tdpbhf8ps %tmm4, %tmm5, %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x5b,0xfd,0xf5]
+          tdpbhf8ps %tmm4, %tmm5, %tmm6
+
+// CHECK: tdpbhf8ps %tmm1, %tmm2, %tmm3
+// CHECK: encoding: [0xc4,0xe5,0x73,0xfd,0xda]
+          tdpbhf8ps %tmm1, %tmm2, %tmm3
+
+// CHECK: tdphbf8ps %tmm4, %tmm5, %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x5a,0xfd,0xf5]
+          tdphbf8ps %tmm4, %tmm5, %tmm6
+
+// CHECK: tdphbf8ps %tmm1, %tmm2, %tmm3
+// CHECK: encoding: [0xc4,0xe5,0x72,0xfd,0xda]
+          tdphbf8ps %tmm1, %tmm2, %tmm3
+
+// CHECK: tdphf8ps %tmm4, %tmm5, %tmm6
+// CHECK: encoding: [0xc4,0xe5,0x59,0xfd,0xf5]
+          tdphf8ps %tmm4, %tmm5, %tmm6
+
+// CHECK: tdphf8ps %tmm1, %tmm2, %tmm3
+// CHECK: encoding: [0xc4,0xe5,0x71,0xfd,0xda]
+          tdphf8ps %tmm1, %tmm2, %tmm3
diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s b/llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s
new file mode 100644
index 00000000000000..4191ae6f5cd133
--- /dev/null
+++ b/llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s
@@ -0,0 +1,33 @@
+// RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
+
+// CHECK: tdpbf8ps tmm6, tmm5, tmm4
+// CHECK: encoding: [0xc4,0xe5,0x58,0xfd,0xf5]
+          tdpbf8ps tmm6, tmm5, tmm4
+
+// CHECK: tdpbf8ps tmm3, tmm2, tmm1
+// CHECK: encoding: [0xc4,0xe5,0x70,0xfd,0xda]
+          tdpbf8ps tmm3, tmm2, tmm1
+
+// CHECK: tdpbhf8ps tmm6, tmm5, tmm4
+// CHECK: encoding: [0xc4,0xe5,0x5b,0xfd,0xf5]
+          tdpbhf8ps tmm6, tmm5, tmm4
+
+// CHECK: tdpbhf8ps tmm3, tmm2, tmm1
+// CHECK: encoding: [0xc4,0xe5,0x73,0xfd,0xda]
+          tdpbhf8ps tmm3, tmm2, tmm1
+
+// CHECK: tdphbf8ps tmm6, tmm5, tmm4
+// CHECK: encoding: [0xc4,0xe5,0x5a,0xfd,0xf5]
+          tdphbf8ps tmm6, tmm5, tmm4
+
+// CHECK: tdphbf8ps tmm3, tmm2, tmm1
+// CHECK: encoding: [0xc4,0xe5,0x72,0xfd,0xda]
+          tdphbf8ps tmm3, tmm2, tmm1
+
+// CHECK: tdphf8ps tmm6, tmm5, tmm4
+// CHECK: encoding: [0xc4,0xe5,0x59,0xfd,0xf5]
+          tdphf8ps tmm6, tmm5, tmm4
+
+// CHECK: tdphf8ps tmm3, tmm2, tmm1
+// CHECK: encoding: [0xc4,0xe5,0x71,0xfd,0xda]
+          tdphf8ps tmm3, tmm2, tmm1

>From e960360797ec027b6de3e19dcf26d83e75365e11 Mon Sep 17 00:00:00 2001
From: Feng Zou <feng.zou at intel.com>
Date: Mon, 28 Oct 2024 10:12:16 +0800
Subject: [PATCH 2/5] Apply clang format

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5a7313ac3e1234..a83f3925813daf 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37509,12 +37509,21 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case X86::PTDPHF8PS: {
     const DebugLoc &DL = MI.getDebugLoc();
     unsigned Opc;
-    switch(MI.getOpcode()) {
-      default: llvm_unreachable("Unexpected instruction!");
-      case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
-      case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
-      case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
-      case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
+    switch (MI.getOpcode()) {
+    default:
+      llvm_unreachable("Unexpected instruction!");
+    case X86::PTDPBF8PS:
+      Opc = X86::TDPBF8PS;
+      break;
+    case X86::PTDPBHF8PS:
+      Opc = X86::TDPBHF8PS;
+      break;
+    case X86::PTDPHBF8PS:
+      Opc = X86::TDPHBF8PS;
+      break;
+    case X86::PTDPHF8PS:
+      Opc = X86::TDPHF8PS;
+      break;
     }
 
     MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));

>From e28c3230bbb34e27a85de2496db6b5a873eb526a Mon Sep 17 00:00:00 2001
From: Feng Zou <feng.zou at intel.com>
Date: Wed, 30 Oct 2024 09:53:20 +0800
Subject: [PATCH 3/5] Address comments.

---
 clang/lib/Headers/amxfp8intrin.h        | 61 ++++++++++++++++++++++++-
 llvm/lib/Target/X86/X86.td              |  4 +-
 llvm/lib/Target/X86/X86ISelLowering.cpp | 42 ++++-------------
 llvm/lib/Target/X86/X86InstrAMX.td      | 38 +++++++--------
 4 files changed, 90 insertions(+), 55 deletions(-)

diff --git a/clang/lib/Headers/amxfp8intrin.h b/clang/lib/Headers/amxfp8intrin.h
index d187b5f0421bbb..b6e6a105bc42f3 100644
--- a/clang/lib/Headers/amxfp8intrin.h
+++ b/clang/lib/Headers/amxfp8intrin.h
@@ -1,4 +1,4 @@
-/*===---------- amxfp8intrin.h - AMX intrinsics -*- C++ -*------------===
+/*===------------- amxfp8intrin.h - AMX intrinsics -*- C++ -*----------------===
  *
  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
  * See https://llvm.org/LICENSE.txt for license information.
@@ -15,9 +15,68 @@
 #define __AMXFP8INTRIN_H
 #ifdef __x86_64__
 
+
+/// Compute dot-product of brain-float8 (BF8) or hybrid-float8 (HF8)
+///    floating-point pairs in tiles \a a and \a b, accumulating the
+///    intermediate single-precision (32-bit) floating-point elements with
+///    elements in \a dst, and store the 32-bit result back to tile \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_dpbf8ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TDPBF8PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
 #define _tile_dpbf8ps __builtin_ia32_tdpbf8ps
+
+/// \code
+/// void _tile_dpbhf8ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TDPBHF8PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
 #define _tile_dpbhf8ps __builtin_ia32_tdpbhf8ps
+
+/// \code
+/// void _tile_dphbf8ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TDPHBF8PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
 #define _tile_dphbf8ps __builtin_ia32_tdphbf8ps
+
+/// \code
+/// void _tile_dphf8ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TDPHF8PS instruction.
+///
+/// \param dst
+///    The destination tile. Max size is 1024 Bytes.
+/// \param a
+///    The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+///    The 2nd source tile. Max size is 1024 Bytes.
 #define _tile_dphf8ps __builtin_ia32_tdphf8ps
 
 #endif /* __x86_64__ */
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 79c60402a49f0a..c7882acc044e04 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -271,8 +271,8 @@ def FeatureAMXCOMPLEX : SubtargetFeature<"amx-complex", "HasAMXCOMPLEX", "true",
                                          "Support AMX-COMPLEX instructions",
                                          [FeatureAMXTILE]>;
 def FeatureAMXFP8 : SubtargetFeature<"amx-fp8", "HasAMXFP8", "true",
-                                         "Support AMX-FP8 instructions",
-                                         [FeatureAMXTILE]>;
+                                     "Support AMX-FP8 instructions",
+                                     [FeatureAMXTILE]>;
 def FeatureCMPCCXADD : SubtargetFeature<"cmpccxadd", "HasCMPCCXADD", "true",
                                         "Support CMPCCXADD instructions">;
 def FeatureRAOINT : SubtargetFeature<"raoint", "HasRAOINT", "true",
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index a83f3925813daf..2f2987eec24e62 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -37410,7 +37410,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
   case X86::PTDPBUSD:
   case X86::PTDPBUUD:
   case X86::PTDPBF16PS:
-  case X86::PTDPFP16PS: {
+  case X86::PTDPFP16PS:
+  case X86::PTDPBF8PS:
+  case X86::PTDPBHF8PS:
+  case X86::PTDPHBF8PS:
+  case X86::PTDPHF8PS: {
     unsigned Opc;
     switch (MI.getOpcode()) {
     // clang-format off
@@ -37421,6 +37425,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
     case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
     case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break;
+    case X86::PTDPBF8PS: Opc = X86::TDPBF8PS; break;
+    case X86::PTDPBHF8PS: Opc = X86::TDPBHF8PS; break;
+    case X86::PTDPHBF8PS: Opc = X86::TDPHBF8PS; break;
+    case X86::PTDPHF8PS: Opc = X86::TDPHF8PS; break;
     // clang-format on
     }
 
@@ -37503,38 +37511,6 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     MI.eraseFromParent(); // The pseudo is gone now.
     return BB;
   }
-  case X86::PTDPBF8PS:
-  case X86::PTDPBHF8PS:
-  case X86::PTDPHBF8PS:
-  case X86::PTDPHF8PS: {
-    const DebugLoc &DL = MI.getDebugLoc();
-    unsigned Opc;
-    switch (MI.getOpcode()) {
-    default:
-      llvm_unreachable("Unexpected instruction!");
-    case X86::PTDPBF8PS:
-      Opc = X86::TDPBF8PS;
-      break;
-    case X86::PTDPBHF8PS:
-      Opc = X86::TDPBHF8PS;
-      break;
-    case X86::PTDPHBF8PS:
-      Opc = X86::TDPHBF8PS;
-      break;
-    case X86::PTDPHF8PS:
-      Opc = X86::TDPHF8PS;
-      break;
-    }
-
-    MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
-    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
-    MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
-    MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
-    MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
-
-    MI.eraseFromParent();
-    return BB;
-  }
   }
 }
 
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
index d0c91ab7b5e696..202232ccb8bc72 100644
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -274,9 +274,9 @@ let Predicates = [HasAMXFP8, In64BitMode] in {
     let Constraints = "$src1 = $dst" in {
       class AMX_FP8_BASE<bits<8> Opcode, string Opstr> :
         I<Opcode, MRMSrcReg4VOp3, (outs TILE:$dst),
-        (ins TILE:$src1, TILE:$src2, TILE:$src3),
-        !strconcat(Opstr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
-        []>, VEX, VVVV;
+          (ins TILE:$src1, TILE:$src2, TILE:$src3),
+          !strconcat(Opstr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+          []>, VEX, VVVV;
     }
 
     def TDPBF8PS : AMX_FP8_BASE<0xfd, "tdpbf8ps">, T_MAP5, PS;
@@ -287,22 +287,22 @@ let Predicates = [HasAMXFP8, In64BitMode] in {
     let usesCustomInserter = 1 in {
       // Pseudo instructions, using immediates instead of tile registers.
       // To be translated to the actual instructions in X86ISelLowering.cpp
-      def PTDPBF8PS : PseudoI<(outs), (ins u8imm:$src1,
-                                       u8imm:$src2, u8imm:$src3),
-                                       [(int_x86_tdpbf8ps timm:$src1,
-                                         timm:$src2, timm:$src3)]>;
-      def PTDPBHF8PS : PseudoI<(outs), (ins u8imm:$src1,
-                                        u8imm:$src2, u8imm:$src3),
-                                        [(int_x86_tdpbhf8ps timm:$src1,
-                                          timm:$src2, timm:$src3)]>;
-      def PTDPHBF8PS : PseudoI<(outs), (ins u8imm:$src1,
-                                        u8imm:$src2, u8imm:$src3),
-                                        [(int_x86_tdphbf8ps timm:$src1,
-                                          timm:$src2, timm:$src3)]>;
-      def PTDPHF8PS : PseudoI<(outs), (ins u8imm:$src1,
-                                       u8imm:$src2, u8imm:$src3),
-                                       [(int_x86_tdphf8ps timm:$src1,
-                                         timm:$src2, timm:$src3)]>;
+      def PTDPBF8PS : PseudoI<(outs),
+                              (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
+                              [(int_x86_tdpbf8ps timm:$src1, timm:$src2,
+                                timm:$src3)]>;
+      def PTDPBHF8PS : PseudoI<(outs),
+                               (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
+                               [(int_x86_tdpbhf8ps timm:$src1, timm:$src2,
+                                 timm:$src3)]>;
+      def PTDPHBF8PS : PseudoI<(outs),
+                               (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
+                               [(int_x86_tdphbf8ps timm:$src1, timm:$src2,
+                                 timm:$src3)]>;
+      def PTDPHF8PS : PseudoI<(outs),
+                              (ins u8imm:$src1, u8imm:$src2, u8imm:$src3),
+                              [(int_x86_tdphf8ps timm:$src1, timm:$src2,
+                                timm:$src3)]>;
     }
   }
 }

>From b9a8b40364a7285c8e4acc79b3adbdeeddb515fa Mon Sep 17 00:00:00 2001
From: Feng Zou <feng.zou at intel.com>
Date: Wed, 30 Oct 2024 10:01:14 +0800
Subject: [PATCH 4/5] Address more comments.

---
 llvm/lib/TargetParser/Host.cpp                                | 4 ++--
 .../Disassembler/X86/AMX/{x86-64-amx-fp8.txt => amx-fp8.txt}  | 0
 llvm/test/MC/X86/AMX/{x86-64-amx-fp8-att.s => amx-fp8-att.s}  | 0
 .../MC/X86/AMX/{x86-64-amx-fp8-intel.s => amx-fp8-intel.s}    | 0
 4 files changed, 2 insertions(+), 2 deletions(-)
 rename llvm/test/MC/Disassembler/X86/AMX/{x86-64-amx-fp8.txt => amx-fp8.txt} (100%)
 rename llvm/test/MC/X86/AMX/{x86-64-amx-fp8-att.s => amx-fp8-att.s} (100%)
 rename llvm/test/MC/X86/AMX/{x86-64-amx-fp8-intel.s => amx-fp8-intel.s} (100%)

diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 78991b3936505c..fd34a276cf3ce5 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1876,8 +1876,8 @@ const StringMap<bool> sys::getHostCPUFeatures() {
       MaxLevel >= 0x19 && !getX86CpuIDAndInfo(0x19, &EAX, &EBX, &ECX, &EDX);
   Features["widekl"] = HasLeaf7 && HasLeaf19 && ((EBX >> 2) & 1);
 
-  bool HasLeaf1E =
-      MaxLevel >= 0x1e && !getX86CpuIDAndInfo(0x1e, &EAX, &EBX, &ECX, &EDX);
+  bool HasLeaf1E = MaxLevel >= 0x1e &&
+                   !getX86CpuIDAndInfoEx(0x1e, 0x1, &EAX, &EBX, &ECX, &EDX);
   Features["amx-fp8"] = HasLeaf1E && ((EAX >> 4) & 1) && HasAMXSave;
 
   bool HasLeaf24 =
diff --git a/llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt b/llvm/test/MC/Disassembler/X86/AMX/amx-fp8.txt
similarity index 100%
rename from llvm/test/MC/Disassembler/X86/AMX/x86-64-amx-fp8.txt
rename to llvm/test/MC/Disassembler/X86/AMX/amx-fp8.txt
diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s b/llvm/test/MC/X86/AMX/amx-fp8-att.s
similarity index 100%
rename from llvm/test/MC/X86/AMX/x86-64-amx-fp8-att.s
rename to llvm/test/MC/X86/AMX/amx-fp8-att.s
diff --git a/llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s b/llvm/test/MC/X86/AMX/amx-fp8-intel.s
similarity index 100%
rename from llvm/test/MC/X86/AMX/x86-64-amx-fp8-intel.s
rename to llvm/test/MC/X86/AMX/amx-fp8-intel.s

>From 4ce35440ca0436c3bf5619dd7c0646418a78d977 Mon Sep 17 00:00:00 2001
From: Feng Zou <feng.zou at intel.com>
Date: Wed, 30 Oct 2024 10:32:51 +0800
Subject: [PATCH 5/5] Update intrinsics comments.

---
 clang/lib/Headers/amxfp8intrin.h | 35 ++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 13 deletions(-)

diff --git a/clang/lib/Headers/amxfp8intrin.h b/clang/lib/Headers/amxfp8intrin.h
index b6e6a105bc42f3..8ff4ddf2acc446 100644
--- a/clang/lib/Headers/amxfp8intrin.h
+++ b/clang/lib/Headers/amxfp8intrin.h
@@ -15,11 +15,12 @@
 #define __AMXFP8INTRIN_H
 #ifdef __x86_64__
 
-
-/// Compute dot-product of brain-float8 (BF8) or hybrid-float8 (HF8)
-///    floating-point pairs in tiles \a a and \a b, accumulating the
-///    intermediate single-precision (32-bit) floating-point elements with
-///    elements in \a dst, and store the 32-bit result back to tile \a dst.
+/// These instructions compute dot product of brain-float8 (BF8) or
+/// hybrid-float8 (HF8) accumulating into a single precision (FP32). The input
+/// elements can be BF8 or HF8. These instructions have three tile operands, one
+/// source/dest accumulator operand, and two source operands, \a a and \a b. The
+/// \a a and \a b operands can be BF8 or HF8 independently, and the source/dest
+/// operand, \a dst is always FP32.
 ///
 /// \headerfile <immintrin.h>
 ///
@@ -27,7 +28,9 @@
 /// void _tile_dpbf8ps (__tile dst, __tile a, __tile b)
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c TDPBF8PS instruction.
+/// This intrinsic corresponds to the \c TDPBF8PS instruction, which is the dot
+/// product of a BF8 value (\a a) by a BF8 value (\a b) accumulating into a
+/// Single Precision (FP32) source/dest (\a dst).
 ///
 /// \param dst
 ///    The destination tile. Max size is 1024 Bytes.
@@ -35,13 +38,15 @@
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param b
 ///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbf8ps __builtin_ia32_tdpbf8ps
+#define _tile_dpbf8ps(dst, a, b) __builtin_ia32_tdpbf8ps((dst), (a), (b))
 
 /// \code
 /// void _tile_dpbhf8ps (__tile dst, __tile a, __tile b)
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c TDPBHF8PS instruction.
+/// This intrinsic corresponds to the \c TDPBHF8PS instruction, which is the dot
+/// product of a BF8 value (\a a) by an HF8 value (\a b) accumulating into a
+/// Single Precision (FP32) source/dest (\a dst).
 ///
 /// \param dst
 ///    The destination tile. Max size is 1024 Bytes.
@@ -49,13 +54,15 @@
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param b
 ///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dpbhf8ps __builtin_ia32_tdpbhf8ps
+#define _tile_dpbhf8ps(dst, a, b) __builtin_ia32_tdpbhf8ps((dst), (a), (b))
 
 /// \code
 /// void _tile_dphbf8ps (__tile dst, __tile a, __tile b)
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c TDPHBF8PS instruction.
+/// This intrinsic corresponds to the \c TDPHBF8PS instruction, which is the dot
+/// product of an HF8 value (\a a) by a BF8 value (\a b) accumulating into a
+/// Single Precision (FP32) source/dest (\a dst).
 ///
 /// \param dst
 ///    The destination tile. Max size is 1024 Bytes.
@@ -63,13 +70,15 @@
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param b
 ///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dphbf8ps __builtin_ia32_tdphbf8ps
+#define _tile_dphbf8ps(dst, a, b) __builtin_ia32_tdphbf8ps((dst), (a), (b))
 
 /// \code
 /// void _tile_dphf8ps (__tile dst, __tile a, __tile b)
 /// \endcode
 ///
-/// This intrinsic corresponds to the \c TDPHF8PS instruction.
+/// This intrinsic corresponds to the \c TDPHF8PS instruction, which is the dot
+/// product of an HF8 value (\a a) by an HF8 value (\a b) accumulating into a
+/// Single Precision (FP32) source/dest (\a dst).
 ///
 /// \param dst
 ///    The destination tile. Max size is 1024 Bytes.
@@ -77,7 +86,7 @@
 ///    The 1st source tile. Max size is 1024 Bytes.
 /// \param b
 ///    The 2nd source tile. Max size is 1024 Bytes.
-#define _tile_dphf8ps __builtin_ia32_tdphf8ps
+#define _tile_dphf8ps(dst, a, b) __builtin_ia32_tdphf8ps((dst), (a), (b))
 
 #endif /* __x86_64__ */
 #endif /* __AMXFP8INTRIN_H */



More information about the cfe-commits mailing list