[clang] [llvm] [X86][AVX10.2] Support AVX10.2-CONVERT new instructions. (PR #101600)
via cfe-commits
cfe-commits at lists.llvm.org
Mon Aug 5 20:26:14 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-ir
Author: Freddy Ye (FreddyLeaf)
<details>
<summary>Changes</summary>
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965
---
Patch is 613.01 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/101600.diff
23 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsX86.def (+47)
- (modified) clang/lib/Headers/CMakeLists.txt (+2)
- (added) clang/lib/Headers/avx10_2_512convertintrin.h (+304)
- (added) clang/lib/Headers/avx10_2convertintrin.h (+560)
- (modified) clang/lib/Headers/immintrin.h (+2)
- (modified) clang/lib/Sema/SemaX86.cpp (+2)
- (added) clang/test/CodeGen/X86/avx10_2_512convert-builtins.c (+274)
- (added) clang/test/CodeGen/X86/avx10_2convert-builtins.c (+530)
- (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+130)
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+38)
- (modified) llvm/lib/Target/X86/X86ISelLowering.h (+24)
- (modified) llvm/lib/Target/X86/X86InstrAVX10.td (+431)
- (modified) llvm/lib/Target/X86/X86InstrFragmentsSIMD.td (+89)
- (modified) llvm/lib/Target/X86/X86IntrinsicsInfo.h (+85)
- (added) llvm/test/CodeGen/X86/avx10_2_512convert-intrinsics.ll (+578)
- (added) llvm/test/CodeGen/X86/avx10_2convert-intrinsics.ll (+1147)
- (added) llvm/test/MC/Disassembler/X86/avx10.2convert-32.txt (+1491)
- (added) llvm/test/MC/Disassembler/X86/avx10.2convert-64.txt (+1491)
- (added) llvm/test/MC/X86/avx10.2convert-32-att.s (+1490)
- (added) llvm/test/MC/X86/avx10.2convert-32-intel.s (+1490)
- (added) llvm/test/MC/X86/avx10.2convert-64-att.s (+1490)
- (added) llvm/test/MC/X86/avx10.2convert-64-intel.s (+1490)
- (modified) llvm/test/TableGen/x86-fold-tables.inc (+243)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index c49b5c36da4fc..1deac6247a361 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -2158,6 +2158,53 @@ TARGET_BUILTIN(__builtin_ia32_vminmaxps512_round_mask, "V16fV16fV16fIiV16fUsIi",
TARGET_BUILTIN(__builtin_ia32_vminmaxsd_round_mask, "V2dV2dV2dIiV2dUcIi", "nV:128:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vminmaxsh_round_mask, "V8xV8xV8xIiV8xUcIi", "nV:128:", "avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vminmaxss_round_mask, "V4fV4fV4fIiV4fUcIi", "nV:128:", "avx10.2-256")
+
+// AVX10.2 CONVERT
+TARGET_BUILTIN(__builtin_ia32_vcvt2ps2phx128_mask, "V8xV4fV4fV8xUc", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvt2ps2phx256_mask, "V16xV8fV8fV16xUsIi", "ncV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvt2ps2phx512_mask, "V32xV16fV16fV32xUiIi", "ncV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8s_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8s_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2bf8s_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8s_128_mask, "V16cV16cV8xV16cUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8s_256_mask, "V16cV32cV16xV16cUs", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtbiasph2hf8s_512_mask, "V32cV64cV32xV32cUi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8s_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8s_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2bf8s_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8s_128, "V16cV8xV8x", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8s_256, "V32cV16xV16x", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtne2ph2hf8s_512, "V64cV32xV32x", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvtnebf8_2ph128_mask, "V8xV16cV8xUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtnebf8_2ph256_mask, "V16xV16cV16xUs", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtnebf8_2ph512_mask, "V32xV32cV32xUi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvtnehf8_2ph128_mask, "V8xV16cV8xUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtnehf8_2ph256_mask, "V16xV16cV16xUs", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtnehf8_2ph512_mask, "V32xV32cV32xUi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtneph2bf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_128_mask, "V16cV8xV16cUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_256_mask, "V16cV16xV16cUs", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvtneph2hf8s_512_mask, "V32cV32xV32cUi", "nV:512:", "avx10.2-512")
#undef BUILTIN
#undef TARGET_BUILTIN
#undef TARGET_HEADER_BUILTIN
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index f3d19e38f8f2b..2eb550d07afaa 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -147,8 +147,10 @@ set(x86_files
amxcomplexintrin.h
amxfp16intrin.h
amxintrin.h
+ avx10_2_512convertintrin.h
avx10_2_512minmaxintrin.h
avx10_2_512niintrin.h
+ avx10_2convertintrin.h
avx10_2minmaxintrin.h
avx10_2niintrin.h
avx2intrin.h
diff --git a/clang/lib/Headers/avx10_2_512convertintrin.h b/clang/lib/Headers/avx10_2_512convertintrin.h
new file mode 100644
index 0000000000000..c56c917772b32
--- /dev/null
+++ b/clang/lib/Headers/avx10_2_512convertintrin.h
@@ -0,0 +1,304 @@
+/*===--------- avx10_2_512convertintrin.h - AVX10_2_512CONVERT -------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx10_2_512convertintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifdef __SSE2__
+
+#ifndef __AVX10_2_512CONVERTINTRIN_H
+#define __AVX10_2_512CONVERTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS512 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
+ __min_vector_width__(512)))
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_cvtx2ps_ph(__m512 __A,
+ __m512 __B) {
+ return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask(
+ (__v16sf)__A, (__v16sf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)(-1),
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtx2ps_ph(__m512h __W, __mmask32 __U, __m512 __A, __m512 __B) {
+ return (__m512h)__builtin_ia32_vcvt2ps2phx512_mask(
+ (__v16sf)__A, (__v16sf)__B, (__v32hf)__W, (__mmask32)__U,
+ _MM_FROUND_CUR_DIRECTION);
+}
+
+#define _mm512_cvtx_round2ps_ph(A, B, R) \
+ ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask( \
+ (__v16sf)(A), (__v16sf)(B), (__v32hf)_mm512_undefined_ph(), \
+ (__mmask32)(-1), (const int)(R)))
+
+#define _mm512_mask_cvtx_round2ps_ph(W, U, A, B, R) \
+ ((__m512h)__builtin_ia32_vcvt2ps2phx512_mask((__v16sf)(A), (__v16sf)(B), \
+ (__v32hf)(W), (__mmask32)(U), \
+ (const int)(R)))
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtbiasph_pbf8(__m512i __A, __m512h __B) {
+ return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask(
+ (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
+ (__mmask32)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiasph_pbf8(
+ __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
+ return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask(
+ (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtbiasph_pbf8(__mmask32 __U, __m512i __A, __m512h __B) {
+ return (__m256i)__builtin_ia32_vcvtbiasph2bf8_512_mask(
+ (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
+ (__mmask32)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtbiassph_pbf8(__m512i __A, __m512h __B) {
+ return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask(
+ (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
+ (__mmask32)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiassph_pbf8(
+ __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
+ return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask(
+ (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtbiassph_pbf8(__mmask32 __U, __m512i __A, __m512h __B) {
+ return (__m256i)__builtin_ia32_vcvtbiasph2bf8s_512_mask(
+ (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
+ (__mmask32)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtbiasph_phf8(__m512i __A, __m512h __B) {
+ return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask(
+ (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
+ (__mmask32)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiasph_phf8(
+ __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
+ return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask(
+ (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtbiasph_phf8(__mmask32 __U, __m512i __A, __m512h __B) {
+ return (__m256i)__builtin_ia32_vcvtbiasph2hf8_512_mask(
+ (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
+ (__mmask32)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtbiassph_phf8(__m512i __A, __m512h __B) {
+ return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask(
+ (__v64qi)__A, (__v32hf)__B, (__v32qi)_mm256_undefined_si256(),
+ (__mmask32)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtbiassph_phf8(
+ __m256i __W, __mmask32 __U, __m512i __A, __m512h __B) {
+ return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask(
+ (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)__W, (__mmask32)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtbiassph_phf8(__mmask32 __U, __m512i __A, __m512h __B) {
+ return (__m256i)__builtin_ia32_vcvtbiasph2hf8s_512_mask(
+ (__v64qi)__A, (__v32hf)__B, (__v32qi)(__m256i)_mm256_setzero_si256(),
+ (__mmask32)__U);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtne2ph_pbf8(__m512h __A, __m512h __B) {
+ return (__m512i)__builtin_ia32_vcvtne2ph2bf8_512((__v32hf)(__A),
+ (__v32hf)(__B));
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtne2ph_pbf8(
+ __m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
+ return (__m512i)__builtin_ia32_selectb_512(
+ (__mmask64)__U, (__v64qi)_mm512_cvtne2ph_pbf8(__A, __B), (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtnes2ph_pbf8(__m512h __A, __m512h __B) {
+ return (__m512i)__builtin_ia32_vcvtne2ph2bf8s_512((__v32hf)(__A),
+ (__v32hf)(__B));
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtnes2ph_pbf8(
+ __m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
+ return (__m512i)__builtin_ia32_selectb_512(
+ (__mmask64)__U, (__v64qi)_mm512_cvtnes2ph_pbf8(__A, __B), (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtne2ph_phf8(__m512h __A, __m512h __B) {
+ return (__m512i)__builtin_ia32_vcvtne2ph2hf8_512((__v32hf)(__A),
+ (__v32hf)(__B));
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtne2ph_phf8(
+ __m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
+ return (__m512i)__builtin_ia32_selectb_512(
+ (__mmask64)__U, (__v64qi)_mm512_cvtne2ph_phf8(__A, __B), (__v64qi)__W);
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512
+_mm512_cvtne2ph2hf8s_phf8(__m512h __A, __m512h __B) {
+ return (__m512i)__builtin_ia32_vcvtne2ph2hf8s_512((__v32hf)(__A),
+ (__v32hf)(__B));
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_mask_cvtne2ph2hf8s_phf8(
+ __m512i __W, __mmask64 __U, __m512h __A, __m512h __B) {
+ return (__m512i)__builtin_ia32_selectb_512(
+ (__mmask64)__U, (__v64qi)_mm512_cvtne2ph2hf8s_phf8(__A, __B),
+ (__v64qi)__W);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_cvtnebf8_ph(__m256i __A) {
+ return (__m512h)__builtin_ia32_vcvtnebf8_2ph512_mask(
+ (__v32qi)__A, (__v32hf)(__m512h)_mm512_undefined_ph(), (__mmask32)-1);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtnebf8_ph(__m512h __W, __mmask32 __U, __m256i __A) {
+ return (__m512h)__builtin_ia32_vcvtnebf8_2ph512_mask(
+ (__v32qi)__A, (__v32hf)(__m512h)__W, (__mmask32)__U);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtnebf8_ph(__mmask32 __U, __m256i __A) {
+ return (__m512h)__builtin_ia32_vcvtnebf8_2ph512_mask(
+ (__v32qi)__A, (__v32hf)(__m512h)_mm512_setzero_ph(), (__mmask32)__U);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_cvtnehf8_ph(__m256i __A) {
+ return (__m512h)__builtin_ia32_vcvtnehf8_2ph512_mask(
+ (__v32qi)__A, (__v32hf)(__m512h)_mm512_undefined_ph(), (__mmask32)-1);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtnehf8_ph(__m512h __W, __mmask32 __U, __m256i __A) {
+ return (__m512h)__builtin_ia32_vcvtnehf8_2ph512_mask(
+ (__v32qi)__A, (__v32hf)(__m512h)__W, (__mmask32)__U);
+}
+
+static __inline__ __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtnehf8_ph(__mmask32 __U, __m256i __A) {
+ return (__m512h)__builtin_ia32_vcvtnehf8_2ph512_mask(
+ (__v32qi)__A, (__v32hf)(__m512h)_mm512_setzero_ph(), (__mmask32)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtneph_pbf8(__m512h __A) {
+ return (__m256i)__builtin_ia32_vcvtneph2bf8_512_mask(
+ (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtneph_pbf8(__m256i __W, __mmask32 __U, __m512h __A) {
+ return (__m256i)__builtin_ia32_vcvtneph2bf8_512_mask(
+ (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtneph_pbf8(__mmask32 __U, __m512h __A) {
+ return (__m256i)__builtin_ia32_vcvtneph2bf8_512_mask(
+ (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtnesph_pbf8(__m512h __A) {
+ return (__m256i)__builtin_ia32_vcvtneph2bf8s_512_mask(
+ (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtnesph_pbf8(__m256i __W, __mmask32 __U, __m512h __A) {
+ return (__m256i)__builtin_ia32_vcvtneph2bf8s_512_mask(
+ (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtnesph_pbf8(__mmask32 __U, __m512h __A) {
+ return (__m256i)__builtin_ia32_vcvtneph2bf8s_512_mask(
+ (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtneph_phf8(__m512h __A) {
+ return (__m256i)__builtin_ia32_vcvtneph2hf8_512_mask(
+ (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtneph_phf8(__m256i __W, __mmask32 __U, __m512h __A) {
+ return (__m256i)__builtin_ia32_vcvtneph2hf8_512_mask(
+ (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtneph_phf8(__mmask32 __U, __m512h __A) {
+ return (__m256i)__builtin_ia32_vcvtneph2hf8_512_mask(
+ (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_cvtnesph_phf8(__m512h __A) {
+ return (__m256i)__builtin_ia32_vcvtneph2hf8s_512_mask(
+ (__v32hf)__A, (__v32qi)(__m256i)_mm256_undefined_si256(), (__mmask32)-1);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtnesph_phf8(__m256i __W, __mmask32 __U, __m512h __A) {
+ return (__m256i)__builtin_ia32_vcvtneph2hf8s_512_mask(
+ (__v32hf)__A, (__v32qi)(__m256i)__W, (__mmask32)__U);
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtnesph_phf8(__mmask32 __U, __m512h __A) {
+ return (__m256i)__builtin_ia32_vcvtneph2hf8s_512_mask(
+ (__v32hf)__A, (__v32qi)(__m256i)_mm256_setzero_si256(), (__mmask32)__U);
+}
+
+static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_cvtpbf8_ph(__m256i __A) {
+ return _mm512_castsi512_ph(_mm512_slli_epi16(_mm512_cvtepi8_epi16(__A), 8));
+}
+
+static __inline __m512h __DEFAULT_FN_ATTRS512
+_mm512_mask_cvtpbf8_ph(__m512h __S, __mmask16 __U, __m256i __A) {
+ return _mm512_castsi512_ph(
+ _mm512_mask_slli_epi16((__m512i)__S, __U, _mm512_cvtepi8_epi16(__A), 8));
+}
+
+static __inline __m512h __DEFAULT_FN_ATTRS512
+_mm512_maskz_cvtpbf8_ph(__mmask16 __U, __m256i __A) {
+ return _mm512_castsi512_ph(
+ _mm512_slli_epi16(_mm512_maskz_cvtepi8_epi16(__U, __A), 8));
+}
+
+#undef __DEFAULT_FN_ATTRS512
+
+#endif // __AVX10_2_512CONVERTINTRIN_H
+#endif // __SSE2__
diff --git a/clang/lib/Headers/avx10_2convertintrin.h b/clang/lib/Headers/avx10_2convertintrin.h
new file mode 100644
index 0000000000000..9d7b6008cb6af
--- /dev/null
+++ b/clang/lib/Headers/avx10_2convertintrin.h
@@ -0,0 +1,560 @@
+/*===---------- avx10_2convertintrin.h - AVX512NECONVERTFP8 ----------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx10_2convertintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifdef __SSE2__
+
+#ifndef __AVX10_2CONVERTINTRIN_H
+#define __AVX10_2CONVERTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
+ __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
+ __min_vector_width__(256)))
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtx2ps_ph(__m128 __A,
+ __m128 __B) {
+ return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)(-1));
+}
+
+static __inline__ __m128h __DEFAULT_FN_ATTRS128
+_mm_mask_cvtx2ps_ph(__m128h __W, __mmask8 __U, __m128 __A, __m128 __B) {
+ return (__m128h)__builtin_ia32_vcvt2ps2phx128_mask(
+ (__v4sf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U);
+}
+
+static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_cvtx2ps_ph(__m256 __A,
+ ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/101600
More information about the cfe-commits
mailing list