[clang] [llvm] [X86][AVX10.2] Support saturated converts (PR #102592)
via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 9 02:54:19 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-x86
Author: Malay Sanghi (MalaySanghi)
<details>
<summary>Changes</summary>
Ref.: https://cdrdv2.intel.com/v1/dl/getContent/828965
---
Patch is 494.14 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/102592.diff
31 Files Affected:
- (modified) clang/include/clang/Basic/BuiltinsX86.def (+30)
- (modified) clang/include/clang/Basic/BuiltinsX86_64.def (+6)
- (modified) clang/lib/Headers/CMakeLists.txt (+2)
- (added) clang/lib/Headers/avx10_2_512satcvtdsintrin.h (+302)
- (added) clang/lib/Headers/avx10_2satcvtdsintrin.h (+453)
- (modified) clang/lib/Headers/immintrin.h (+8)
- (modified) clang/lib/Sema/SemaX86.cpp (+26)
- (added) clang/test/CodeGen/X86/avx10_2_512satcvtds-builtins-errors.c (+52)
- (added) clang/test/CodeGen/X86/avx10_2_512satcvtds-builtins-x64-error.c (+76)
- (added) clang/test/CodeGen/X86/avx10_2_512satcvtds-builtins-x64.c (+184)
- (added) clang/test/CodeGen/X86/avx10_2_512satcvtds-builtins.c (+183)
- (added) clang/test/CodeGen/X86/avx10_2satcvtds-builtins-errors.c (+57)
- (added) clang/test/CodeGen/X86/avx10_2satcvtds-builtins-x64.c (+223)
- (added) clang/test/CodeGen/X86/avx10_2satcvtds-builtins.c (+220)
- (modified) llvm/include/llvm/IR/IntrinsicsX86.td (+100)
- (modified) llvm/lib/Target/X86/X86ISelLowering.cpp (+21-2)
- (modified) llvm/lib/Target/X86/X86ISelLowering.h (+18)
- (modified) llvm/lib/Target/X86/X86InstrAVX10.td (+311)
- (modified) llvm/lib/Target/X86/X86InstrFragmentsSIMD.td (+12)
- (modified) llvm/lib/Target/X86/X86IntrinsicsInfo.h (+65-1)
- (added) llvm/test/CodeGen/X86/avx10_2_512satcvtds-intrinsics.ll (+548)
- (added) llvm/test/CodeGen/X86/avx10_2fptosi_satcvtds.ll (+115)
- (added) llvm/test/CodeGen/X86/avx10_2satcvtds-intrinsics.ll (+1098)
- (added) llvm/test/CodeGen/X86/avx10_2satcvtds-x64-intrinsics.ll (+58)
- (added) llvm/test/MC/Disassembler/X86/avx10.2-satcvtds-32.txt (+1043)
- (added) llvm/test/MC/Disassembler/X86/avx10.2-satcvtds-64.txt (+1171)
- (added) llvm/test/MC/X86/avx10_2satcvtds-32-att.s (+1042)
- (added) llvm/test/MC/X86/avx10_2satcvtds-32-intel.s (+1042)
- (added) llvm/test/MC/X86/avx10_2satcvtds-64-att.s (+1170)
- (added) llvm/test/MC/X86/avx10_2satcvtds-64-intel.s (+1170)
- (modified) llvm/test/TableGen/x86-fold-tables.inc (+160)
``````````diff
diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index a696cf117908e2..a8639c341d0a43 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -2122,6 +2122,36 @@ TARGET_BUILTIN(__builtin_ia32_vpdpwuud256, "V8iV8iV8iV8i", "nV:256:", "avxvnniin
TARGET_BUILTIN(__builtin_ia32_vpdpwuuds128, "V4iV4iV4iV4i", "nV:128:", "avxvnniint16|avx10.2-256")
TARGET_BUILTIN(__builtin_ia32_vpdpwuuds256, "V8iV8iV8iV8i", "nV:256:", "avxvnniint16|avx10.2-256")
+// AVX10.2 SATCVT-DS
+TARGET_BUILTIN(__builtin_ia32_vcvttssd2si32, "iV2dIi", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttssd2usi32, "UiV2dIi", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttsss2si32, "iV4fIi", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttsss2usi32, "UiV4fIi", "ncV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2dqs128_mask, "V4iV2dV4iUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2dqs256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2dqs512_round_mask, "V8iV8dV8iUcIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2udqs128_mask, "V4iV2dV4iUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2udqs256_round_mask, "V4iV4dV4iUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2udqs512_round_mask, "V8iV8dV8iUcIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2qqs128_mask, "V2OiV2dV2OiUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2qqs256_round_mask, "V4OiV4dV4OiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2qqs512_round_mask, "V8OiV8dV8OiUcIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqqs128_mask, "V2OiV2dV2OiUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqqs256_round_mask, "V4OiV4dV4OiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttpd2uqqs512_round_mask, "V8OiV8dV8OiUcIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2dqs128_mask, "V4iV4fV4iUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2dqs256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2dqs512_round_mask, "V16iV16fV16iUsIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2udqs128_mask, "V4iV4fV4iUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2udqs256_round_mask, "V8iV8fV8iUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2udqs512_round_mask, "V16iV16fV16iUsIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2qqs128_mask, "V2OiV4fV2OiUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2qqs256_round_mask, "V4OiV4fV4OiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2qqs512_round_mask, "V8OiV8fV8OiUcIi", "nV:512:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2uqqs128_mask, "V2OiV4fV2OiUc", "nV:128:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2uqqs256_round_mask, "V4OiV4fV4OiUcIi", "nV:256:", "avx10.2-256")
+TARGET_BUILTIN(__builtin_ia32_vcvttps2uqqs512_round_mask, "V8OiV8fV8OiUcIi", "nV:512:", "avx10.2-512")
+
// AVX-NE-CONVERT
TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps128, "V4fyC*", "nV:128:", "avxneconvert")
TARGET_BUILTIN(__builtin_ia32_vbcstnebf162ps256, "V8fyC*", "nV:256:", "avxneconvert")
diff --git a/clang/include/clang/Basic/BuiltinsX86_64.def b/clang/include/clang/Basic/BuiltinsX86_64.def
index 5e00916d4b25ae..ed9b17b8bd7b8e 100644
--- a/clang/include/clang/Basic/BuiltinsX86_64.def
+++ b/clang/include/clang/Basic/BuiltinsX86_64.def
@@ -99,6 +99,12 @@ TARGET_BUILTIN(__builtin_ia32_vcvttsh2si64, "OiV8xIi", "ncV:128:", "avx512fp16")
TARGET_BUILTIN(__builtin_ia32_vcvttsh2usi64, "UOiV8xIi", "ncV:128:", "avx512fp16")
TARGET_BUILTIN(__builtin_ia32_directstore_u64, "vULi*ULi", "n", "movdiri")
+// AVX10.2 SATCVT-DS
+TARGET_BUILTIN(__builtin_ia32_vcvttssd2si64, "OiV2dIi", "ncV:128:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvttssd2usi64, "UOiV2dIi", "ncV:128:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvttsss2si64, "OiV4fIi", "ncV:128:", "avx10.2-512")
+TARGET_BUILTIN(__builtin_ia32_vcvttsss2usi64, "UOiV4fIi", "ncV:128:", "avx10.2-512")
+
// UINTR
TARGET_BUILTIN(__builtin_ia32_clui, "v", "n", "uintr")
TARGET_BUILTIN(__builtin_ia32_stui, "v", "n", "uintr")
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index b61aeca6bbc910..9981290628697c 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -150,9 +150,11 @@ set(x86_files
avx10_2_512minmaxintrin.h
avx10_2_512niintrin.h
avx10_2_512satcvtintrin.h
+ avx10_2_512satcvtdsintrin.h
avx10_2minmaxintrin.h
avx10_2niintrin.h
avx10_2satcvtintrin.h
+ avx10_2satcvtdsintrin.h
avx2intrin.h
avx512bf16intrin.h
avx512bitalgintrin.h
diff --git a/clang/lib/Headers/avx10_2_512satcvtdsintrin.h b/clang/lib/Headers/avx10_2_512satcvtdsintrin.h
new file mode 100644
index 00000000000000..e8b815653c3d6e
--- /dev/null
+++ b/clang/lib/Headers/avx10_2_512satcvtdsintrin.h
@@ -0,0 +1,302 @@
+/*===----- avx10_2_512satcvtdsintrin.h - AVX10_2_512SATCVTDS intrinsics ----===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx10_2_512satcvtdsintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVX10_2_512SATCVTDSINTRIN_H
+#define __AVX10_2_512SATCVTDSINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-512"), \
+ __min_vector_width__(512)))
+
+// 512 bit : Double -> Int
+static __inline__ __m256i __DEFAULT_FN_ATTRS _mm512_cvttspd_epi32(__m512d A) {
+ return ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask(
+ (__v8df)A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION));
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttspd_epi32(__m256i W, __mmask8 U, __m512d A) {
+ return ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask(
+ (__v8df)A, (__v8si)W, U, _MM_FROUND_CUR_DIRECTION));
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttspd_epi32(__mmask8 U, __m512d A) {
+ return ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask(
+ (__v8df)A, (__v8si)_mm256_setzero_si256(), U, _MM_FROUND_CUR_DIRECTION));
+}
+
+#define _mm512_cvtts_roundpd_epi32(A, R) \
+ ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8si)_mm256_undefined_si256(), (__mmask8) - 1, \
+ (const int)(R)))
+
+#define _mm512_mask_cvtts_roundpd_epi32(W, U, A, R) \
+ ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8si)(__m256i)(W), (__mmask8)(U), \
+ (const int)(R)))
+
+#define _mm512_maskz_cvtts_roundpd_epi32(U, A, R) \
+ ((__m256i)__builtin_ia32_vcvttpd2dqs512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), \
+ (const int)(R)))
+
+// 512 bit : Double -> UInt
+static __inline__ __m256i __DEFAULT_FN_ATTRS _mm512_cvttspd_epu32(__m512d A) {
+ return ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask(
+ (__v8df)A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION));
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttspd_epu32(__m256i W, __mmask8 U, __m512d A) {
+ return ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask(
+ (__v8df)A, (__v8si)W, U, _MM_FROUND_CUR_DIRECTION));
+}
+
+static __inline__ __m256i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttspd_epu32(__mmask8 U, __m512d A) {
+ return ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask(
+ (__v8df)A, (__v8si)_mm256_setzero_si256(), U, _MM_FROUND_CUR_DIRECTION));
+}
+
+#define _mm512_cvtts_roundpd_epu32(A, R) \
+ ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8si)_mm256_undefined_si256(), (__mmask8) - 1, \
+ (const int)(R)))
+
+#define _mm512_mask_cvtts_roundpd_epu32(W, U, A, R) \
+ ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8si)(__m256i)(W), (__mmask8)(U), \
+ (const int)(R)))
+
+#define _mm512_maskz_cvtts_roundpd_epu32(U, A, R) \
+ ((__m256i)__builtin_ia32_vcvttpd2udqs512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8si)_mm256_setzero_si256(), (__mmask8)(U), \
+ (const int)(R)))
+
+#ifdef __x86_64__
+// 512 bit : Double -> Long
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttspd_epi64(__m512d A) {
+ return ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask(
+ (__v8df)A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION));
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttspd_epi64(__m512i W, __mmask8 U, __m512d A) {
+ return ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask(
+ (__v8df)A, (__v8di)W, U, _MM_FROUND_CUR_DIRECTION));
+}
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttspd_epi64(__mmask8 U, __m512d A) {
+ return ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask(
+ (__v8df)A, (__v8di)_mm512_setzero_si512(), U, _MM_FROUND_CUR_DIRECTION));
+}
+
+#define _mm512_cvtts_roundpd_epi64(A, R) \
+ ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8) - 1, \
+ (const int)(R)))
+
+#define _mm512_mask_cvtts_roundpd_epi64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8di)(__m512i)(W), (__mmask8)(U), \
+ (const int)(R)))
+
+#define _mm512_maskz_cvtts_roundpd_epi64(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttpd2qqs512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8di)_mm512_setzero_si512(), (__mmask8)(U), \
+ (const int)(R)))
+
+// 512 bit : Double -> ULong
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttspd_epu64(__m512d A) {
+ return ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask(
+ (__v8df)A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION));
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttspd_epu64(__m512i W, __mmask8 U, __m512d A) {
+ return ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask(
+ (__v8df)A, (__v8di)W, U, _MM_FROUND_CUR_DIRECTION));
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttspd_epu64(__mmask8 U, __m512d A) {
+ return ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask(
+ (__v8df)A, (__v8di)_mm512_setzero_si512(), U, _MM_FROUND_CUR_DIRECTION));
+}
+
+#define _mm512_cvtts_roundpd_epu64(A, R) \
+ ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8) - 1, \
+ (const int)(R)))
+
+#define _mm512_mask_cvtts_roundpd_epu64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8di)(__m512i)(W), (__mmask8)(U), \
+ (const int)(R)))
+
+#define _mm512_maskz_cvtts_roundpd_epu64(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttpd2uqqs512_round_mask( \
+ (__v8df)(__m512d)(A), (__v8di)_mm512_setzero_si512(), (__mmask8)(U), \
+ (const int)(R)))
+
+#endif
+
+// 512 bit : Float -> Int
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epi32(__m512 A) {
+ return ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask(
+ (__v16sf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION));
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttsps_epi32(__m512i W, __mmask16 U, __m512 A) {
+ return ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask(
+ (__v16sf)(A), (__v16si)(W), U, _MM_FROUND_CUR_DIRECTION));
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttsps_epi32(__mmask16 U, __m512 A) {
+ return ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask(
+ (__v16sf)(A), (__v16si)_mm512_setzero_si512(), U,
+ _MM_FROUND_CUR_DIRECTION));
+}
+
+#define _mm512_cvtts_roundps_epi32(A, R) \
+ ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16si)_mm512_undefined_epi32(), \
+ (__mmask16) - 1, (const int)(R)))
+
+#define _mm512_mask_cvtts_roundps_epi32(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16si)(__m512i)(W), (__mmask16)(U), \
+ (const int)(R)))
+
+#define _mm512_maskz_cvtts_roundps_epi32(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttps2dqs512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), (__mmask16)(U), \
+ (const int)(R)))
+
+// 512 bit : Float -> UInt
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epu32(__m512 A) {
+ return ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask(
+ (__v16sf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)-1,
+ _MM_FROUND_CUR_DIRECTION));
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttsps_epu32(__m512i W, __mmask16 U, __m512 A) {
+ return ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask(
+ (__v16sf)(A), (__v16si)(W), U, _MM_FROUND_CUR_DIRECTION));
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttsps_epu32(__mmask16 U, __m512 A) {
+ return ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask(
+ (__v16sf)(A), (__v16si)_mm512_setzero_si512(), U,
+ _MM_FROUND_CUR_DIRECTION));
+}
+
+#define _mm512_cvtts_roundps_epu32(A, R) \
+ ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16si)_mm512_undefined_epi32(), \
+ (__mmask16) - 1, (const int)(R)))
+
+#define _mm512_mask_cvtts_roundps_epu32(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16si)(__m512i)(W), (__mmask16)(U), \
+ (const int)(R)))
+
+#define _mm512_maskz_cvtts_roundps_epu32(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttps2udqs512_round_mask( \
+ (__v16sf)(__m512)(A), (__v16si)_mm512_setzero_si512(), (__mmask16)(U), \
+ (const int)(R)))
+
+#ifdef __x86_64__
+// 512 bit : Float -> Long
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epi64(__m256 A) {
+ return ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask(
+ (__v8sf)A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION));
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttsps_epi64(__m512i W, __mmask8 U, __m256 A) {
+ return ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask(
+ (__v8sf)A, (__v8di)W, U, _MM_FROUND_CUR_DIRECTION));
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttsps_epi64(__mmask8 U, __m256 A) {
+ return ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask(
+ (__v8sf)A, (__v8di)_mm512_setzero_si512(), U, _MM_FROUND_CUR_DIRECTION));
+}
+
+#define _mm512_cvtts_roundps_epi64(A, R) \
+ ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( \
+ (__v8sf)(__m256)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8) - 1, \
+ (const int)(R)))
+
+#define _mm512_mask_cvtts_roundps_epi64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( \
+ (__v8sf)(__m256)(A), (__v8di)(__m512i)(W), (__mmask8)(U), \
+ (const int)(R)))
+
+#define _mm512_maskz_cvtts_roundps_epi64(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttps2qqs512_round_mask( \
+ (__v8sf)(__m256)(A), (__v8di)_mm512_setzero_si512(), (__mmask8)(U), \
+ (const int)(R)))
+
+// 512 bit : Float -> ULong
+static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_cvttsps_epu64(__m256 A) {
+ return ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask(
+ (__v8sf)A, (__v8di)_mm512_undefined_epi32(), (__mmask8)-1,
+ _MM_FROUND_CUR_DIRECTION));
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_mask_cvttsps_epu64(__m512i W, __mmask8 U, __m256 A) {
+ return ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask(
+ (__v8sf)A, (__v8di)W, U, _MM_FROUND_CUR_DIRECTION));
+}
+
+static __inline__ __m512i __DEFAULT_FN_ATTRS
+_mm512_maskz_cvttsps_epu64(__mmask8 U, __m256 A) {
+ return ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask(
+ (__v8sf)A, (__v8di)_mm512_setzero_si512(), U, _MM_FROUND_CUR_DIRECTION));
+}
+
+#define _mm512_cvtts_roundps_epu64(A, R) \
+ ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( \
+ (__v8sf)(__m256)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8) - 1, \
+ (const int)(R)))
+
+#define _mm512_mask_cvtts_roundps_epu64(W, U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( \
+ (__v8sf)(__m256)(A), (__v8di)(__m512i)(W), (__mmask8)(U), \
+ (const int)(R)))
+
+#define _mm512_maskz_cvtts_roundps_epu64(U, A, R) \
+ ((__m512i)__builtin_ia32_vcvttps2uqqs512_round_mask( \
+ (__v8sf)(__m256)(A), (__v8di)_mm512_setzero_si512(), (__mmask8)(U), \
+ (const int)(R)))
+#endif
+
+#undef __DEFAULT_FN_ATTRS
+#endif // __AVX10_2_512SATCVTDSINTRIN_H
diff --git a/clang/lib/Headers/avx10_2satcvtdsintrin.h b/clang/lib/Headers/avx10_2satcvtdsintrin.h
new file mode 100644
index 00000000000000..5588c9ccfa4319
--- /dev/null
+++ b/clang/lib/Headers/avx10_2satcvtdsintrin.h
@@ -0,0 +1,453 @@
+/*===----------- avx10_2satcvtdsintrin.h - AVX10_2SATCVTDS intrinsics -------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avx10_2satcvtdsintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifndef __AVX10_2SATCVTDSINTRIN_H
+#define __AVX10_2SATCVTDSINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("avx10.2-256"), \
+ __min_vector_width__(256)))
+
+#define _mm_cvtt_roundssd_i32(A, R) \
+ ((int)__builtin_ia32_vcvttssd2si32((__v2df)(__m128)(A), (const int)(R)))
+
+#define _mm_cvtt_roundssd_si32(A, R) \
+ ((int)__builtin_ia32_vcvttssd2si32...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/102592
More information about the llvm-commits
mailing list