[clang] 406bde9 - [PowerPC] [Clang] Add SSE4 and BMI intrinsics implementation
Qiu Chaofan via cfe-commits
cfe-commits at lists.llvm.org
Thu Mar 24 05:07:55 PDT 2022
Author: Qiu Chaofan
Date: 2022-03-24T20:03:08+08:00
New Revision: 406bde9a15136254f2b10d9ef3a42033b3cb1b16
URL: https://github.com/llvm/llvm-project/commit/406bde9a15136254f2b10d9ef3a42033b3cb1b16
DIFF: https://github.com/llvm/llvm-project/commit/406bde9a15136254f2b10d9ef3a42033b3cb1b16.diff
LOG: [PowerPC] [Clang] Add SSE4 and BMI intrinsics implementation
Reviewed By: jsji
Differential Revision: https://reviews.llvm.org/D119407
Added:
clang/lib/Headers/ppc_wrappers/bmi2intrin.h
clang/lib/Headers/ppc_wrappers/bmiintrin.h
clang/lib/Headers/ppc_wrappers/immintrin.h
clang/lib/Headers/ppc_wrappers/nmmintrin.h
clang/lib/Headers/ppc_wrappers/x86gprintrin.h
clang/lib/Headers/ppc_wrappers/x86intrin.h
clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c
Modified:
clang/lib/Headers/CMakeLists.txt
clang/lib/Headers/ppc_wrappers/emmintrin.h
clang/lib/Headers/ppc_wrappers/pmmintrin.h
clang/lib/Headers/ppc_wrappers/smmintrin.h
clang/lib/Headers/ppc_wrappers/tmmintrin.h
clang/lib/Headers/ppc_wrappers/xmmintrin.h
clang/test/CodeGen/PowerPC/ppc-emmintrin.c
clang/test/CodeGen/PowerPC/ppc-smmintrin.c
clang/test/CodeGen/PowerPC/ppc-xmmintrin.c
Removed:
################################################################################
diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt
index e218a6a46a147..f1106b97bb382 100644
--- a/clang/lib/Headers/CMakeLists.txt
+++ b/clang/lib/Headers/CMakeLists.txt
@@ -164,6 +164,12 @@ set(ppc_wrapper_files
ppc_wrappers/pmmintrin.h
ppc_wrappers/tmmintrin.h
ppc_wrappers/smmintrin.h
+ ppc_wrappers/nmmintrin.h
+ ppc_wrappers/bmiintrin.h
+ ppc_wrappers/bmi2intrin.h
+ ppc_wrappers/immintrin.h
+ ppc_wrappers/x86intrin.h
+ ppc_wrappers/x86gprintrin.h
)
set(openmp_wrapper_files
diff --git a/clang/lib/Headers/ppc_wrappers/bmi2intrin.h b/clang/lib/Headers/ppc_wrappers/bmi2intrin.h
new file mode 100644
index 0000000000000..433fe0887a071
--- /dev/null
+++ b/clang/lib/Headers/ppc_wrappers/bmi2intrin.h
@@ -0,0 +1,133 @@
+/*===---- bmi2intrin.h - Implementation of BMI2 intrinsics on PowerPC ------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined X86GPRINTRIN_H_
+#error "Never use <bmi2intrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef BMI2INTRIN_H_
+#define BMI2INTRIN_H_
+
+/* Zero the bits of __X from bit index __Y upward (x86 BZHI).
+ Only the low 8 bits of __Y form the index. An index of 32 or more
+ returns __X unchanged and an index of 0 returns 0. Both cases are
+ handled without shifting by the full operand width, which is undefined
+ behaviour in C. */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _bzhi_u32(unsigned int __X, unsigned int __Y) {
+ const unsigned int __idx = __Y & 0xFF;
+ if (__idx >= 32)
+ return __X;
+ return __X & ((1U << __idx) - 1U);
+}
+
+/* Unsigned 32x32 -> 64-bit multiply: the upper 32 bits of the product are
+ stored through __P and the lower 32 bits are returned. */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mulx_u32(unsigned int __X, unsigned int __Y, unsigned int *__P) {
+ const unsigned long long __wide = (unsigned long long)__X * __Y;
+ const unsigned int __hi = (unsigned int)(__wide >> 32);
+ const unsigned int __lo = (unsigned int)__wide;
+ *__P = __hi;
+ return __lo;
+}
+
+#ifdef __PPC64__
+/* Zero the bits of __X from bit index __Y upward (x86 BZHI, 64-bit form).
+ Only the low 8 bits of __Y form the index. An index of 64 or more
+ returns __X unchanged and an index of 0 returns 0. Both cases are
+ handled without shifting by the full operand width, which is undefined
+ behaviour in C. */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _bzhi_u64(unsigned long long __X, unsigned long long __Y) {
+ const unsigned int __idx = (unsigned int)(__Y & 0xFF);
+ if (__idx >= 64)
+ return __X;
+ return __X & ((1ULL << __idx) - 1ULL);
+}
+
+/* Unsigned 64x64 -> 128-bit multiply: the upper 64 bits of the product are
+ stored through __P and the lower 64 bits are returned.
+ __int128 requires base 64-bit. */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _mulx_u64(unsigned long long __X, unsigned long long __Y,
+ unsigned long long *__P) {
+ const unsigned __int128 __wide = (unsigned __int128)__X * __Y;
+ const unsigned long long __hi = (unsigned long long)(__wide >> 64);
+ const unsigned long long __lo = (unsigned long long)__wide;
+ *__P = __hi;
+ return __lo;
+}
+
+#ifdef _ARCH_PWR7
+/* popcount and bpermd require power7 minimum. */
+/* Parallel bit deposit (x86 PDEP): scatters the low-order bits of __X into
+ the bit positions selected by the set bits of __M. Result bits whose
+ mask bit is clear stay zero, and a zero mask yields zero. */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _pdep_u64(unsigned long long __X, unsigned long long __M) {
+ unsigned long result = 0x0UL;
+ const unsigned long mask = 0x8000000000000000UL;
+ unsigned long m = __M;
+ unsigned long c, t;
+ unsigned long p;
+
+ /* The pop-count of the mask gives the number of the bits from
+ source to process. This is also needed to shift bits from the
+ source into the correct position for the result. */
+ p = 64 - __builtin_popcountl(__M);
+
+ /* The loop is for the number of '1' bits in the mask and clearing
+ each mask bit as it is processed. */
+ while (m != 0) {
+ /* c counts the leading zeros of the remaining mask, i.e. it locates
+ the highest mask bit not yet processed. */
+ c = __builtin_clzl(m);
+ /* Shift the source so the bit destined for that position lines up. */
+ t = __X << (p - c);
+ m ^= (mask >> c);
+ result |= (t & (mask >> c));
+ p++;
+ }
+ return (result);
+}
+
+/* Parallel bit extract (x86 PEXT): gathers the bits of __X selected by the
+ set bits of __M and packs them into the low-order bits of the result.
+ A compile-time-constant mask with at most 8 set bits goes through
+ __builtin_bpermd; every other mask is handled by a per-bit loop. */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _pext_u64(unsigned long long __X, unsigned long long __M) {
+ unsigned long p = 0x4040404040404040UL; // initial bit permute control
+ const unsigned long mask = 0x8000000000000000UL;
+ unsigned long m = __M;
+ unsigned long c;
+ unsigned long result;
+
+ /* if the mask is constant and selects 8 bits or less we can use
+ the Power8 Bit permute instruction. */
+ if (__builtin_constant_p(__M) && (__builtin_popcountl(__M) <= 8)) {
+ /* Also if the pext mask is constant, then the popcount is
+ constant, we can evaluate the following loop at compile
+ time and use a constant bit permute vector. */
+ for (long i = 0; i < __builtin_popcountl(__M); i++) {
+ /* Append the leading-zero count of the next mask bit as one more
+ control byte, highest mask bit first. */
+ c = __builtin_clzl(m);
+ p = (p << 8) | c;
+ m ^= (mask >> c);
+ }
+ result = __builtin_bpermd(p, __X);
+ } else {
+ p = 64 - __builtin_popcountl(__M);
+ result = 0;
+ /* We could use a for loop here, but that combined with
+ -funroll-loops can expand to a lot of code. The while
+ loop avoids unrolling and the compiler commons the xor
+ from clearing the mask bit with the (m != 0) test. The
+ result is a more compact loop setup and body. */
+ while (m != 0) {
+ unsigned long t;
+ c = __builtin_clzl(m);
+ /* Isolate the selected source bit and move it down to its packed
+ position in the result. */
+ t = (__X & (mask >> c)) >> (p - c);
+ m ^= (mask >> c);
+ result |= (t);
+ p++;
+ }
+ }
+ return (result);
+}
+
+/* these 32-bit implementations depend on 64-bit pdep/pext
+ which depend on _ARCH_PWR7. */
+/* 32-bit parallel bit deposit: widens both operands, delegates to
+ _pdep_u64 and narrows the result back to 32 bits. */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _pdep_u32(unsigned int __X, unsigned int __Y) {
+ const unsigned long long __wide = _pdep_u64(__X, __Y);
+ return (unsigned int)__wide;
+}
+
+/* 32-bit parallel bit extract: widens both operands, delegates to
+ _pext_u64 and narrows the result back to 32 bits. */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _pext_u32(unsigned int __X, unsigned int __Y) {
+ const unsigned long long __wide = _pext_u64(__X, __Y);
+ return (unsigned int)__wide;
+}
+#endif /* _ARCH_PWR7 */
+#endif /* __PPC64__ */
+
+#endif /* BMI2INTRIN_H_ */
diff --git a/clang/lib/Headers/ppc_wrappers/bmiintrin.h b/clang/lib/Headers/ppc_wrappers/bmiintrin.h
new file mode 100644
index 0000000000000..7d3315958c7b7
--- /dev/null
+++ b/clang/lib/Headers/ppc_wrappers/bmiintrin.h
@@ -0,0 +1,165 @@
+/*===---- bmiintrin.h - Implementation of BMI intrinsics on PowerPC --------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#if !defined X86GPRINTRIN_H_
+#error "Never use <bmiintrin.h> directly; include <x86gprintrin.h> instead."
+#endif
+
+#ifndef BMIINTRIN_H_
+#define BMIINTRIN_H_
+
+/* Count trailing zero bits of a 16-bit value (x86 TZCNT).
+ Returns 16 for a zero input; __builtin_ctz is undefined for 0, so that
+ case is handled explicitly. */
+extern __inline unsigned short
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __tzcnt_u16(unsigned short __X) {
+ return __X ? __builtin_ctz(__X) : 16;
+}
+
+/* Bitwise AND of __Y with the complement of __X (x86 ANDN). */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __andn_u32(unsigned int __X, unsigned int __Y) {
+ const unsigned int __inverted = ~__X;
+ return __Y & __inverted;
+}
+
+/* Extract __L contiguous bits of __X starting at bit __P (x86 BEXTR).
+ Only the low 8 bits of __P and __L are used. A start at or beyond
+ bit 32, or a zero length, yields 0; a field that runs past bit 31 is
+ truncated at the top of the operand. Written as shift-then-mask so no
+ shift count can reach the operand width (undefined behaviour in C). */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _bextr_u32(unsigned int __X, unsigned int __P, unsigned int __L) {
+ const unsigned int __start = __P & 0xFF;
+ const unsigned int __len = __L & 0xFF;
+ if (__start >= 32 || __len == 0)
+ return 0;
+ __X >>= __start;
+ if (__len < 32)
+ __X &= (1U << __len) - 1U;
+ return __X;
+}
+
+/* BEXTR with a packed control word: bits 7:0 of __Y give the start
+ position and bits 15:8 give the field length; the extraction itself is
+ done by _bextr_u32. */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __bextr_u32(unsigned int __X, unsigned int __Y) {
+ const unsigned int __start = __Y & 0xFF;
+ const unsigned int __len = (__Y >> 8) & 0xFF;
+ return _bextr_u32(__X, __start, __len);
+}
+
+/* Isolate the lowest set bit of __X (x86 BLSI); 0 when __X is 0. */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __blsi_u32(unsigned int __X) {
+ const unsigned int __negated = 0U - __X;
+ return __negated & __X;
+}
+
+/* Single-underscore spelling of __blsi_u32: isolate the lowest set bit. */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _blsi_u32(unsigned int __X) {
+ return (__X & -__X);
+}
+
+/* Mask covering the lowest set bit of __X and every bit below it
+ (x86 BLSMSK); all ones when __X is 0. */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __blsmsk_u32(unsigned int __X) {
+ const unsigned int __below = __X - 1U;
+ return __below ^ __X;
+}
+
+/* Single-underscore spelling of __blsmsk_u32: mask up to the lowest set
+ bit. */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _blsmsk_u32(unsigned int __X) {
+ return (__X ^ (__X - 1));
+}
+
+/* Clear the lowest set bit of __X (x86 BLSR); 0 stays 0. */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __blsr_u32(unsigned int __X) {
+ const unsigned int __below = __X - 1U;
+ return __below & __X;
+}
+
+/* Single-underscore spelling of __blsr_u32: clear the lowest set bit. */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _blsr_u32(unsigned int __X) {
+ return (__X & (__X - 1));
+}
+
+/* Count trailing zero bits of a 32-bit value (x86 TZCNT).
+ Returns 32 for a zero input; __builtin_ctz is undefined for 0, so that
+ case is handled explicitly. */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __tzcnt_u32(unsigned int __X) {
+ return __X ? (unsigned int)__builtin_ctz(__X) : 32U;
+}
+
+/* Single-underscore spelling of the 32-bit trailing-zero count.
+ Returns 32 for a zero input; __builtin_ctz is undefined for 0, so that
+ case is handled explicitly. */
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _tzcnt_u32(unsigned int __X) {
+ return __X ? (unsigned int)__builtin_ctz(__X) : 32U;
+}
+
+/* use the 64-bit shift, rotate, and count leading zeros instructions
+ for long long. */
+#ifdef __PPC64__
+/* Bitwise AND of __Y with the complement of __X (x86 ANDN, 64-bit form). */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __andn_u64(unsigned long long __X, unsigned long long __Y) {
+ const unsigned long long __inverted = ~__X;
+ return __Y & __inverted;
+}
+
+/* Extract __L contiguous bits of __X starting at bit __P (x86 BEXTR,
+ 64-bit form). Only the low 8 bits of __P and __L are used. A start at
+ or beyond bit 64, or a zero length, yields 0; a field that runs past
+ bit 63 is truncated at the top of the operand. Written as
+ shift-then-mask so no shift count can reach the operand width
+ (undefined behaviour in C). */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _bextr_u64(unsigned long long __X, unsigned int __P, unsigned int __L) {
+ const unsigned int __start = __P & 0xFF;
+ const unsigned int __len = __L & 0xFF;
+ if (__start >= 64 || __len == 0)
+ return 0;
+ __X >>= __start;
+ if (__len < 64)
+ __X &= (1ULL << __len) - 1ULL;
+ return __X;
+}
+
+/* BEXTR with a packed control word: bits 7:0 of __Y give the start
+ position and bits 15:8 give the field length; the extraction itself is
+ done by _bextr_u64. */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __bextr_u64(unsigned long long __X, unsigned long long __Y) {
+ const unsigned int __start = (unsigned int)(__Y & 0xFF);
+ const unsigned int __len = (unsigned int)((__Y >> 8) & 0xFF);
+ return _bextr_u64(__X, __start, __len);
+}
+
+/* Isolate the lowest set bit of __X (x86 BLSI, 64-bit form); 0 when __X
+ is 0. */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __blsi_u64(unsigned long long __X) {
+ const unsigned long long __negated = 0ULL - __X;
+ return __negated & __X;
+}
+
+/* Single-underscore spelling of __blsi_u64: isolate the lowest set bit. */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _blsi_u64(unsigned long long __X) {
+ return (__X & -__X);
+}
+
+/* Mask covering the lowest set bit of __X and every bit below it
+ (x86 BLSMSK, 64-bit form); all ones when __X is 0. */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __blsmsk_u64(unsigned long long __X) {
+ const unsigned long long __below = __X - 1ULL;
+ return __below ^ __X;
+}
+
+/* Single-underscore spelling of __blsmsk_u64: mask up to the lowest set
+ bit. */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _blsmsk_u64(unsigned long long __X) {
+ return (__X ^ (__X - 1));
+}
+
+/* Clear the lowest set bit of __X (x86 BLSR, 64-bit form); 0 stays 0. */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __blsr_u64(unsigned long long __X) {
+ const unsigned long long __below = __X - 1ULL;
+ return __below & __X;
+}
+
+/* Single-underscore spelling of __blsr_u64: clear the lowest set bit. */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _blsr_u64(unsigned long long __X) {
+ return (__X & (__X - 1));
+}
+
+/* Count trailing zero bits of a 64-bit value (x86 TZCNT).
+ Returns 64 for a zero input; __builtin_ctzll is undefined for 0, so
+ that case is handled explicitly. */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __tzcnt_u64(unsigned long long __X) {
+ return __X ? (unsigned long long)__builtin_ctzll(__X) : 64ULL;
+}
+
+/* Single-underscore spelling of the 64-bit trailing-zero count.
+ Returns 64 for a zero input; __builtin_ctzll is undefined for 0, so
+ that case is handled explicitly. */
+extern __inline unsigned long long
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ _tzcnt_u64(unsigned long long __X) {
+ return __X ? (unsigned long long)__builtin_ctzll(__X) : 64ULL;
+}
+#endif /* __PPC64__ */
+
+#endif /* BMIINTRIN_H_ */
diff --git a/clang/lib/Headers/ppc_wrappers/emmintrin.h b/clang/lib/Headers/ppc_wrappers/emmintrin.h
index 82a71788b27a8..278225f919758 100644
--- a/clang/lib/Headers/ppc_wrappers/emmintrin.h
+++ b/clang/lib/Headers/ppc_wrappers/emmintrin.h
@@ -405,20 +405,10 @@ _mm_cmpnge_pd (__m128d __A, __m128d __B)
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpord_pd (__m128d __A, __m128d __B)
{
-#if _ARCH_PWR8
__v2du c, d;
/* Compare against self will return false (0's) if NAN. */
c = (__v2du)vec_cmpeq (__A, __A);
d = (__v2du)vec_cmpeq (__B, __B);
-#else
- __v2du a, b;
- __v2du c, d;
- const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
- a = (__v2du)vec_abs ((__v2df)__A);
- b = (__v2du)vec_abs ((__v2df)__B);
- c = (__v2du)vec_cmpgt (double_exp_mask, a);
- d = (__v2du)vec_cmpgt (double_exp_mask, b);
-#endif
/* A != NAN and B != NAN. */
return ((__m128d)vec_and(c, d));
}
@@ -861,7 +851,11 @@ _mm_cvtpd_epi32 (__m128d __A)
: );
#ifdef _ARCH_PWR8
+#ifdef __LITTLE_ENDIAN__
temp = vec_mergeo (temp, temp);
+#else
+ temp = vec_mergee (temp, temp);
+#endif
result = (__v4si) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
@@ -896,7 +890,11 @@ _mm_cvtpd_ps (__m128d __A)
: );
#ifdef _ARCH_PWR8
+#ifdef __LITTLE_ENDIAN__
temp = vec_mergeo (temp, temp);
+#else
+ temp = vec_mergee (temp, temp);
+#endif
result = (__v4sf) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
@@ -925,7 +923,11 @@ _mm_cvttpd_epi32 (__m128d __A)
: );
#ifdef _ARCH_PWR8
+#ifdef __LITTLE_ENDIAN__
temp = vec_mergeo (temp, temp);
+#else
+ temp = vec_mergee (temp, temp);
+#endif
result = (__v4si) vec_vpkudum ((__vector long long) temp,
(__vector long long) vzero);
#else
@@ -1205,6 +1207,9 @@ _mm_loadl_pd (__m128d __A, double const *__B)
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pd (__m128d __A)
{
+#ifdef _ARCH_PWR10
+ return vec_extractm ((__v2du) __A);
+#else
__vector unsigned long long result;
static const __vector unsigned int perm_mask =
{
@@ -1224,6 +1229,7 @@ _mm_movemask_pd (__m128d __A)
#else
return result[0];
#endif
+#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
@@ -1434,6 +1440,7 @@ _mm_mul_su32 (__m64 __A, __m64 __B)
return ((__m64)a * (__m64)b);
}
+#ifdef _ARCH_PWR8
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mul_epu32 (__m128i __A, __m128i __B)
{
@@ -1460,6 +1467,7 @@ _mm_mul_epu32 (__m128i __A, __m128i __B)
return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
#endif
}
+#endif
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_epi16 (__m128i __A, int __B)
@@ -1749,7 +1757,7 @@ _mm_sll_epi64 (__m128i __A, __m128i __B)
lshift = vec_splat ((__v2du) __B, 0);
shmask = vec_cmplt (lshift, shmax);
result = vec_sl ((__v2du) __A, lshift);
- result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
+ result = vec_sel ((__v2du) shmask, result, shmask);
return (__m128i) result;
}
@@ -1843,7 +1851,7 @@ _mm_srl_epi64 (__m128i __A, __m128i __B)
rshift = vec_splat ((__v2du) __B, 0);
shmask = vec_cmplt (rshift, shmax);
result = vec_sr ((__v2du) __A, rshift);
- result = (__v2du)vec_sel ((__v2df) shmask, (__v2df)result, shmask);
+ result = vec_sel ((__v2du) shmask, result, shmask);
return (__m128i) result;
}
@@ -1995,10 +2003,14 @@ _mm_min_epu8 (__m128i __A, __m128i __B)
#ifdef _ARCH_PWR8
/* Intrinsic functions that require PowerISA 2.07 minimum. */
-/* Creates a 4-bit mask from the most significant bits of the SPFP values. */
+/* Return a mask created from the most significant bit of each 8-bit
+ element in A. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_epi8 (__m128i __A)
{
+#ifdef _ARCH_PWR10
+ return vec_extractm ((__v16qu) __A);
+#else
__vector unsigned long long result;
static const __vector unsigned char perm_mask =
{
@@ -2015,6 +2027,7 @@ _mm_movemask_epi8 (__m128i __A)
#else
return result[0];
#endif
+#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
@@ -2158,27 +2171,37 @@ extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __arti
_mm_sad_epu8 (__m128i __A, __m128i __B)
{
__v16qu a, b;
- __v16qu vmin, vmax, vabsdiff;
+ __v16qu vabsdiff;
__v4si vsum;
const __v4su zero = { 0, 0, 0, 0 };
__v4si result;
a = (__v16qu) __A;
b = (__v16qu) __B;
- vmin = vec_min (a, b);
- vmax = vec_max (a, b);
+#ifndef _ARCH_PWR9
+ __v16qu vmin = vec_min (a, b);
+ __v16qu vmax = vec_max (a, b);
vabsdiff = vec_sub (vmax, vmin);
+#else
+ vabsdiff = vec_absd (a, b);
+#endif
/* Sum four groups of bytes into integers. */
vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
+#ifdef __LITTLE_ENDIAN__
+ /* Sum across four integers with two integer results. */
+ asm ("vsum2sws %0,%1,%2" : "=v" (result) : "v" (vsum), "v" (zero));
+ /* Note: vec_sum2s could be used here, but on little-endian, vector
+ shifts are added that are not needed for this use-case.
+ A vector shift to correctly position the 32-bit integer results
+ (currently at [0] and [2]) to [1] and [3] would then need to be
+ swapped back again since the desired results are two 64-bit
+ integers ([1]|[0] and [3]|[2]). Thus, no shift is performed. */
+#else
/* Sum across four integers with two integer results. */
result = vec_sum2s (vsum, (__vector signed int) zero);
/* Rotate the sums into the correct position. */
-#ifdef __LITTLE_ENDIAN__
- result = vec_sld (result, result, 4);
-#else
result = vec_sld (result, result, 6);
#endif
- /* Rotate the sums into the correct position. */
return (__m128i) result;
}
diff --git a/clang/lib/Headers/ppc_wrappers/immintrin.h b/clang/lib/Headers/ppc_wrappers/immintrin.h
new file mode 100644
index 0000000000000..c1ada9889d4af
--- /dev/null
+++ b/clang/lib/Headers/ppc_wrappers/immintrin.h
@@ -0,0 +1,27 @@
+/*===---- immintrin.h - Implementation of Intel intrinsics on PowerPC ------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef IMMINTRIN_H_
+#define IMMINTRIN_H_
+
+#include <x86gprintrin.h>
+
+#include <mmintrin.h>
+
+#include <xmmintrin.h>
+
+#include <emmintrin.h>
+
+#include <pmmintrin.h>
+
+#include <tmmintrin.h>
+
+#include <smmintrin.h>
+
+#endif /* IMMINTRIN_H_ */
diff --git a/clang/lib/Headers/ppc_wrappers/nmmintrin.h b/clang/lib/Headers/ppc_wrappers/nmmintrin.h
new file mode 100644
index 0000000000000..789bba6bc0d33
--- /dev/null
+++ b/clang/lib/Headers/ppc_wrappers/nmmintrin.h
@@ -0,0 +1,26 @@
+/*===---- nmmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+ makes explicit use of Intel intrinsics to powerpc64le.
+ It is the user's responsibility to determine if the results are
+ acceptable and make additional changes as necessary.
+ Note that much code that uses Intel intrinsics can be rewritten in
+ standard C or GNU C extensions, which are more portable and better
+ optimized across multiple targets. */
+#endif
+
+#ifndef NMMINTRIN_H_
+#define NMMINTRIN_H_
+
+/* We just include SSE4.1 header file. */
+#include <smmintrin.h>
+
+#endif /* NMMINTRIN_H_ */
diff --git a/clang/lib/Headers/ppc_wrappers/pmmintrin.h b/clang/lib/Headers/ppc_wrappers/pmmintrin.h
index 8d4046bd43f1b..d8afdf6295507 100644
--- a/clang/lib/Headers/ppc_wrappers/pmmintrin.h
+++ b/clang/lib/Headers/ppc_wrappers/pmmintrin.h
@@ -111,17 +111,21 @@ _mm_hsub_pd (__m128d __X, __m128d __Y)
vec_mergel ((__v2df) __X, (__v2df)__Y));
}
+#ifdef _ARCH_PWR8
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movehdup_ps (__m128 __X)
{
return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X);
}
+#endif
+#ifdef _ARCH_PWR8
extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_moveldup_ps (__m128 __X)
{
return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X);
}
+#endif
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_loaddup_pd (double const *__P)
diff --git a/clang/lib/Headers/ppc_wrappers/smmintrin.h b/clang/lib/Headers/ppc_wrappers/smmintrin.h
index 674703245a699..9a69114f902c9 100644
--- a/clang/lib/Headers/ppc_wrappers/smmintrin.h
+++ b/clang/lib/Headers/ppc_wrappers/smmintrin.h
@@ -34,77 +34,683 @@
#include <altivec.h>
#include <tmmintrin.h>
-extern __inline int
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_extract_epi8(__m128i __X, const int __N) {
- return (unsigned char)((__v16qi)__X)[__N & 15];
+/* Rounding mode macros. */
+#define _MM_FROUND_TO_NEAREST_INT 0x00
+#define _MM_FROUND_TO_ZERO 0x01
+#define _MM_FROUND_TO_POS_INF 0x02
+#define _MM_FROUND_TO_NEG_INF 0x03
+#define _MM_FROUND_CUR_DIRECTION 0x04
+
+#define _MM_FROUND_NINT \
+ (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_FLOOR \
+ (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_CEIL \
+ (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_TRUNC \
+ (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_RINT \
+ (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
+#define _MM_FROUND_NEARBYINT \
+ (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
+
+#define _MM_FROUND_RAISE_EXC 0x00
+#define _MM_FROUND_NO_EXC 0x08
+
+/* Round both double-precision lanes of __A to integral values using the
+ mode selected by __rounding: one of the _MM_FROUND_TO_* values or
+ _MM_FROUND_CUR_DIRECTION, optionally OR'ed with _MM_FROUND_NO_EXC.
+ With _MM_FROUND_NO_EXC the FPSCR exception-enable bits are saved and
+ cleared before the operation and restored afterwards. */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_pd (__m128d __A, int __rounding)
+{
+ __v2df __r;
+ union {
+ double __fr;
+ long long __fpscr;
+ } __enables_save, __fpscr_save;
+
+ if (__rounding & _MM_FROUND_NO_EXC)
+ {
+ /* Save enabled exceptions, disable all exceptions,
+ and preserve the rounding mode. */
+#ifdef _ARCH_PWR9
+ __asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr));
+ __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
+#else
+ __fpscr_save.__fr = __builtin_mffs ();
+ __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
+ __fpscr_save.__fpscr &= ~0xf8;
+ __builtin_mtfsf (0b00000011, __fpscr_save.__fr);
+#endif
+ /* Insert an artificial "read/write" reference to the variable
+ read below, to ensure the compiler does not schedule
+ a read/use of the variable before the FPSCR is modified, above.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : "+wa" (__A));
+ }
+
+ switch (__rounding)
+ {
+ case _MM_FROUND_TO_NEAREST_INT:
+ __fpscr_save.__fr = __builtin_mffsl ();
+ __attribute__ ((fallthrough));
+ case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
+ __builtin_set_fpscr_rn (0b00);
+ /* Insert an artificial "read/write" reference to the variable
+ read below, to ensure the compiler does not schedule
+ a read/use of the variable before the FPSCR is modified, above.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : "+wa" (__A));
+
+ __r = vec_rint ((__v2df) __A);
+
+ /* Insert an artificial "read" reference to the variable written
+ above, to ensure the compiler does not schedule the computation
+ of the value after the manipulation of the FPSCR, below.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : : "wa" (__r));
+ __builtin_set_fpscr_rn (__fpscr_save.__fpscr);
+ break;
+ case _MM_FROUND_TO_NEG_INF:
+ case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
+ __r = vec_floor ((__v2df) __A);
+ break;
+ case _MM_FROUND_TO_POS_INF:
+ case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
+ __r = vec_ceil ((__v2df) __A);
+ break;
+ case _MM_FROUND_TO_ZERO:
+ case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
+ __r = vec_trunc ((__v2df) __A);
+ break;
+ case _MM_FROUND_CUR_DIRECTION:
+ case _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC:
+ default:
+ /* _MM_FROUND_NEARBYINT (CUR_DIRECTION | NO_EXC) and any other
+ value land here, so __r is never returned uninitialized. */
+ __r = vec_rint ((__v2df) __A);
+ break;
+ }
+ if (__rounding & _MM_FROUND_NO_EXC)
+ {
+ /* Insert an artificial "read" reference to the variable written
+ above, to ensure the compiler does not schedule the computation
+ of the value after the manipulation of the FPSCR, below.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : : "wa" (__r));
+ /* Restore enabled exceptions. */
+ __fpscr_save.__fr = __builtin_mffsl ();
+ __fpscr_save.__fpscr |= __enables_save.__fpscr;
+ __builtin_mtfsf (0b00000011, __fpscr_save.__fr);
+ }
+ return (__m128d) __r;
}
-extern __inline int
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_extract_epi32(__m128i __X, const int __N) {
+/* Scalar double rounding: element 0 of the result is element 0 of __B
+ rounded via _mm_round_pd with __rounding; element 1 is taken from __A. */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_sd (__m128d __A, __m128d __B, int __rounding)
+{
+ const __v2df __rounded = (__v2df) _mm_round_pd (__B, __rounding);
+ __v2df __r = (__v2df) __A;
+ __r[0] = __rounded[0];
+ return (__m128d) __r;
+}
+
+/* Round all four single-precision lanes of __A to integral values using
+ the mode selected by __rounding: one of the _MM_FROUND_TO_* values or
+ _MM_FROUND_CUR_DIRECTION, optionally OR'ed with _MM_FROUND_NO_EXC.
+ With _MM_FROUND_NO_EXC the FPSCR exception-enable bits are saved and
+ cleared before the operation and restored afterwards. */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_ps (__m128 __A, int __rounding)
+{
+ __v4sf __r;
+ union {
+ double __fr;
+ long long __fpscr;
+ } __enables_save, __fpscr_save;
+
+ if (__rounding & _MM_FROUND_NO_EXC)
+ {
+ /* Save enabled exceptions, disable all exceptions,
+ and preserve the rounding mode. */
+#ifdef _ARCH_PWR9
+ __asm__ ("mffsce %0" : "=f" (__fpscr_save.__fr));
+ __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
+#else
+ __fpscr_save.__fr = __builtin_mffs ();
+ __enables_save.__fpscr = __fpscr_save.__fpscr & 0xf8;
+ __fpscr_save.__fpscr &= ~0xf8;
+ __builtin_mtfsf (0b00000011, __fpscr_save.__fr);
+#endif
+ /* Insert an artificial "read/write" reference to the variable
+ read below, to ensure the compiler does not schedule
+ a read/use of the variable before the FPSCR is modified, above.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : "+wa" (__A));
+ }
+
+ switch (__rounding)
+ {
+ case _MM_FROUND_TO_NEAREST_INT:
+ __fpscr_save.__fr = __builtin_mffsl ();
+ __attribute__ ((fallthrough));
+ case _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC:
+ __builtin_set_fpscr_rn (0b00);
+ /* Insert an artificial "read/write" reference to the variable
+ read below, to ensure the compiler does not schedule
+ a read/use of the variable before the FPSCR is modified, above.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : "+wa" (__A));
+
+ __r = vec_rint ((__v4sf) __A);
+
+ /* Insert an artificial "read" reference to the variable written
+ above, to ensure the compiler does not schedule the computation
+ of the value after the manipulation of the FPSCR, below.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : : "wa" (__r));
+ __builtin_set_fpscr_rn (__fpscr_save.__fpscr);
+ break;
+ case _MM_FROUND_TO_NEG_INF:
+ case _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC:
+ __r = vec_floor ((__v4sf) __A);
+ break;
+ case _MM_FROUND_TO_POS_INF:
+ case _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC:
+ __r = vec_ceil ((__v4sf) __A);
+ break;
+ case _MM_FROUND_TO_ZERO:
+ case _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC:
+ __r = vec_trunc ((__v4sf) __A);
+ break;
+ case _MM_FROUND_CUR_DIRECTION:
+ case _MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC:
+ default:
+ /* _MM_FROUND_NEARBYINT (CUR_DIRECTION | NO_EXC) and any other
+ value land here, so __r is never returned uninitialized. */
+ __r = vec_rint ((__v4sf) __A);
+ break;
+ }
+ if (__rounding & _MM_FROUND_NO_EXC)
+ {
+ /* Insert an artificial "read" reference to the variable written
+ above, to ensure the compiler does not schedule the computation
+ of the value after the manipulation of the FPSCR, below.
+ This can be removed if and when GCC PR102783 is fixed.
+ */
+ __asm__ ("" : : "wa" (__r));
+ /* Restore enabled exceptions. */
+ __fpscr_save.__fr = __builtin_mffsl ();
+ __fpscr_save.__fpscr |= __enables_save.__fpscr;
+ __builtin_mtfsf (0b00000011, __fpscr_save.__fr);
+ }
+ return (__m128) __r;
+}
+
+/* Scalar float rounding: element 0 of the result is element 0 of __B
+ rounded via _mm_round_ps with __rounding; elements 1-3 are taken from
+ __A. */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_round_ss (__m128 __A, __m128 __B, int __rounding)
+{
+ const __v4sf __rounded = (__v4sf) _mm_round_ps (__B, __rounding);
+ const __v4sf __upper = (__v4sf) __A;
+ __v4sf __r = { __rounded[0], __upper[1], __upper[2], __upper[3] };
+ return (__m128) __r;
+}
+
+#define _mm_ceil_pd(V) _mm_round_pd ((V), _MM_FROUND_CEIL)
+#define _mm_ceil_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_CEIL)
+
+#define _mm_floor_pd(V) _mm_round_pd((V), _MM_FROUND_FLOOR)
+#define _mm_floor_sd(D, V) _mm_round_sd ((D), (V), _MM_FROUND_FLOOR)
+
+#define _mm_ceil_ps(V) _mm_round_ps ((V), _MM_FROUND_CEIL)
+#define _mm_ceil_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_CEIL)
+
+#define _mm_floor_ps(V) _mm_round_ps ((V), _MM_FROUND_FLOOR)
+#define _mm_floor_ss(D, V) _mm_round_ss ((D), (V), _MM_FROUND_FLOOR)
+
+/* Return a copy of __A with byte element (__N & 0xf) replaced by __D. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi8 (__m128i const __A, int const __D, int const __N)
+{
+ const int __lane = __N & 0xf;
+ __v16qi __bytes = (__v16qi)__A;
+ __bytes [__lane] = __D;
+ return (__m128i) __bytes;
+}
+
+/* Return a copy of __A with 32-bit element (__N & 3) replaced by __D. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi32 (__m128i const __A, int const __D, int const __N)
+{
+ const int __lane = __N & 3;
+ __v4si __words = (__v4si)__A;
+ __words [__lane] = __D;
+ return (__m128i) __words;
+}
+
+/* Return a copy of __A with 64-bit element (__N & 1) replaced by __D. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_insert_epi64 (__m128i const __A, long long const __D, int const __N)
+{
+ const int __lane = __N & 1;
+ __v2di __dwords = (__v2di)__A;
+ __dwords [__lane] = __D;
+ return (__m128i) __dwords;
+}
+
+/* Return byte element (__N & 15) of __X, zero-extended to int. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi8 (__m128i __X, const int __N)
+{
+ const __v16qi __bytes = (__v16qi)__X;
+ const unsigned char __byte = (unsigned char) __bytes[__N & 15];
+ return __byte;
+}
+
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi32 (__m128i __X, const int __N)
+{
return ((__v4si)__X)[__N & 3];
}
-extern __inline int
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_extract_epi64(__m128i __X, const int __N) {
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_epi64 (__m128i __X, const int __N)
+{
return ((__v2di)__X)[__N & 1];
}
-extern __inline int
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_extract_ps(__m128 __X, const int __N) {
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_extract_ps (__m128 __X, const int __N)
+{
return ((__v4si)__X)[__N & 3];
}
+#ifdef _ARCH_PWR8
+/* Blend 16-bit elements of __A and __B under control of __imm8.
+ __imm8 is splatted to every byte, run through vec_gb and widened with
+ vec_unpackh to form a per-halfword mask, which vec_sel then applies to
+ __A and __B. On big-endian targets the mask is element-reversed first.
+ NOTE(review): the exact bit-to-lane mapping relies on vec_gb and
+ vec_unpackh semantics -- confirm against the AltiVec documentation. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_epi16 (__m128i __A, __m128i __B, const int __imm8)
+{
+ __v16qi __charmask = vec_splats ((signed char) __imm8);
+ __charmask = vec_gb (__charmask);
+ __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);
+ #ifdef __BIG_ENDIAN__
+ __shortmask = vec_reve (__shortmask);
+ #endif
+ return (__m128i) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask);
+}
+#endif
+
+/* Byte-wise variable blend of __A and __B controlled by __mask.
+ Power10 uses vec_blendv directly. Otherwise each mask byte is shifted
+ right by 7 with vec_sra (intended to replicate the byte's most
+ significant bit across the whole byte) and the result drives vec_sel. */
+extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask)
+{
+#ifdef _ARCH_PWR10
+ return (__m128i) vec_blendv ((__v16qi) __A, (__v16qi) __B, (__v16qu) __mask);
+#else
+ const __v16qu __seven = vec_splats ((unsigned char) 0x07);
+ __v16qu __lmask = vec_sra ((__v16qu) __mask, __seven);
+ return (__m128i) vec_sel ((__v16qi) __A, (__v16qi) __B, __lmask);
+#endif
+}
+
+/* Blend 32-bit float elements of __A and __B: bit i of __imm8 selects
+ element i from __B when set and from __A when clear, implemented as a
+ vec_perm with a control vector looked up from a 16-entry table.
+ Only the low 4 bits of __imm8 are used; masking keeps the table lookup
+ in bounds for any argument value. */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8)
+{
+ __v16qu __pcv[] =
+ {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 },
+ { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 12, 13, 14, 15 },
+ { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
+ { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 },
+ { 16, 17, 18, 19, 4, 5, 6, 7, 8, 9, 10, 11, 28, 29, 30, 31 },
+ { 0, 1, 2, 3, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 },
+ { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 28, 29, 30, 31 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 },
+ { 16, 17, 18, 19, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 },
+ { 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
+ { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
+ };
+ __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8 & 0xf]);
+ return (__m128) __r;
+}
+
+/* Variable blend of 32-bit float elements of __A and __B controlled by
+ __mask. Power10 uses vec_blendv directly. Otherwise the mask is
+ reinterpreted as signed integers and compared against zero, so elements
+ whose mask value is negative (sign bit set) produce the select mask
+ handed to vec_sel. */
+extern __inline __m128
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask)
+{
+#ifdef _ARCH_PWR10
+ return (__m128) vec_blendv ((__v4sf) __A, (__v4sf) __B, (__v4su) __mask);
+#else
+ const __v4si __zero = {0};
+ const __vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero);
+ return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask);
+#endif
+}
+
+/* Blend 64-bit double elements of __A and __B: bit i of __imm8 selects
+ element i from __B when set and from __A when clear, implemented as a
+ vec_perm with a control vector looked up from a 4-entry table.
+ Only the low 2 bits of __imm8 are used; masking keeps the table lookup
+ in bounds for any argument value. */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8)
+{
+ __v16qu __pcv[] =
+ {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31 },
+ { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }
+ };
+ __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu)__B, __pcv[__imm8 & 0x3]);
+ return (__m128d) __r;
+}
+
+#ifdef _ARCH_PWR8
+/* Variable blend of 64-bit double elements of __A and __B controlled by
+ __mask. Power10 uses vec_blendv directly. Otherwise the mask is
+ reinterpreted as signed 64-bit integers and compared against zero, so
+ elements whose mask value is negative (sign bit set) produce the select
+ mask handed to vec_sel. */
+extern __inline __m128d
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask)
+{
+#ifdef _ARCH_PWR10
+ return (__m128d) vec_blendv ((__v2df) __A, (__v2df) __B, (__v2du) __mask);
+#else
+ const __v2di __zero = {0};
+ const __vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero);
+ return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask);
+#endif
+}
+#endif
+
+/* Return non-zero when (__A & __B) is all zero bits.
+ Note: This implementation does NOT set "zero" or "carry" flags. */
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testz_si128 (__m128i __A, __m128i __B)
+{
+ const __v16qu __none = {0};
+ const __v16qu __common = vec_and ((__v16qu) __A, (__v16qu) __B);
+ return vec_all_eq (__common, __none);
+}
+
+/* Return non-zero when (~__A & __B) is all zero bits.
+ Note: This implementation does NOT set "zero" or "carry" flags. */
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testc_si128 (__m128i __A, __m128i __B)
+{
+ const __v16qu __none = {0};
+ const __v16qu __inverted = vec_nor ((__v16qu) __A, (__v16qu) __A);
+ const __v16qu __leftover = vec_and ((__v16qu) __inverted, (__v16qu) __B);
+ return vec_all_eq (__leftover, __none);
+}
+
+/* Return 1 when both _mm_testz_si128 and _mm_testc_si128 report 0 for
+ (__A, __B), otherwise 0.
+ Note: This implementation does NOT set "zero" or "carry" flags. */
+extern __inline int
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_testnzc_si128 (__m128i __A, __m128i __B)
+{
+ return !(_mm_testz_si128 (__A, __B) != 0 || _mm_testc_si128 (__A, __B) != 0);
+}
+
+#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
+
+#define _mm_test_all_ones(V) \
+ _mm_testc_si128 ((V), _mm_cmpeq_epi32 ((V), (V)))
+
+#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128 ((M), (V))
+
+#ifdef _ARCH_PWR8
extern __inline __m128i
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) {
- __v16qi __charmask = vec_splats((signed char)__imm8);
- __charmask = vec_gb(__charmask);
- __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask);
-#ifdef __BIG_ENDIAN__
- __shortmask = vec_reve(__shortmask);
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpeq_epi64 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_cmpeq ((__v2di) __X, (__v2di) __Y);
+}
#endif
- return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask);
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi8 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_min ((__v16qi)__X, (__v16qi)__Y);
}
extern __inline __m128i
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) {
- const __v16qu __seven = vec_splats((unsigned char)0x07);
- __v16qu __lmask = vec_sra((__v16qu)__mask, __seven);
- return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask);
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu16 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_min ((__v8hu)__X, (__v8hu)__Y);
}
extern __inline __m128i
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_insert_epi8(__m128i const __A, int const __D, int const __N) {
- __v16qi result = (__v16qi)__A;
- result[__N & 0xf] = __D;
- return (__m128i)result;
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_min ((__v4si)__X, (__v4si)__Y);
}
extern __inline __m128i
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_insert_epi32(__m128i const __A, int const __D, int const __N) {
- __v4si result = (__v4si)__A;
- result[__N & 3] = __D;
- return (__m128i)result;
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_min_epu32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_min ((__v4su)__X, (__v4su)__Y);
}
extern __inline __m128i
- __attribute__((__gnu_inline__, __always_inline__, __artificial__))
- _mm_insert_epi64(__m128i const __A, long long const __D, int const __N) {
- __v2di result = (__v2di)__A;
- result[__N & 1] = __D;
- return (__m128i)result;
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi8 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_max ((__v16qi)__X, (__v16qi)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu16 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_max ((__v8hu)__X, (__v8hu)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_max ((__v4si)__X, (__v4si)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_max_epu32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_max ((__v4su)__X, (__v4su)__Y);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mullo_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_mul ((__v4su) __X, (__v4su) __Y);
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mul_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_mule ((__v4si) __X, (__v4si) __Y);
+}
+#endif
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi16 (__m128i __A)
+{
+ return (__m128i) vec_unpackh ((__v16qi) __A);
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi32 (__m128i __A)
+{
+ __A = (__m128i) vec_unpackh ((__v16qi) __A);
+ return (__m128i) vec_unpackh ((__v8hi) __A);
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi8_epi64 (__m128i __A)
+{
+ __A = (__m128i) vec_unpackh ((__v16qi) __A);
+ __A = (__m128i) vec_unpackh ((__v8hi) __A);
+ return (__m128i) vec_unpackh ((__v4si) __A);
+}
+#endif
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_epi32 (__m128i __A)
+{
+ return (__m128i) vec_unpackh ((__v8hi) __A);
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi16_epi64 (__m128i __A)
+{
+ __A = (__m128i) vec_unpackh ((__v8hi) __A);
+ return (__m128i) vec_unpackh ((__v4si) __A);
+}
+#endif
+
+#ifdef _ARCH_PWR8
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepi32_epi64 (__m128i __A)
+{
+ return (__m128i) vec_unpackh ((__v4si) __A);
}
+#endif
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi16 (__m128i __A)
+{
+ const __v16qu __zero = {0};
+#ifdef __LITTLE_ENDIAN__
+ __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
+#else /* __BIG_ENDIAN__. */
+ __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
+#endif /* __BIG_ENDIAN__. */
+ return __A;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi32 (__m128i __A)
+{
+ const __v16qu __zero = {0};
+#ifdef __LITTLE_ENDIAN__
+ __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
+ __A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero);
+#else /* __BIG_ENDIAN__. */
+ __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
+ __A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A);
+#endif /* __BIG_ENDIAN__. */
+ return __A;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu8_epi64 (__m128i __A)
+{
+ const __v16qu __zero = {0};
+#ifdef __LITTLE_ENDIAN__
+ __A = (__m128i) vec_mergeh ((__v16qu) __A, __zero);
+ __A = (__m128i) vec_mergeh ((__v8hu) __A, (__v8hu) __zero);
+ __A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero);
+#else /* __BIG_ENDIAN__. */
+ __A = (__m128i) vec_mergeh (__zero, (__v16qu) __A);
+ __A = (__m128i) vec_mergeh ((__v8hu) __zero, (__v8hu) __A);
+ __A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A);
+#endif /* __BIG_ENDIAN__. */
+ return __A;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu16_epi32 (__m128i __A)
+{
+ const __v8hu __zero = {0};
+#ifdef __LITTLE_ENDIAN__
+ __A = (__m128i) vec_mergeh ((__v8hu) __A, __zero);
+#else /* __BIG_ENDIAN__. */
+ __A = (__m128i) vec_mergeh (__zero, (__v8hu) __A);
+#endif /* __BIG_ENDIAN__. */
+ return __A;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu16_epi64 (__m128i __A)
+{
+ const __v8hu __zero = {0};
+#ifdef __LITTLE_ENDIAN__
+ __A = (__m128i) vec_mergeh ((__v8hu) __A, __zero);
+ __A = (__m128i) vec_mergeh ((__v4su) __A, (__v4su) __zero);
+#else /* __BIG_ENDIAN__. */
+ __A = (__m128i) vec_mergeh (__zero, (__v8hu) __A);
+ __A = (__m128i) vec_mergeh ((__v4su) __zero, (__v4su) __A);
+#endif /* __BIG_ENDIAN__. */
+ return __A;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cvtepu32_epi64 (__m128i __A)
+{
+ const __v4su __zero = {0};
+#ifdef __LITTLE_ENDIAN__
+ __A = (__m128i) vec_mergeh ((__v4su) __A, __zero);
+#else /* __BIG_ENDIAN__. */
+ __A = (__m128i) vec_mergeh (__zero, (__v4su) __A);
+#endif /* __BIG_ENDIAN__. */
+ return __A;
+}
+
+/* Return the minimum unsigned 16-bit element of __A in bits [15:0] and the
+ index of its first occurrence in bits [18:16]; all other bits are zero. */
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_minpos_epu16 (__m128i __A)
+{
+ union __u
+ {
+ __m128i __m;
+ __v8hu __uh;
+ };
+ union __u __u = { .__m = __A }, __r = { .__m = {0} };
+ unsigned short __ridx = 0;
+ unsigned short __rmin = __u.__uh[__ridx];
+ for (unsigned long __i = 1; __i < 8; __i++)
+ {
+ if (__u.__uh[__i] < __rmin)
+ {
+ __rmin = __u.__uh[__i];
+ __ridx = __i;
+ }
+ }
+ __r.__uh[0] = __rmin;
+ __r.__uh[1] = __ridx;
+ return __r.__m;
+}
+
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_packus_epi32 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_packsu ((__v4si) __X, (__v4si) __Y);
+}
+
+#ifdef _ARCH_PWR8
+extern __inline __m128i
+__attribute__ ((__gnu_inline__, __always_inline__, __artificial__))
+_mm_cmpgt_epi64 (__m128i __X, __m128i __Y)
+{
+ return (__m128i) vec_cmpgt ((__v2di) __X, (__v2di) __Y);
+}
+#endif
#else
#include_next <smmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
-#endif /* _SMMINTRIN_H_ */
+#endif /* SMMINTRIN_H_ */
diff --git a/clang/lib/Headers/ppc_wrappers/tmmintrin.h b/clang/lib/Headers/ppc_wrappers/tmmintrin.h
index ebef7b8192d75..6e4677d8a6eff 100644
--- a/clang/lib/Headers/ppc_wrappers/tmmintrin.h
+++ b/clang/lib/Headers/ppc_wrappers/tmmintrin.h
@@ -339,6 +339,7 @@ _mm_shuffle_pi8 (__m64 __A, __m64 __B)
return (__m64) ((__v2du) (__C))[0];
}
+#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
@@ -350,7 +351,9 @@ _mm_sign_epi8 (__m128i __A, __m128i __B)
__v16qi __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}
+#endif
+#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
@@ -362,7 +365,9 @@ _mm_sign_epi16 (__m128i __A, __m128i __B)
__v8hi __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}
+#endif
+#ifdef _ARCH_PWR8
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
@@ -374,7 +379,9 @@ _mm_sign_epi32 (__m128i __A, __m128i __B)
__v4si __conv = vec_add (__selectneg, __selectpos);
return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}
+#endif
+#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
@@ -385,7 +392,9 @@ _mm_sign_pi8 (__m64 __A, __m64 __B)
__C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
}
+#endif
+#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
@@ -396,7 +405,9 @@ _mm_sign_pi16 (__m64 __A, __m64 __B)
__C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
}
+#endif
+#ifdef _ARCH_PWR8
extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
@@ -407,6 +418,7 @@ _mm_sign_pi32 (__m64 __A, __m64 __B)
__C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
return (__m64) ((__v2du) (__C))[0];
}
+#endif
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
diff --git a/clang/lib/Headers/ppc_wrappers/x86gprintrin.h b/clang/lib/Headers/ppc_wrappers/x86gprintrin.h
new file mode 100644
index 0000000000000..cbfac262395c4
--- /dev/null
+++ b/clang/lib/Headers/ppc_wrappers/x86gprintrin.h
@@ -0,0 +1,17 @@
+/*===--- x86gprintrin.h - Implementation of X86 GPR intrinsics on PowerPC --===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef X86GPRINTRIN_H_
+#define X86GPRINTRIN_H_
+
+#include <bmiintrin.h>
+
+#include <bmi2intrin.h>
+
+#endif /* X86GPRINTRIN_H_ */
diff --git a/clang/lib/Headers/ppc_wrappers/x86intrin.h b/clang/lib/Headers/ppc_wrappers/x86intrin.h
new file mode 100644
index 0000000000000..f5c201262e69e
--- /dev/null
+++ b/clang/lib/Headers/ppc_wrappers/x86intrin.h
@@ -0,0 +1,28 @@
+/*===---- x86intrin.h - Implementation of X86 intrinsics on PowerPC --------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+ makes explicit use of Intel intrinsics to powerpc64le.
+ It is the user's responsibility to determine if the results are
+ acceptable and make additional changes as necessary.
+ Note that much code that uses Intel intrinsics can be rewritten in
+ standard C or GNU C extensions, which are more portable and better
+ optimized across multiple targets. */
+#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#endif
+
+#ifndef X86INTRIN_H_
+#define X86INTRIN_H_
+
+#ifdef __ALTIVEC__
+#include <immintrin.h>
+#endif /* __ALTIVEC__ */
+
+#endif /* X86INTRIN_H_ */
diff --git a/clang/lib/Headers/ppc_wrappers/xmmintrin.h b/clang/lib/Headers/ppc_wrappers/xmmintrin.h
index 956603d364082..14b9cd1796d6e 100644
--- a/clang/lib/Headers/ppc_wrappers/xmmintrin.h
+++ b/clang/lib/Headers/ppc_wrappers/xmmintrin.h
@@ -31,8 +31,8 @@
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
-#ifndef _XMMINTRIN_H_INCLUDED
-#define _XMMINTRIN_H_INCLUDED
+#ifndef XMMINTRIN_H_
+#define XMMINTRIN_H_
#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))
@@ -881,7 +881,7 @@ _mm_cvtss_f32 (__m128 __A)
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si32 (__m128 __A)
{
- __m64 res = 0;
+ int res;
#ifdef _ARCH_PWR8
double dtmp;
__asm__(
@@ -914,8 +914,8 @@ _mm_cvt_ss2si (__m128 __A)
extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cvtss_si64 (__m128 __A)
{
- __m64 res = 0;
-#ifdef _ARCH_PWR8
+ long long res;
+#if defined (_ARCH_PWR8) && defined (__powerpc64__)
double dtmp;
__asm__(
#ifdef __LITTLE_ENDIAN__
@@ -1328,6 +1328,9 @@ _mm_storel_pi (__m64 *__P, __m128 __A)
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_ps (__m128 __A)
{
+#ifdef _ARCH_PWR10
+ return vec_extractm ((__vector unsigned int) __A);
+#else
__vector unsigned long long result;
static const __vector unsigned int perm_mask =
{
@@ -1347,6 +1350,7 @@ _mm_movemask_ps (__m128 __A)
#else
return result[0];
#endif
+#endif /* !_ARCH_PWR10 */
}
#endif /* _ARCH_PWR8 */
@@ -1553,6 +1557,7 @@ _m_pminub (__m64 __A, __m64 __B)
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
+#ifdef __powerpc64__
unsigned long long p =
#ifdef __LITTLE_ENDIAN__
0x0008101820283038UL; // permute control for sign bits
@@ -1560,6 +1565,18 @@ _mm_movemask_pi8 (__m64 __A)
0x3830282018100800UL; // permute control for sign bits
#endif
return __builtin_bpermd (p, __A);
+#else
+#ifdef __LITTLE_ENDIAN__
+ unsigned int mask = 0x20283038UL;
+ unsigned int r1 = __builtin_bpermd (mask, __A) & 0xf;
+ unsigned int r2 = __builtin_bpermd (mask, __A >> 32) & 0xf;
+#else
+ unsigned int mask = 0x38302820UL;
+ unsigned int r1 = __builtin_bpermd (mask, __A >> 32) & 0xf;
+ unsigned int r2 = __builtin_bpermd (mask, __A) & 0xf;
+#endif
+ return (r2 << 4) | r1;
+#endif
}
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
@@ -1841,4 +1858,4 @@ do { \
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
*/
-#endif /* _XMMINTRIN_H_INCLUDED */
+#endif /* XMMINTRIN_H_ */
diff --git a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c
index 6e4a2e4309d4a..33af7b91af788 100644
--- a/clang/test/CodeGen/PowerPC/ppc-emmintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-emmintrin.c
@@ -5,6 +5,9 @@
// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-LE
+// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK-P10-LE
+
// CHECK-BE-DAG: @_mm_movemask_pd.perm_mask = internal constant <4 x i32> <i32 -2139062144, i32 -2139062144, i32 -2139062144, i32 -2139078656>, align 16
// CHECK-BE-DAG: @_mm_shuffle_epi32.permute_selectors = internal constant [4 x i32] [i32 66051, i32 67438087, i32 134810123, i32 202182159], align 4
// CHECK-BE-DAG: @_mm_shufflehi_epi16.permute_selectors = internal constant [4 x i16] [i16 2057, i16 2571, i16 3085, i16 3599], align 2
@@ -440,7 +443,8 @@ test_converts() {
// CHECK: call <2 x double> @vec_rint(double vector[2])
// CHECK: store <4 x i32> zeroinitializer, <4 x i32>* %{{[0-9a-zA-Z_.]+}}, align 16
// CHECK: call <4 x i32> asm "xvcvdpsxws ${0:x},${1:x}", "=^wa,^wa"(<2 x double> %{{[0-9a-zA-Z_.]+}})
-// CHECK: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
+// CHECK-LE: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
+// CHECK-BE: call <4 x i32> @vec_mergee(int vector[4], int vector[4])
// CHECK: call <4 x i32> @vec_vpkudum(long long vector[2], long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef zeroinitializer)
// CHECK-LABEL: define available_externally i64 @_mm_cvtpd_pi32
@@ -450,7 +454,8 @@ test_converts() {
// CHECK-LABEL: define available_externally <4 x float> @_mm_cvtpd_ps
// CHECK: store <4 x i32> zeroinitializer, <4 x i32>* %{{[0-9a-zA-Z_.]+}}, align 16
// CHECK: call <4 x i32> asm "xvcvdpsp ${0:x},${1:x}", "=^wa,^wa"(<2 x double> %{{[0-9a-zA-Z_.]+}})
-// CHECK: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
+// CHECK-LE: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
+// CHECK-BE: call <4 x i32> @vec_mergee(int vector[4], int vector[4])
// CHECK: call <4 x i32> @vec_vpkudum(long long vector[2], long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef zeroinitializer)
// CHECK-LABEL: define available_externally <2 x double> @_mm_cvtpi32_pd
@@ -530,7 +535,8 @@ test_converts() {
// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvttpd_epi32
// CHECK: call <4 x i32> asm "xvcvdpsxws ${0:x},${1:x}", "=^wa,^wa"
-// CHECK: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
+// CHECK-LE: call <4 x i32> @vec_mergeo(int vector[4], int vector[4])
+// CHECK-BE: call <4 x i32> @vec_mergee(int vector[4], int vector[4])
// CHECK: call <4 x i32> @vec_vpkudum(long long vector[2], long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef zeroinitializer)
// CHECK-LABEL: define available_externally i64 @_mm_cvttpd_pi32
@@ -756,12 +762,18 @@ test_move() {
// CHECK: %[[EXT:[0-9a-zA-Z_.]+]] = extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK: insertelement <2 x double> %{{[0-9a-zA-Z_.]+}}, double %[[EXT]], i32 0
+// CHECK-P10-LE-LABEL: define available_externally signext i32 @_mm_movemask_epi8
+// CHECK-P10-LE: call zeroext i32 @vec_extractm(unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
+
// CHECK-LABEL: define available_externally signext i32 @_mm_movemask_epi8
// CHECK: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef <i8 120, i8 112, i8 104, i8 96, i8 88, i8 80, i8 72, i8 64, i8 56, i8 48, i8 40, i8 32, i8 24, i8 16, i8 8, i8 0>)
// CHECK-LE: %[[VAL:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 1
// CHECK-BE: %[[VAL:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK: trunc i64 %[[VAL]] to i32
+// CHECK-P10-LE-LABEL: define available_externally signext i32 @_mm_movemask_pd
+// CHECK-P10-LE: call zeroext i32 @vec_extractm(unsigned long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+
// CHECK-LABEL: define available_externally signext i32 @_mm_movemask_pd
// CHECK-LE: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef bitcast (<4 x i32> <i32 -2139094976, i32 -2139062144, i32 -2139062144, i32 -2139062144> to <16 x i8>))
// CHECK-LE: extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 1
@@ -857,9 +869,9 @@ test_sad() {
// CHECK: call <16 x i8> @vec_max(unsigned char vector[16], unsigned char vector[16])
// CHECK: call <16 x i8> @vec_sub(unsigned char vector[16], unsigned char vector[16])
// CHECK: call <4 x i32> @vec_sum4s(unsigned char vector[16], unsigned int vector[4])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
-// CHECK: call <4 x i32> @vec_sum2s(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
-// CHECK-LE: call <4 x i32> @vec_sld(int vector[4], int vector[4], unsigned int)(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef zeroext 4)
-// CHECK-BE: call <4 x i32> @vec_sld(int vector[4], int vector[4], unsigned int)(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef zeroext 6)
+// CHECK-LE: call <4 x i32> asm "vsum2sws $0,$1,$2", "=v,v,v"(<4 x i32> %{{[0-9a-zA-Z_.]+}}, <4 x i32> zeroinitializer)
+// CHECK-BE: call <4 x i32> @vec_sum2s(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
+// CHECK-BE: call <4 x i32> @vec_sld(int vector[4], int vector[4], unsigned int)
void __attribute__((noinline))
test_set() {
@@ -1086,7 +1098,7 @@ test_sll() {
// CHECK: call <2 x i64> @vec_splat(unsigned long long vector[2], unsigned int)(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, i32 noundef zeroext 0)
// CHECK: call <2 x i64> @vec_cmplt(unsigned long long vector[2], unsigned long long vector[2])(<2 x i64> noundef {{[0-9a-zA-Z_%.]+}}, <2 x i64> noundef <i64 64, i64 64>)
// CHECK: call <2 x i64> @vec_sl(unsigned long long vector[2], unsigned long long vector[2])
-// CHECK: call <2 x double> @vec_sel(double vector[2], double vector[2], bool vector[2])
+// CHECK: call <2 x i64> @vec_sel(unsigned long long vector[2], unsigned long long vector[2], bool vector[2])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_slli_epi16
// CHECK: store <8 x i16> zeroinitializer, <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16
@@ -1232,7 +1244,7 @@ test_srl() {
// CHECK: call <2 x i64> @vec_splat(unsigned long long vector[2], unsigned int)(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef zeroext 0)
// CHECK: call <2 x i64> @vec_cmplt(unsigned long long vector[2], unsigned long long vector[2])(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef <i64 64, i64 64>)
// CHECK: call <2 x i64> @vec_sr(unsigned long long vector[2], unsigned long long vector[2])
-// CHECK: call <2 x double> @vec_sel(double vector[2], double vector[2], bool vector[2])
+// CHECK: call <2 x i64> @vec_sel(unsigned long long vector[2], unsigned long long vector[2], bool vector[2])
// CHECK-LABEL: define available_externally <2 x i64> @_mm_srli_epi16
// CHECK: store <8 x i16> zeroinitializer, <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16
diff --git a/clang/test/CodeGen/PowerPC/ppc-smmintrin.c b/clang/test/CodeGen/PowerPC/ppc-smmintrin.c
index 5b2027dd52197..eaf96a71afca9 100644
--- a/clang/test/CodeGen/PowerPC/ppc-smmintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-smmintrin.c
@@ -10,6 +10,11 @@
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
+// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefix=P10
+// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefix=P10
+
#include <smmintrin.h>
__m128 mn1, mn2;
@@ -48,6 +53,10 @@ void __attribute__((noinline))
test_blend() {
_mm_blend_epi16(m1, m2, 0);
_mm_blendv_epi8(m1, m2, mi);
+ _mm_blend_ps(mn1, mn2, 0);
+ _mm_blendv_ps(mn1, mn2, mn1);
+ _mm_blend_pd(md1, md2, 0);
+ _mm_blendv_pd(md1, md2, md1);
}
// CHECK-LABEL: @test_blend
@@ -62,10 +71,13 @@ test_blend() {
// BE: store <8 x i16> %[[REVE]], <8 x i16>* %{{[0-9a-zA-Z_.]+}}, align 16
// CHECK: call <8 x i16> @vec_sel(unsigned short vector[8], unsigned short vector[8], unsigned short vector[8])
+// P10-LABEL: define available_externally <2 x i64> @_mm_blendv_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// P10: call <16 x i8> @vec_blendv(signed char vector[16], signed char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
+
// CHECK-LABEL: define available_externally <2 x i64> @_mm_blendv_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
// CHECK: call <16 x i8> @vec_splats(unsigned char)(i8 noundef zeroext 7)
// CHECK: call <16 x i8> @vec_sra(unsigned char vector[16], unsigned char vector[16])
-// CHECK: call <16 x i8> @vec_sel(unsigned char vector[16], unsigned char vector[16], unsigned char vector[16])
+// CHECK: call <16 x i8> @vec_sel(signed char vector[16], signed char vector[16], unsigned char vector[16])
void __attribute__((noinline))
test_insert() {
@@ -89,6 +101,239 @@ test_insert() {
// CHECK: %[[AND:[0-9a-zA-Z_.]+]] = and i32 %{{[0-9a-zA-Z_.]+}}, 1
// CHECK: insertelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}, i32 %[[AND:[0-9a-zA-Z_.]+]]
+void __attribute__((noinline))
+test_convert() { // Calls each SSE4.1 widening conversion intrinsic once on m1 (declared outside this hunk); every result is discarded.
+ _mm_cvtepi16_epi32(m1); // epi* sources: the patterns that follow expect these wrappers to call vec_unpackh
+ _mm_cvtepi16_epi64(m1);
+ _mm_cvtepi32_epi64(m1);
+ _mm_cvtepi8_epi16(m1);
+ _mm_cvtepi8_epi32(m1);
+ _mm_cvtepi8_epi64(m1);
+ _mm_cvtepu16_epi32(m1); // epu* sources: the patterns that follow expect vec_mergeh with a zero vector
+ _mm_cvtepu16_epi64(m1);
+ _mm_cvtepu32_epi64(m1);
+ _mm_cvtepu8_epi16(m1);
+ _mm_cvtepu8_epi32(m1);
+ _mm_cvtepu8_epi64(m1);
+}
+
+// CHECK-LABEL: @test_convert
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi16_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi16_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
+// CHECK: call <2 x i64> @vec_unpackh(int vector[4])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi32_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <2 x i64> @vec_unpackh(int vector[4])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16])
+// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepi8_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <8 x i16> @vec_unpackh(signed char vector[16])
+// CHECK: call <4 x i32> @vec_unpackh(short vector[8])
+// CHECK: call <2 x i64> @vec_unpackh(int vector[4])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu16_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer)
+// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}})
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu16_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer)
+// LE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
+// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}})
+// BE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef zeroinitializer, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}})
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu32_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// LE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}}, <4 x i32> noundef zeroinitializer)
+// BE: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])(<4 x i32> noundef zeroinitializer, <4 x i32> noundef %{{[0-9a-zA-Z_.]+}})
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// LE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef zeroinitializer)
+// BE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef zeroinitializer, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// LE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef zeroinitializer)
+// LE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef %{{[0-9a-zA-Z_.]+}}, <8 x i16> noundef zeroinitializer)
+// BE: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef zeroinitializer, <16 x i8> noundef %{{[0-9a-zA-Z_.]+}})
+// BE: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])(<8 x i16> noundef zeroinitializer, <8 x i16> noundef %{{[0-9a-zA-Z_.]+}})
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_cvtepu8_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <16 x i8> @vec_mergeh(unsigned char vector[16], unsigned char vector[16])
+// CHECK: call <8 x i16> @vec_mergeh(unsigned short vector[8], unsigned short vector[8])
+// CHECK: call <4 x i32> @vec_mergeh(unsigned int vector[4], unsigned int vector[4])
+
+void __attribute__((noinline))
+test_minmax() { // Calls the SSE4.1 max/min intrinsics and _mm_minpos_epu16 on m1/m2 (declared outside this hunk); results are discarded.
+ _mm_max_epi32(m1, m2); // max group: patterns below expect vec_max on int, signed char, unsigned short, unsigned int vectors
+ _mm_max_epi8(m1, m2);
+ _mm_max_epu16(m1, m2);
+ _mm_max_epu32(m1, m2);
+ _mm_min_epi32(m1, m2); // min group: same element types, expected to call vec_min
+ _mm_min_epi8(m1, m2);
+ _mm_min_epu16(m1, m2);
+ _mm_min_epu32(m1, m2);
+ _mm_minpos_epu16(m1); // single-operand form; its expected IR below is a memset plus an element-extraction sequence
+}
+
+// CHECK-LABEL: @test_minmax
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <4 x i32> @vec_max(int vector[4], int vector[4])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <16 x i8> @vec_max(signed char vector[16], signed char vector[16])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <8 x i16> @vec_max(unsigned short vector[8], unsigned short vector[8])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_max_epu32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <4 x i32> @vec_max(unsigned int vector[4], unsigned int vector[4])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <4 x i32> @vec_min(int vector[4], int vector[4])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epi8(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <16 x i8> @vec_min(signed char vector[16], signed char vector[16])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <8 x i16> @vec_min(unsigned short vector[8], unsigned short vector[8])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_min_epu32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <4 x i32> @vec_min(unsigned int vector[4], unsigned int vector[4])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_minpos_epu16(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call void @llvm.memset.p0i8.i64(i8* align 16 %{{[0-9a-zA-Z_.]+}}, i8 0, i64 16, i1 false)
+// CHECK: extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i16 %{{[0-9a-zA-Z_.]+}}
+// CHECK: %[[VEXT:[0-9a-zA-Z_.]+]] = extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}
+// CHECK: zext i16 %[[VEXT]] to i32
+// CHECK: zext i16 %{{[0-9a-zA-Z_.]+}} to i32
+// CHECK: extractelement <8 x i16> %{{[0-9a-zA-Z_.]+}}, i64 %{{[0-9a-zA-Z_.]+}}
+// CHECK: add i64 %{{[0-9a-zA-Z_.]+}}, 1
+
+void __attribute__((noinline))
+test_round() { // Calls the four SSE4.1 rounding intrinsics with rounding-mode argument 0; results are discarded.
+ _mm_round_ps(mn1, 0); // single-precision forms take the __m128 globals mn1/mn2
+ _mm_round_ss(mn1, mn2, 0);
+ _mm_round_pd(md1, 0); // double-precision forms take md1/md2 (the operands test_blend passes to _mm_blend_pd), matching the <2 x double> signatures expected below
+ _mm_round_sd(md1, md2, 0);
+}
+
+// CHECK-LABEL: @test_round
+
+// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
+// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffs to i32 ()*)()
+// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0"
+// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
+// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i32)*)(i32 noundef signext 0)
+// CHECK: %{{[0-9a-zA-Z_.]+}} = call <4 x float> asm "", "=^wa,0"
+// CHECK: call <4 x float> @vec_rint(float vector[4])
+// CHECK: call void asm sideeffect "", "^wa"
+// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i64)*)(i64 noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <4 x float> @vec_floor(float vector[4])
+// CHECK: call <4 x float> @vec_ceil(float vector[4])
+// CHECK: call <4 x float> @vec_trunc(float vector[4])
+// CHECK: call <4 x float> @vec_rint(float vector[4])
+// CHECK: call void asm sideeffect "", "^wa"
+// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
+// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
+
+// CHECK-LABEL: define available_externally <4 x float> @_mm_round_ss(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, <4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <4 x float> @_mm_round_ps(<4 x float> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
+// CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0
+
+// CHECK-LABEL: define available_externally <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
+// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffs to i32 ()*)()
+// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0"
+// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
+// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i32)*)(i32 noundef signext 0)
+// CHECK: %{{[0-9a-zA-Z_.]+}} = call <2 x double> asm "", "=^wa,0"
+// CHECK: call <2 x double> @vec_rint(double vector[2])
+// CHECK: call void asm sideeffect "", "^wa"
+// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_set_fpscr_rn to i32 (i64)*)(i64 noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <2 x double> @vec_floor(double vector[2])
+// CHECK: call <2 x double> @vec_ceil(double vector[2])
+// CHECK: call <2 x double> @vec_trunc(double vector[2])
+// CHECK: call <2 x double> @vec_rint(double vector[2])
+// CHECK: call void asm sideeffect "", "^wa"
+// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mffsl to i32 ()*)()
+// CHECK: call signext i32 bitcast (i32 (...)* @__builtin_mtfsf to i32 (i32, double)*)(i32 noundef signext 3, double noundef %{{[0-9a-zA-Z_.]+}})
+
+// CHECK-LABEL: define available_externally <2 x double> @_mm_round_sd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, <2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <2 x double> @_mm_round_pd(<2 x double> noundef %{{[0-9a-zA-Z_.]+}}, i32 noundef signext %{{[0-9a-zA-Z_.]+}})
+// CHECK: extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 0
+// CHECK: extractelement <2 x double> %{{[0-9a-zA-Z_.]+}}, i32 1
+
+void __attribute__((noinline))
+test_testing() { // Calls the three SSE4.1 128-bit test intrinsics on m1/m2 (declared outside this hunk); the int results are discarded.
+ _mm_testc_si128(m1, m2); // expected below to use vec_nor, vec_and and vec_all_eq against zero
+ _mm_testnzc_si128(m1, m2); // expected below to call _mm_testz_si128 and _mm_testc_si128
+ _mm_testz_si128(m1, m2); // expected below to use vec_and and vec_all_eq against zero
+}
+
+// CHECK-LABEL: @test_testing
+
+// CHECK-LABEL: define available_externally signext i32 @_mm_testc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <16 x i8> @vec_nor(unsigned char vector[16], unsigned char vector[16])
+// CHECK: call <16 x i8> @vec_and(unsigned char vector[16], unsigned char vector[16])
+// CHECK: call signext i32 @vec_all_eq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %call1, <16 x i8> noundef zeroinitializer)
+
+// CHECK-LABEL: define available_externally signext i32 @_mm_testnzc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call signext i32 @_mm_testz_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call signext i32 @_mm_testc_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: zext i1 %{{[0-9a-zA-Z_.]+}} to i32
+
+// CHECK-LABEL: define available_externally signext i32 @_mm_testz_si128(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <16 x i8> @vec_and(unsigned char vector[16], unsigned char vector[16])
+// CHECK: call signext i32 @vec_all_eq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %call, <16 x i8> noundef zeroinitializer)
+
+void __attribute__((noinline))
+test_compare() { // Calls the two 64-bit element compare intrinsics on m1/m2 (declared outside this hunk); results are discarded.
+ _mm_cmpeq_epi64(m1, m2); // expected below to call vec_cmpeq on long long vectors
+ _mm_cmpgt_epi64(m1, m2); // expected below to call vec_cmpgt on long long vectors
+}
+
+// CHECK-LABEL: @test_compare
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_cmpeq_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <2 x i64> @vec_cmpeq(long long vector[2], long long vector[2])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_cmpgt_epi64(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <2 x i64> @vec_cmpgt(long long vector[2], long long vector[2])
+
+void __attribute__((noinline))
+test_mul() { // Calls the two SSE4.1 32-bit multiply intrinsics on m1/m2 (declared outside this hunk); results are discarded.
+ _mm_mul_epi32(m1, m2); // expected below to call vec_mule on int vectors, yielding <2 x i64>
+ _mm_mullo_epi32(m1, m2); // expected below to call vec_mul on unsigned int vectors, yielding <4 x i32>
+}
+
+// CHECK-LABEL: @test_mul
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_mul_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: %call = call <2 x i64> @vec_mule(int vector[4], int vector[4])
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_mullo_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: %call = call <4 x i32> @vec_mul(unsigned int vector[4], unsigned int vector[4])
+
+void __attribute__((noinline))
+test_packus() { // Calls _mm_packus_epi32 once on m1/m2 (declared outside this hunk); the result is discarded.
+ _mm_packus_epi32(m1, m2); // expected below to call vec_packsu on int vectors, yielding <8 x i16>
+}
+
+// CHECK-LABEL: @test_packus
+
+// CHECK-LABEL: define available_externally <2 x i64> @_mm_packus_epi32(<2 x i64> noundef %{{[0-9a-zA-Z_.]+}}, <2 x i64> noundef %{{[0-9a-zA-Z_.]+}})
+// CHECK: call <8 x i16> @vec_packsu(int vector[4], int vector[4])(<4 x i32> noundef %1, <4 x i32> noundef %3)
+
// To test smmintrin.h includes tmmintrin.h
void __attribute__((noinline))
diff --git a/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c b/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c
new file mode 100644
index 0000000000000..fb5b9e82be45c
--- /dev/null
+++ b/clang/test/CodeGen/PowerPC/ppc-x86gprintrin.c
@@ -0,0 +1,239 @@
+// REQUIRES: powerpc-registered-target
+// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
+// RUN: %clang -S -emit-llvm -target powerpc64-unknown-linux-gnu -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
+
+// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-freebsd13.0 -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
+// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr7 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s
+
+#include <x86gprintrin.h>
+
+unsigned short us;
+unsigned ui;
+unsigned long long ul;
+
+void __attribute__((noinline))
+test_bmiintrin() { // Calls each BMI wrapper once, in the same order as the expected definitions listed after this function; results are discarded.
+ __tzcnt_u16(us); // only 16-bit entry point; takes the unsigned short global
+ __andn_u32(ui, ui); // 32-bit group: all operands come from the unsigned global ui
+ _bextr_u32(ui, ui, ui);
+ __bextr_u32(ui, ui);
+ __blsi_u32(ui);
+ _blsi_u32(ui); // single-underscore spellings are expected below to forward to the double-underscore ones
+ __blsmsk_u32(ui);
+ _blsmsk_u32(ui);
+ __blsr_u32(ui);
+ _blsr_u32(ui);
+ __tzcnt_u32(ui);
+ _tzcnt_u32(ui);
+ __andn_u64(ul, ul); // 64-bit group: value operands come from the unsigned long long global ul
+ _bextr_u64(ul, ui, ui); // second and third arguments are 32-bit here (i32 parameters in the expected signature)
+ __bextr_u64(ul, ul);
+ __blsi_u64(ul);
+ _blsi_u64(ul);
+ __blsmsk_u64(ul);
+ _blsmsk_u64(ul);
+ __blsr_u64(ul);
+ _blsr_u64(ul);
+ __tzcnt_u64(ul);
+ _tzcnt_u64(ul);
+}
+
+// CHECK-LABEL: @test_bmiintrin
+
+// CHECK-LABEL: define available_externally zeroext i16 @__tzcnt_u16(i16 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i16 %{{[0-9a-zA-Z._]+}} to i32
+// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i32 @llvm.cttz.i32(i32 %[[CONV]], i1 false)
+// CHECK: trunc i32 %[[CALL]] to i16
+
+// CHECK-LABEL: define available_externally zeroext i32 @__andn_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[NEG:[0-9a-zA-Z._]+]] = xor i32 %{{[0-9a-zA-Z._]+}}, -1
+// CHECK: and i32 %[[NEG]], %1
+
+// CHECK-LABEL: define available_externally zeroext i32 @_bextr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[ADD:[0-9a-zA-Z._]+]] = add i32 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
+// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 32, %[[ADD]]
+// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
+// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}}
+// CHECK: lshr i32 %[[SHL]], %[[SUB1]]
+
+// CHECK-LABEL: define available_externally zeroext i32 @__bextr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i32 %{{[0-9a-zA-Z._]+}}, 255
+// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i32 %{{[0-9a-zA-Z._]+}}, 8
+// CHECK: and i32 %[[SHR]], 255
+// CHECK: call zeroext i32 @_bextr_u32
+
+// CHECK-LABEL: define available_externally zeroext i32 @__blsi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 0, %1
+// CHECK: and i32 %0, %[[SUB]]
+
+// CHECK-LABEL: define available_externally zeroext i32 @_blsi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: call zeroext i32 @__blsi_u32
+
+// CHECK-LABEL: define available_externally zeroext i32 @__blsmsk_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 %{{[0-9a-zA-Z._]+}}, 1
+// CHECK: xor i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
+
+// CHECK-LABEL: define available_externally zeroext i32 @_blsmsk_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: call zeroext i32 @__blsmsk_u32
+
+// CHECK-LABEL: define available_externally zeroext i32 @__blsr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 %{{[0-9a-zA-Z._]+}}, 1
+// CHECK: and i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
+
+// CHECK-LABEL: define available_externally zeroext i32 @_blsr_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: call zeroext i32 @__blsr_u32
+
+// CHECK-LABEL: define available_externally zeroext i32 @__tzcnt_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: call i32 @llvm.cttz.i32(i32 %{{[0-9a-zA-Z._]+}}, i1 false)
+
+// CHECK-LABEL: define available_externally zeroext i32 @_tzcnt_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: call i32 @llvm.cttz.i32(i32 %{{[0-9a-zA-Z._]+}}, i1 false)
+
+// CHECK-LABEL: define available_externally i64 @__andn_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[NEG:[0-9a-zA-Z._]+]] = xor i64 %{{[0-9a-zA-Z._]+}}, -1
+// CHECK: and i64 %[[NEG]], %{{[0-9a-zA-Z._]+}}
+
+// CHECK-LABEL: define available_externally i64 @_bextr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[ADD:[0-9a-zA-Z._]+]] = add i32 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
+// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 64, %[[ADD]]
+// CHECK: %[[EXT:[0-9a-zA-Z._]+]] = zext i32 %[[SUB]] to i64
+// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, %[[EXT]]
+// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 64, %{{[0-9a-zA-Z._]+}}
+// CHECK: %[[EXT2:[0-9a-zA-Z._]+]] = zext i32 %[[SUB1]] to i64
+// CHECK: lshr i64 %[[SHL]], %[[EXT2]]
+
+// CHECK-LABEL: define available_externally i64 @__bextr_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, 255
+// CHECK: trunc i64 %[[AND]] to i32
+// CHECK: %[[AND1:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, 65280
+// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 %[[AND1]], 8
+// CHECK: trunc i64 %[[SHR]] to i32
+// CHECK: call i64 @_bextr_u64
+
+// CHECK-LABEL: define available_externally i64 @__blsi_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 0, %{{[0-9a-zA-Z._]+}}
+// CHECK: and i64 %0, %[[SUB]]
+
+// CHECK-LABEL: define available_externally i64 @_blsi_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
+// CHECK: call i64 @__blsi_u64
+
+// CHECK-LABEL: define available_externally i64 @__blsmsk_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, 1
+// CHECK: xor i64 %0, %[[SUB]]
+
+// CHECK-LABEL: define available_externally i64 @_blsmsk_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
+// CHECK: call i64 @__blsmsk_u64
+
+// CHECK-LABEL: define available_externally i64 @__blsr_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, 1
+// CHECK: and i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
+
+// CHECK-LABEL: define available_externally i64 @_blsr_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
+// CHECK: call i64 @__blsr_u64
+
+// CHECK-LABEL: define available_externally i64 @__tzcnt_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.cttz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
+// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32
+// CHECK: sext i32 %[[CAST]] to i64
+
+// CHECK-LABEL: define available_externally i64 @_tzcnt_u64(i64 noundef %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.cttz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
+// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32
+// CHECK: sext i32 %[[CAST]] to i64
+
+void __attribute__((noinline))
+test_bmi2intrin() { // Calls each BMI2 wrapper once, in the same order as the expected definitions listed after this function; return values are discarded.
+ _bzhi_u32(ui, ui);
+ _mulx_u32(ui, ui, &ui); // third argument is a pointer to the same global (i32* in the expected signature)
+ _bzhi_u64(ul, ul);
+ _mulx_u64(ul, ul, &ul); // 64-bit variant; the expected IR widens both operands to i128
+ _pdep_u64(ul, ul);
+ _pext_u64(ul, ul);
+ _pdep_u32(ui, ui); // 32-bit pdep/pext are expected below to zero-extend and call the 64-bit versions
+ _pext_u32(ui, ui);
+}
+
+// CHECK-LABEL: @test_bmi2intrin
+
+// CHECK-LABEL: define available_externally zeroext i32 @_bzhi_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}}
+// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i32 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
+// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i32 32, %{{[0-9a-zA-Z._]+}}
+// CHECK: lshr i32 %[[SHL]], %[[SUB1]]
+
+// CHECK-LABEL: define available_externally zeroext i32 @_mulx_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32* noundef %{{[0-9a-zA-Z._]+}})
+// CHECK: zext i32 %{{[0-9a-zA-Z._]+}} to i64
+// CHECK: zext i32 %{{[0-9a-zA-Z._]+}} to i64
+// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 %{{[0-9a-zA-Z._]+}}, 32
+// CHECK: trunc i64 %[[SHR]] to i32
+// CHECK: trunc i64 %{{[0-9a-zA-Z._]+}} to i32
+
+// CHECK-LABEL: define available_externally i64 @_bzhi_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 64, %{{[0-9a-zA-Z._]+}}
+// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, %[[SUB]]
+// CHECK: %[[SUB1:[0-9a-zA-Z._]+]] = sub i64 64, %{{[0-9a-zA-Z._]+}}
+// CHECK: lshr i64 %[[SHL]], %[[SUB1]]
+
+// CHECK-LABEL: define available_externally i64 @_mulx_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}}, i64* noundef %{{[0-9a-zA-Z._]+}})
+// CHECK: zext i64 %{{[0-9a-zA-Z._]+}} to i128
+// CHECK: zext i64 %{{[0-9a-zA-Z._]+}} to i128
+// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i128 %{{[0-9a-zA-Z._]+}}, 64
+// CHECK: trunc i128 %[[SHR]] to i64
+// CHECK: trunc i128 %{{[0-9a-zA-Z._]+}} to i64
+
+// CHECK-LABEL: define available_externally i64 @_pdep_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[CAST:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL]] to i32
+// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub nsw i32 64, %[[CAST]]
+// CHECK: sext i32 %[[SUB]] to i64
+// CHECK: %[[CALL2:[0-9a-zA-Z._]+]] = call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
+// CHECK: %[[CAST2:[0-9a-zA-Z._]+]] = trunc i64 %[[CALL2]] to i32
+// CHECK: sext i32 %[[CAST2]] to i64
+// CHECK: %[[SUB2:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
+// CHECK: shl i64 %{{[0-9a-zA-Z._]+}}, %[[SUB2]]
+// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
+// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR]]
+// CHECK: %[[SHR2:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
+// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, %[[SHR2]]
+// CHECK: or i64 %{{[0-9a-zA-Z._]+}}, %[[AND]]
+// CHECK: add i64 %{{[0-9a-zA-Z._]+}}, 1
+
+// CHECK-LABEL: define available_externally i64 @_pext_u64(i64 noundef %{{[0-9a-zA-Z._]+}}, i64 noundef %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i1 @llvm.is.constant.i64(i64 %{{[0-9a-zA-Z._]+}})
+// CHECK: br i1 %[[CALL]], label %[[TRUECOND:[0-9a-zA-Z._]+]], label %[[FALSECOND:[0-9a-zA-Z._]+]]
+// CHECK: [[TRUECOND]]:
+// CHECK: %[[CALL2:[0-9a-zA-Z._]+]] = call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}})
+// CHECK: call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
+// CHECK: %[[SHL:[0-9a-zA-Z._]+]] = shl i64 %{{[0-9a-zA-Z._]+}}, 8
+// CHECK: or i64 %[[SHL]], %{{[0-9a-zA-Z._]+}}
+// CHECK: %[[SHR:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
+// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR]]
+// CHECK: add nsw i64 %{{[0-9a-zA-Z._]+}}, 1
+// CHECK: call i64 @llvm.ppc.bpermd
+// CHECK: [[FALSECOND]]:
+// CHECK: call i64 @llvm.ctpop.i64(i64 %{{[0-9a-zA-Z._]+}})
+// CHECK: call i64 @llvm.ctlz.i64(i64 %{{[0-9a-zA-Z._]+}}, i1 false)
+// CHECK: %[[SHR2:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
+// CHECK: %[[AND:[0-9a-zA-Z._]+]] = and i64 %{{[0-9a-zA-Z._]+}}, %[[SHR2]]
+// CHECK: %[[SUB:[0-9a-zA-Z._]+]] = sub i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
+// CHECK: lshr i64 %[[AND]], %[[SUB]]
+// CHECK: %[[SHR3:[0-9a-zA-Z._]+]] = lshr i64 -9223372036854775808, %{{[0-9a-zA-Z._]+}}
+// CHECK: xor i64 %{{[0-9a-zA-Z._]+}}, %[[SHR3]]
+// CHECK: or i64 %{{[0-9a-zA-Z._]+}}, %{{[0-9a-zA-Z._]+}}
+// CHECK: add i64 %{{[0-9a-zA-Z._]+}}, 1
+
+// CHECK-LABEL: define available_externally zeroext i32 @_pdep_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
+// CHECK: %[[CONV1:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
+// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @_pdep_u64(i64 noundef %[[CONV]], i64 noundef %[[CONV1]])
+// CHECK: trunc i64 %[[CALL]] to i32
+
+// CHECK-LABEL: define available_externally zeroext i32 @_pext_u32(i32 noundef zeroext %{{[0-9a-zA-Z._]+}}, i32 noundef zeroext %{{[0-9a-zA-Z._]+}})
+// CHECK: %[[CONV:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
+// CHECK: %[[CONV1:[0-9a-zA-Z._]+]] = zext i32 %{{[0-9a-zA-Z._]+}} to i64
+// CHECK: %[[CALL:[0-9a-zA-Z._]+]] = call i64 @_pext_u64(i64 noundef %[[CONV]], i64 noundef %[[CONV1]])
+// CHECK: trunc i64 %[[CALL]] to i32
diff --git a/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c b/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c
index 1a331087efd9e..163bd7716ff04 100644
--- a/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c
+++ b/clang/test/CodeGen/PowerPC/ppc-xmmintrin.c
@@ -9,6 +9,9 @@
// RUN: %clang -x c++ -fsyntax-only -target powerpc64le-unknown-linux-gnu -mcpu=pwr8 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns
+// RUN: %clang -S -emit-llvm -target powerpc64le-unknown-linux-gnu -mcpu=pwr10 -ffreestanding -DNO_WARN_X86_INTRINSICS %s \
+// RUN: -ffp-contract=off -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-P10
+
// RUN: %clang -S -emit-llvm -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -nostdlibinc -DNO_WARN_X86_INTRINSICS %s \
// RUN: -fno-discard-value-names -mllvm -disable-llvm-optzns -o - | llvm-cxxfilt -n | FileCheck %s --check-prefixes=CHECK,CHECK-BE
// RUN: %clang -x c++ -fsyntax-only -target powerpc64-unknown-freebsd13.0 -mcpu=pwr8 -ffreestanding -nostdlibinc -DNO_WARN_X86_INTRINSICS %s \
@@ -383,12 +386,12 @@ test_convert() {
// CHECK: extractelement <4 x float> %{{[0-9a-zA-Z_.]+}}, i32 0
// CHECK-LABEL: define available_externally signext i32 @_mm_cvtss_si32
-// CHECK-LE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i64, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
-// CHECK-BE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i64, double } asm "xscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
-// CHECK: extractvalue { <4 x float>, i64, double } %[[VEC]], 0
-// CHECK: extractvalue { <4 x float>, i64, double } %[[VEC]], 1
-// CHECK: extractvalue { <4 x float>, i64, double } %[[VEC]], 2
-// CHECK: trunc i64 %{{[0-9a-zA-Z_.]+}} to i32
+// CHECK-LE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i32, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
+// CHECK-BE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i32, double } asm "xscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
+// CHECK-P10: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i32, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctiw $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
+// CHECK: extractvalue { <4 x float>, i32, double } %[[VEC]], 0
+// CHECK: extractvalue { <4 x float>, i32, double } %[[VEC]], 1
+// CHECK: extractvalue { <4 x float>, i32, double } %[[VEC]], 2
// CHECK-LABEL: define available_externally i64 @_mm_cvtss_si64
// CHECK-LE: %[[VEC:[0-9a-zA-Z_.]+]] = call { <4 x float>, i64, double } asm "xxsldwi ${0:x},${0:x},${0:x},3;\0Axscvspdp ${2:x},${0:x};\0Afctid $2,$2;\0Amfvsrd $1,${2:x};\0A", "=^wa,=r,=f,0"
@@ -681,9 +684,11 @@ test_move() {
// CHECK-LABEL: define available_externally signext i32 @_mm_movemask_ps
// CHECK-LE: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef bitcast (<4 x i32> <i32 2113632, i32 -2139062144, i32 -2139062144, i32 -2139062144> to <16 x i8>))
// CHECK-LE: %[[EXT:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 1
+// CHECK-LE: trunc i64 %[[EXT]] to i32
// CHECK-BE: call <2 x i64> @vec_vbpermq(unsigned char vector[16], unsigned char vector[16])(<16 x i8> noundef %{{[0-9a-zA-Z_.]+}}, <16 x i8> noundef bitcast (<4 x i32> <i32 -2139062144, i32 -2139062144, i32 -2139062144, i32 2113632> to <16 x i8>))
// CHECK-BE: %[[EXT:[0-9a-zA-Z_.]+]] = extractelement <2 x i64> %{{[0-9a-zA-Z_.]+}}, i32 0
-// CHECK: trunc i64 %[[EXT]] to i32
+// CHECK-BE: trunc i64 %[[EXT]] to i32
+// CHECK-P10: call zeroext i32 @vec_extractm(unsigned int vector[4])(<4 x i32> noundef %{{[0-9a-zA-Z_.]+}})
void __attribute__((noinline))
test_alt_name_move() {
More information about the cfe-commits
mailing list