[clang] [X86] Align 128/256 variants to use void * as 512 variants. (PR #66310)
Freddy Ye via cfe-commits
cfe-commits at lists.llvm.org
Tue Sep 19 19:31:07 PDT 2023
https://github.com/FreddyLeaf updated https://github.com/llvm/llvm-project/pull/66310
>From 21157a0e3b4c4e4e2430752ef806148685a942a2 Mon Sep 17 00:00:00 2001
From: Freddy Ye <freddy.ye at intel.com>
Date: Thu, 14 Sep 2023 09:17:39 +0800
Subject: [PATCH 1/3] [X86] Align 128/256 variants to use void * as 512
variants.
This applies to the *_stream_* series of intrinsics.
---
clang/lib/Headers/avx2intrin.h | 2 +-
clang/lib/Headers/avxintrin.h | 6 +++---
clang/lib/Headers/emmintrin.h | 8 ++++----
clang/lib/Headers/smmintrin.h | 2 +-
clang/lib/Headers/xmmintrin.h | 2 +-
clang/test/CodeGen/X86/avx-builtins.c | 18 ++++++++++++++++++
clang/test/CodeGen/X86/avx2-builtins.c | 6 ++++++
clang/test/CodeGen/X86/sse-builtins.c | 6 ++++++
clang/test/CodeGen/X86/sse2-builtins.c | 24 ++++++++++++++++++++++++
clang/test/CodeGen/X86/sse41-builtins.c | 6 ++++++
10 files changed, 70 insertions(+), 10 deletions(-)
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index c45006193eddcc9..675a93bba1c8a4f 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -2979,7 +2979,7 @@ _mm256_xor_si256(__m256i __a, __m256i __b)
/// A pointer to the 32-byte aligned memory containing the vector to load.
/// \returns A 256-bit integer vector loaded from memory.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_stream_load_si256(__m256i const *__V)
+_mm256_stream_load_si256(void const *__V)
{
typedef __v4di __v4di_aligned __attribute__((aligned(32)));
return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index 94fac5e6c9da471..b796bb773ec11f0 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -3563,7 +3563,7 @@ _mm_maskstore_ps(float *__p, __m128i __m, __m128 __a)
/// \param __b
/// A 256-bit integer vector containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_si256(__m256i *__a, __m256i __b)
+_mm256_stream_si256(void *__a, __m256i __b)
{
typedef __v4di __v4di_aligned __attribute__((aligned(32)));
__builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a);
@@ -3583,7 +3583,7 @@ _mm256_stream_si256(__m256i *__a, __m256i __b)
/// \param __b
/// A 256-bit vector of [4 x double] containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_pd(double *__a, __m256d __b)
+_mm256_stream_pd(void *__a, __m256d __b)
{
typedef __v4df __v4df_aligned __attribute__((aligned(32)));
__builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a);
@@ -3604,7 +3604,7 @@ _mm256_stream_pd(double *__a, __m256d __b)
/// \param __a
/// A 256-bit vector of [8 x float] containing the values to be moved.
static __inline void __DEFAULT_FN_ATTRS
-_mm256_stream_ps(float *__p, __m256 __a)
+_mm256_stream_ps(void *__p, __m256 __a)
{
typedef __v8sf __v8sf_aligned __attribute__((aligned(32)));
__builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p);
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 064d974936598f8..eacb0182614304d 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -3945,7 +3945,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_storel_epi64(__m128i_u *__p,
/// A pointer to the 128-bit aligned memory location used to store the value.
/// \param __a
/// A vector of [2 x double] containing the 64-bit values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p,
+static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(void *__p,
__m128d __a) {
__builtin_nontemporal_store((__v2df)__a, (__v2df *)__p);
}
@@ -3963,7 +3963,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_pd(double *__p,
/// A pointer to the 128-bit aligned memory location used to store the value.
/// \param __a
/// A 128-bit integer vector containing the values to be stored.
-static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p,
+static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
__m128i __a) {
__builtin_nontemporal_store((__v2di)__a, (__v2di *)__p);
}
@@ -3983,7 +3983,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(__m128i *__p,
/// A 32-bit integer containing the value to be stored.
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("sse2")))
- _mm_stream_si32(int *__p, int __a) {
+ _mm_stream_si32(void *__p, int __a) {
__builtin_ia32_movnti(__p, __a);
}
@@ -4003,7 +4003,7 @@ static __inline__ void
/// A 64-bit integer containing the value to be stored.
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("sse2")))
- _mm_stream_si64(long long *__p, long long __a) {
+ _mm_stream_si64(void *__p, long long __a) {
__builtin_ia32_movnti64(__p, __a);
}
#endif
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index 16d8855a1c0b5d0..4e2eb46bb5421f2 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -645,7 +645,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
/// \returns A 128-bit integer vector containing the data stored at the
/// specified memory location.
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_stream_load_si128(__m128i const *__V) {
+_mm_stream_load_si128(void const *__V) {
return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
}
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 80aa2a817f6afc1..10b6907ace07cc4 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -2140,7 +2140,7 @@ _mm_stream_pi(__m64 *__p, __m64 __a)
/// \param __a
/// A 128-bit vector of [4 x float] containing the values to be moved.
static __inline__ void __DEFAULT_FN_ATTRS
-_mm_stream_ps(float *__p, __m128 __a)
+_mm_stream_ps(void *__p, __m128 __a)
{
__builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p);
}
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index b68d192051b9bf4..06d3c321dd89592 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1940,18 +1940,36 @@ void test_mm256_stream_pd(double* A, __m256d B) {
_mm256_stream_pd(A, B);
}
+void test_mm256_stream_pd_void(void* A, __m256d B) {
+ // CHECK-LABEL: test_mm256_stream_pd_void
+ // CHECK: store <4 x double> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
+ _mm256_stream_pd(A, B);
+}
+
void test_mm256_stream_ps(float* A, __m256 B) {
// CHECK-LABEL: test_mm256_stream_ps
// CHECK: store <8 x float> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
_mm256_stream_ps(A, B);
}
+void test_mm256_stream_ps_void(void* A, __m256 B) {
+ // CHECK-LABEL: test_mm256_stream_ps_void
+ // CHECK: store <8 x float> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
+ _mm256_stream_ps(A, B);
+}
+
void test_mm256_stream_si256(__m256i* A, __m256i B) {
// CHECK-LABEL: test_mm256_stream_si256
// CHECK: store <4 x i64> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
_mm256_stream_si256(A, B);
}
+void test_mm256_stream_si256_void(void* A, __m256i B) {
+ // CHECK-LABEL: test_mm256_stream_si256_void
+ // CHECK: store <4 x i64> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
+ _mm256_stream_si256(A, B);
+}
+
__m256d test_mm256_sub_pd(__m256d A, __m256d B) {
// CHECK-LABEL: test_mm256_sub_pd
// CHECK: fsub <4 x double>
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 2750e1b227483ee..5b8c6ded7f216b7 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -1223,6 +1223,12 @@ __m256i test_mm256_stream_load_si256(__m256i const *a) {
return _mm256_stream_load_si256(a);
}
+__m256i test_mm256_stream_load_si256_const(void const *a) {
+ // CHECK-LABEL: test_mm256_stream_load_si256_const
+ // CHECK: load <4 x i64>, ptr %{{.*}}, align 32, !nontemporal
+ return _mm256_stream_load_si256(a);
+}
+
__m256i test_mm256_sub_epi8(__m256i a, __m256i b) {
// CHECK-LABEL: test_mm256_sub_epi8
// CHECK: sub <32 x i8>
diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c
index da40380926d2c8a..9c64d420f7cdf10 100644
--- a/clang/test/CodeGen/X86/sse-builtins.c
+++ b/clang/test/CodeGen/X86/sse-builtins.c
@@ -720,6 +720,12 @@ void test_mm_stream_ps(float*A, __m128 B) {
_mm_stream_ps(A, B);
}
+void test_mm_stream_ps_2(void*A, __m128 B) {
+ // CHECK-LABEL: test_mm_stream_ps_2
+ // CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
+ _mm_stream_ps(A, B);
+}
+
__m128 test_mm_sub_ps(__m128 A, __m128 B) {
// CHECK-LABEL: test_mm_sub_ps
// CHECK: fsub <4 x float>
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index 7c62a128c331fc5..7165d2791827cfc 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -1488,18 +1488,36 @@ void test_mm_stream_pd(double *A, __m128d B) {
_mm_stream_pd(A, B);
}
+void test_mm_stream_pd_void(void *A, __m128d B) {
+ // CHECK-LABEL: test_mm_stream_pd_void
+ // CHECK: store <2 x double> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
+ _mm_stream_pd(A, B);
+}
+
void test_mm_stream_si32(int *A, int B) {
// CHECK-LABEL: test_mm_stream_si32
// CHECK: store i32 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
_mm_stream_si32(A, B);
}
+void test_mm_stream_si32_void(void *A, int B) {
+ // CHECK-LABEL: test_mm_stream_si32_void
+ // CHECK: store i32 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
+ _mm_stream_si32(A, B);
+}
+
#ifdef __x86_64__
void test_mm_stream_si64(long long *A, long long B) {
// X64-LABEL: test_mm_stream_si64
// X64: store i64 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
_mm_stream_si64(A, B);
}
+
+void test_mm_stream_si64_void(void *A, long long B) {
+ // X64-LABEL: test_mm_stream_si64_void
+ // X64: store i64 %{{.*}}, ptr %{{.*}}, align 1, !nontemporal
+ _mm_stream_si64(A, B);
+}
#endif
void test_mm_stream_si128(__m128i *A, __m128i B) {
@@ -1508,6 +1526,12 @@ void test_mm_stream_si128(__m128i *A, __m128i B) {
_mm_stream_si128(A, B);
}
+void test_mm_stream_si128_void(void *A, __m128i B) {
+ // CHECK-LABEL: test_mm_stream_si128_void
+ // CHECK: store <2 x i64> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
+ _mm_stream_si128(A, B);
+}
+
__m128i test_mm_sub_epi8(__m128i A, __m128i B) {
// CHECK-LABEL: test_mm_sub_epi8
// CHECK: sub <16 x i8>
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index fe59cbcaf1938c6..ad486a6d9950af6 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -358,6 +358,12 @@ __m128i test_mm_stream_load_si128(__m128i const *a) {
return _mm_stream_load_si128(a);
}
+__m128i test_mm_stream_load_si128_void(void const *a) {
+ // CHECK-LABEL: test_mm_stream_load_si128_void
+ // CHECK: load <2 x i64>, ptr %{{.*}}, align 16, !nontemporal
+ return _mm_stream_load_si128(a);
+}
+
int test_mm_test_all_ones(__m128i x) {
// CHECK-LABEL: test_mm_test_all_ones
// CHECK: call i32 @llvm.x86.sse41.ptestc(<2 x i64> %{{.*}}, <2 x i64> %{{.*}})
>From 3e463d3c41bb8e8391c62dd95910d009b4b39b7d Mon Sep 17 00:00:00 2001
From: Freddy Ye <freddy.ye at intel.com>
Date: Tue, 19 Sep 2023 08:29:49 +0800
Subject: [PATCH 2/3] Address comments.
---
clang/lib/Headers/avx2intrin.h | 2 +-
clang/lib/Headers/smmintrin.h | 2 +-
clang/test/CodeGen/X86/avx-builtins.c | 6 +++---
clang/test/CodeGen/X86/avx2-builtins.c | 4 ++--
clang/test/CodeGen/X86/sse-builtins.c | 4 ++--
clang/test/CodeGen/X86/sse41-builtins.c | 2 +-
6 files changed, 10 insertions(+), 10 deletions(-)
diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 675a93bba1c8a4f..9196c8c7d24f7c8 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -2979,7 +2979,7 @@ _mm256_xor_si256(__m256i __a, __m256i __b)
/// A pointer to the 32-byte aligned memory containing the vector to load.
/// \returns A 256-bit integer vector loaded from memory.
static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_stream_load_si256(void const *__V)
+_mm256_stream_load_si256(const void *__V)
{
typedef __v4di __v4di_aligned __attribute__((aligned(32)));
return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index 4e2eb46bb5421f2..feb61bec3b4873e 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -645,7 +645,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_mul_epi32(__m128i __V1,
/// \returns A 128-bit integer vector containing the data stored at the
/// specified memory location.
static __inline__ __m128i __DEFAULT_FN_ATTRS
-_mm_stream_load_si128(void const *__V) {
+_mm_stream_load_si128(const void *__V) {
return (__m128i)__builtin_nontemporal_load((const __v2di *)__V);
}
diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c
index 06d3c321dd89592..9178ecaf3f8fe43 100644
--- a/clang/test/CodeGen/X86/avx-builtins.c
+++ b/clang/test/CodeGen/X86/avx-builtins.c
@@ -1940,7 +1940,7 @@ void test_mm256_stream_pd(double* A, __m256d B) {
_mm256_stream_pd(A, B);
}
-void test_mm256_stream_pd_void(void* A, __m256d B) {
+void test_mm256_stream_pd_void(void *A, __m256d B) {
// CHECK-LABEL: test_mm256_stream_pd_void
// CHECK: store <4 x double> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
_mm256_stream_pd(A, B);
@@ -1952,7 +1952,7 @@ void test_mm256_stream_ps(float* A, __m256 B) {
_mm256_stream_ps(A, B);
}
-void test_mm256_stream_ps_void(void* A, __m256 B) {
+void test_mm256_stream_ps_void(void *A, __m256 B) {
// CHECK-LABEL: test_mm256_stream_ps_void
// CHECK: store <8 x float> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
_mm256_stream_ps(A, B);
@@ -1964,7 +1964,7 @@ void test_mm256_stream_si256(__m256i* A, __m256i B) {
_mm256_stream_si256(A, B);
}
-void test_mm256_stream_si256_void(void* A, __m256i B) {
+void test_mm256_stream_si256_void(void *A, __m256i B) {
// CHECK-LABEL: test_mm256_stream_si256_void
// CHECK: store <4 x i64> %{{.*}}, ptr %{{.*}}, align 32, !nontemporal
_mm256_stream_si256(A, B);
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 5b8c6ded7f216b7..fd72e25afdb45cc 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -1223,8 +1223,8 @@ __m256i test_mm256_stream_load_si256(__m256i const *a) {
return _mm256_stream_load_si256(a);
}
-__m256i test_mm256_stream_load_si256_const(void const *a) {
- // CHECK-LABEL: test_mm256_stream_load_si256_const
+__m256i test_mm256_stream_load_si256_void(const void *a) {
+ // CHECK-LABEL: test_mm256_stream_load_si256_void
// CHECK: load <4 x i64>, ptr %{{.*}}, align 32, !nontemporal
return _mm256_stream_load_si256(a);
}
diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c
index 9c64d420f7cdf10..885c82856522d2a 100644
--- a/clang/test/CodeGen/X86/sse-builtins.c
+++ b/clang/test/CodeGen/X86/sse-builtins.c
@@ -720,8 +720,8 @@ void test_mm_stream_ps(float*A, __m128 B) {
_mm_stream_ps(A, B);
}
-void test_mm_stream_ps_2(void*A, __m128 B) {
- // CHECK-LABEL: test_mm_stream_ps_2
+void test_mm_stream_ps_void(void *A, __m128 B) {
+ // CHECK-LABEL: test_mm_stream_ps_void
// CHECK: store <4 x float> %{{.*}}, ptr %{{.*}}, align 16, !nontemporal
_mm_stream_ps(A, B);
}
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index ad486a6d9950af6..bfe7a917a88555e 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -358,7 +358,7 @@ __m128i test_mm_stream_load_si128(__m128i const *a) {
return _mm_stream_load_si128(a);
}
-__m128i test_mm_stream_load_si128_void(void const *a) {
+__m128i test_mm_stream_load_si128_void(const void *a) {
// CHECK-LABEL: test_mm_stream_load_si128_void
// CHECK: load <2 x i64>, ptr %{{.*}}, align 16, !nontemporal
return _mm_stream_load_si128(a);
>From 24e17651bf39f07bbd886ac173a8d4f3f56d2093 Mon Sep 17 00:00:00 2001
From: Freddy Ye <freddy.ye at intel.com>
Date: Wed, 20 Sep 2023 09:40:26 +0800
Subject: [PATCH 3/3] Add extra casts for _mm_stream_si32/_mm_stream_si64 to fix lit test failures.
---
clang/lib/Headers/emmintrin.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index eacb0182614304d..8de2864b110653f 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -3984,7 +3984,7 @@ static __inline__ void __DEFAULT_FN_ATTRS _mm_stream_si128(void *__p,
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("sse2")))
_mm_stream_si32(void *__p, int __a) {
- __builtin_ia32_movnti(__p, __a);
+ __builtin_ia32_movnti((int *)__p, __a);
}
#ifdef __x86_64__
@@ -4004,7 +4004,7 @@ static __inline__ void
static __inline__ void
__attribute__((__always_inline__, __nodebug__, __target__("sse2")))
_mm_stream_si64(void *__p, long long __a) {
- __builtin_ia32_movnti64(__p, __a);
+ __builtin_ia32_movnti64((long long *)__p, __a);
}
#endif
More information about the cfe-commits
mailing list