r343388 - [X86] Add more of the icc unaligned load/store to/from 128 bit vector intrinsics

Sat Sep 29 10:49:42 PDT 2018

Author: ctopper
Date: Sat Sep 29 10:49:42 2018
New Revision: 343388

URL: http://llvm.org/viewvc/llvm-project?rev=343388&view=rev
Log:
[X86] Add more of the icc unaligned load/store to/from 128 bit vector intrinsics

Summary:
This patch adds
_mm_loadu_si32
_mm_loadu_si16
_mm_storeu_si64
_mm_storeu_si32
_mm_storeu_si16

We already had _mm_load_si64.

Reviewers: spatel, RKSimon

Reviewed By: RKSimon

Subscribers: cfe-commits

Differential Revision: https://reviews.llvm.org/D52665

Modified:
    cfe/trunk/lib/Headers/emmintrin.h
    cfe/trunk/test/CodeGen/sse2-builtins.c

Modified: cfe/trunk/lib/Headers/emmintrin.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/emmintrin.h?rev=343388&r1=343387&r2=343388&view=diff
==============================================================================

--- cfe/trunk/lib/Headers/emmintrin.h (original)
+++ cfe/trunk/lib/Headers/emmintrin.h Sat Sep 29 10:49:42 2018
@@ -1675,7 +1675,49 @@ _mm_loadu_si64(void const *__a)
     long long __v;
   } __attribute__((__packed__, __may_alias__));
   long long __u = ((struct __loadu_si64*)__a)->__v;
-  return __extension__ (__m128i)(__v2di){__u, 0L};
+  return __extension__ (__m128i)(__v2di){__u, 0LL};
+}
+
+/// Loads a 32-bit integer value to the low element of a 128-bit integer
+///    vector and clears the upper element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __a
+///    A pointer to a 32-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \returns A 128-bit vector of [4 x i32] containing the loaded value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadu_si32(void const *__a)
+{
+  struct __loadu_si32 {
+    int __v;
+  } __attribute__((__packed__, __may_alias__));
+  int __u = ((struct __loadu_si32*)__a)->__v;
+  return __extension__ (__m128i)(__v4si){__u, 0, 0, 0};
+}
+
+/// Loads a 16-bit integer value to the low element of a 128-bit integer
+///    vector and clears the upper element.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic does not correspond to a specific instruction.
+///
+/// \param __a
+///    A pointer to a 16-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \returns A 128-bit vector of [8 x i16] containing the loaded value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_loadu_si16(void const *__a)
+{
+  struct __loadu_si16 {
+    short __v;
+  } __attribute__((__packed__, __may_alias__));
+  short __u = ((struct __loadu_si16*)__a)->__v;
+  return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0};
 }
 
 /// Loads a 64-bit double-precision value to the low element of a
@@ -3993,6 +4035,69 @@ _mm_storeu_si128(__m128i *__p, __m128i _
   ((struct __storeu_si128*)__p)->__v = __b;
 }
 
+/// Stores a 64-bit integer value from the low element of a 128-bit integer
+///    vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction.
+///
+/// \param __p
+///    A pointer to a 64-bit memory location. The address of the memory
+///    location does not have to be algned.
+/// \param __b
+///    A 128-bit integer vector containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_si64(void const *__p, __m128i __b)
+{
+  struct __storeu_si64 {
+    long long __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_si64*)__p)->__v = ((__v2di)__b)[0];
+}
+
+/// Stores a 32-bit integer value from the low element of a 128-bit integer
+///    vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction.
+///
+/// \param __p
+///    A pointer to a 32-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \param __b
+///    A 128-bit integer vector containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_si32(void const *__p, __m128i __b)
+{
+  struct __storeu_si32 {
+    int __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_si32*)__p)->__v = ((__v4si)__b)[0];
+}
+
+/// Stores a 16-bit integer value from the low element of a 128-bit integer
+///    vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic does not correspond to a specific instruction.
+///
+/// \param __p
+///    A pointer to a 16-bit memory location. The address of the memory
+///    location does not have to be aligned.
+/// \param __b
+///    A 128-bit integer vector containing the value to be stored.
+static __inline__ void __DEFAULT_FN_ATTRS
+_mm_storeu_si16(void const *__p, __m128i __b)
+{
+  struct __storeu_si16 {
+    short __v;
+  } __attribute__((__packed__, __may_alias__));
+  ((struct __storeu_si16*)__p)->__v = ((__v8hi)__b)[0];
+}
+
 /// Moves bytes selected by the mask from the first operand to the
 ///    specified unaligned memory location. When a mask bit is 1, the
 ///    corresponding byte is written, otherwise it is not written.

Modified: cfe/trunk/test/CodeGen/sse2-builtins.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/sse2-builtins.c?rev=343388&r1=343387&r2=343388&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/sse2-builtins.c (original)
+++ cfe/trunk/test/CodeGen/sse2-builtins.c Sat Sep 29 10:49:42 2018
@@ -721,6 +721,30 @@ __m128i test_mm_loadu_si64(void const* A
   return _mm_loadu_si64(A);
 }
 
+__m128i test_mm_loadu_si32(void const* A) {
+  // CHECK-LABEL: test_mm_loadu_si32
+  // CHECK: load i32, i32* %{{.*}}, align 1{{$}}
+  // CHECK: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 1
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 2
+  // CHECK: insertelement <4 x i32> %{{.*}}, i32 0, i32 3
+  return _mm_loadu_si32(A);
+}
+
+__m128i test_mm_loadu_si16(void const* A) {
+  // CHECK-LABEL: test_mm_loadu_si16
+  // CHECK: load i16, i16* %{{.*}}, align 1{{$}}
+  // CHECK: insertelement <8 x i16> undef, i16 %{{.*}}, i32 0
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 1
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 2
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 3
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 4
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 5
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 6
+  // CHECK: insertelement <8 x i16> %{{.*}}, i16 0, i32 7
+  return _mm_loadu_si16(A);
+}
+
 __m128i test_mm_madd_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_madd_epi16
   // CHECK: call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
@@ -1351,6 +1375,30 @@ void test_mm_storeu_si128(__m128i* A, __
   _mm_storeu_si128(A, B);
 }
 
+void test_mm_storeu_si64(void* A, __m128i B) {
+  // CHECK-LABEL: test_mm_storeu_si64
+  // CHECK: [[EXT:%.*]] = extractelement <2 x i64> %{{.*}}, i32 0
+  // CHECK: store i64 [[EXT]], i64* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
+  _mm_storeu_si64(A, B);
+}
+
+void test_mm_storeu_si32(void* A, __m128i B) {
+  // CHECK-LABEL: test_mm_storeu_si32
+  // CHECK: [[EXT:%.*]] = extractelement <4 x i32> %{{.*}}, i32 0
+  // CHECK: store i32 [[EXT]], i32* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
+  _mm_storeu_si32(A, B);
+}
+
+void test_mm_storeu_si16(void* A, __m128i B) {
+  // CHECK-LABEL: test_mm_storeu_si16
+  // CHECK: [[EXT:%.*]] = extractelement <8 x i16> %{{.*}}, i32 0
+  // CHECK: store i16 [[EXT]], i16* %{{.*}}, align 1{{$}}
+  // CHECK-NEXT: ret void
+  _mm_storeu_si16(A, B);
+}
+
 void test_mm_stream_pd(double *A, __m128d B) {
   // CHECK-LABEL: test_mm_stream_pd
   // CHECK: store <2 x double> %{{.*}}, <2 x double>* %{{.*}}, align 16, !nontemporal