r276499 - Add doxygen comments to emmintrin.h's intrinsics.

Fri Jul 22 16:49:37 PDT 2016

Author: kromanova
Date: Fri Jul 22 18:49:37 2016
New Revision: 276499

URL: http://llvm.org/viewvc/llvm-project?rev=276499&view=rev
Log:
Add doxygen comments to emmintrin.h's intrinsics.

Only around 50% of the intrinsics in this file are documented now. The patches for the rest of the intrisics in this file will be send out later.

The doxygen comments are automatically generated based on Sony's intrinsics docu
ment.

I got an OK from Eric Christopher to commit doxygen comments without prior code
review upstream. This patch was internally reviewed by Paul Robinson.


Modified:
    cfe/trunk/lib/Headers/emmintrin.h

Modified: cfe/trunk/lib/Headers/emmintrin.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/emmintrin.h?rev=276499&r1=276498&r2=276499&view=diff
==============================================================================

--- cfe/trunk/lib/Headers/emmintrin.h (original)
+++ cfe/trunk/lib/Headers/emmintrin.h Fri Jul 22 18:49:37 2016
@@ -2154,42 +2154,169 @@ _mm_set1_epi8(char __b)
   return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b };
 }
 
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 64-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKLQDQ / PUNPCKLQDQ instruction.
+///
+/// \param __q0
+///    A 64-bit integral value used to initialize the lower 64 bits of the
+///    result.
+/// \param __q1
+///    A 64-bit integral value used to initialize the upper 64 bits of the
+///    result.
+/// \returns An initialized 128-bit integer vector.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi64(__m64 __q0, __m64 __q1)
 {
   return (__m128i){ (long long)__q0, (long long)__q1 };
 }
 
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 32-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __i0
+///    A 32-bit integral value used to initialize bits [31:0] of the result.
+/// \param __i1
+///    A 32-bit integral value used to initialize bits [63:32] of the result.
+/// \param __i2
+///    A 32-bit integral value used to initialize bits [95:64] of the result.
+/// \param __i3
+///    A 32-bit integral value used to initialize bits [127:96] of the result.
+/// \returns An initialized 128-bit integer vector.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi32(int __i0, int __i1, int __i2, int __i3)
 {
   return (__m128i)(__v4si){ __i0, __i1, __i2, __i3};
 }
 
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 16-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __w0
+///    A 16-bit integral value used to initialize bits [15:0] of the result.
+/// \param __w1
+///    A 16-bit integral value used to initialize bits [31:16] of the result.
+/// \param __w2
+///    A 16-bit integral value used to initialize bits [47:32] of the result.
+/// \param __w3
+///    A 16-bit integral value used to initialize bits [63:48] of the result.
+/// \param __w4
+///    A 16-bit integral value used to initialize bits [79:64] of the result.
+/// \param __w5
+///    A 16-bit integral value used to initialize bits [95:80] of the result.
+/// \param __w6
+///    A 16-bit integral value used to initialize bits [111:96] of the result.
+/// \param __w7
+///    A 16-bit integral value used to initialize bits [127:112] of the result.
+/// \returns An initialized 128-bit integer vector.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7)
 {
   return (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 };
 }
 
+/// \brief Constructs a 128-bit integer vector, initialized in reverse order
+///     with the specified 8-bit integral values.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic is a utility function and does not correspond to a specific
+///    instruction.
+///
+/// \param __b0
+///    An 8-bit integral value used to initialize bits [7:0] of the result.
+/// \param __b1
+///    An 8-bit integral value used to initialize bits [15:8] of the result.
+/// \param __b2
+///    An 8-bit integral value used to initialize bits [23:16] of the result.
+/// \param __b3
+///    An 8-bit integral value used to initialize bits [31:24] of the result.
+/// \param __b4
+///    An 8-bit integral value used to initialize bits [39:32] of the result.
+/// \param __b5
+///    An 8-bit integral value used to initialize bits [47:40] of the result.
+/// \param __b6
+///    An 8-bit integral value used to initialize bits [55:48] of the result.
+/// \param __b7
+///    An 8-bit integral value used to initialize bits [63:56] of the result.
+/// \param __b8
+///    An 8-bit integral value used to initialize bits [71:64] of the result.
+/// \param __b9
+///    An 8-bit integral value used to initialize bits [79:72] of the result.
+/// \param __b10
+///    An 8-bit integral value used to initialize bits [87:80] of the result.
+/// \param __b11
+///    An 8-bit integral value used to initialize bits [95:88] of the result.
+/// \param __b12
+///    An 8-bit integral value used to initialize bits [103:96] of the result.
+/// \param __b13
+///    An 8-bit integral value used to initialize bits [111:104] of the result.
+/// \param __b14
+///    An 8-bit integral value used to initialize bits [119:112] of the result.
+/// \param __b15
+///    An 8-bit integral value used to initialize bits [127:120] of the result.
+/// \returns An initialized 128-bit integer vector.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15)
 {
   return (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 };
 }
 
+/// \brief Creates a 128-bit integer vector initialized to zero.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VXORPS / XORPS instruction.
+///
+/// \returns An initialized 128-bit integer vector with all elements set to
+///    zero.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_setzero_si128(void)
 {
   return (__m128i){ 0LL, 0LL };
 }
 
+/// \brief Stores a 128-bit integer vector to a memory location aligned on a
+///    128-bit boundary.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVAPS / MOVAPS instruction.
+///
+/// \param __p
+///    A pointer to an aligned memory location that will receive the integer
+///    values.
+/// \param __b
+///    A 128-bit integer vector containing the values to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_store_si128(__m128i *__p, __m128i __b)
 {
   *__p = __b;
 }
 
+/// \brief Stores a 128-bit integer vector to an unaligned memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVUPS / MOVUPS instruction.
+///
+/// \param __p
+///    A pointer to a memory location that will receive the integer values.
+/// \param __b
+///    A 128-bit integer vector containing the values to be moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storeu_si128(__m128i *__p, __m128i __b)
 {
@@ -2199,12 +2326,44 @@ _mm_storeu_si128(__m128i *__p, __m128i _
   ((struct __storeu_si128*)__p)->__v = __b;
 }
 
+/// \brief Moves bytes selected by the mask from the first operand to the
+///    specified unaligned memory location. When a mask bit is 1, the
+///    corresponding byte is written, otherwise it is not written. To minimize
+///    caching, the date is flagged as non-temporal (unlikely to be used again
+///    soon). Exception and trap behavior for elements not selected for storage
+///    to memory are implementation dependent.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMASKMOVDQU / MASKMOVDQU instruction.
+///
+/// \param __d
+///    A 128-bit integer vector containing the values to be moved.
+/// \param __n
+///    A 128-bit integer vector containing the mask. The most significant bit of
+///    each byte represents the mask bits.
+/// \param __p
+///    A pointer to an unaligned 128-bit memory location where the specified
+///    values are moved.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
 {
   __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
 }
 
+/// \brief Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to
+///    a memory location.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVLPS / MOVLPS instruction.
+///
+/// \param __p
+///    A pointer to a 64-bit memory location that will receive the lower 64 bits
+///    of the integer vector parameter.
+/// \param __a
+///    A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the
+///    value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_storel_epi64(__m128i *__p, __m128i __a)
 {
@@ -2214,18 +2373,54 @@ _mm_storel_epi64(__m128i *__p, __m128i _
   ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
 }
 
+/// \brief Stores a 128-bit floating point vector of [2 x double] to a 128-bit
+///    aligned memory location. To minimize caching, the data is flagged as
+///    non-temporal (unlikely to be used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
+///
+/// \param __p
+///    A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+///    A vector of [2 x double] containing the 64-bit values to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_pd(double *__p, __m128d __a)
 {
   __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p);
 }
 
+/// \brief Stores a 128-bit integer vector to a 128-bit aligned memory location.
+///    To minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVNTPS / MOVNTPS instruction.
+///
+/// \param __p
+///    A pointer to the 128-bit aligned memory location used to store the value.
+/// \param __a
+///    A 128-bit integer vector containing the values to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_si128(__m128i *__p, __m128i __a)
 {
   __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p);
 }
 
+/// \brief Stores a 32-bit integer value in the specified memory location. To
+///    minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MOVNTI instruction.
+///
+/// \param __p
+///    A pointer to the 32-bit memory location used to store the value.
+/// \param __a
+///    A 32-bit integer containing the value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_si32(int *__p, int __a)
 {
@@ -2233,6 +2428,18 @@ _mm_stream_si32(int *__p, int __a)
 }
 
 #ifdef __x86_64__
+/// \brief Stores a 64-bit integer value in the specified memory location. To
+///    minimize caching, the data is flagged as non-temporal (unlikely to be
+///    used again soon).
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MOVNTIQ instruction.
+///
+/// \param __p
+///    A pointer to the 64-bit memory location used to store the value.
+/// \param __a
+///    A 64-bit integer containing the value to be stored.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_stream_si64(long long *__p, long long __a)
 {
@@ -2240,42 +2447,158 @@ _mm_stream_si64(long long *__p, long lon
 }
 #endif
 
+/// \brief The cache line containing __p is flushed and invalidated from all
+///    caches in the coherency domain.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c CLFLUSH instruction.
+///
+/// \param __p
+///    A pointer to the memory location used to identify the cache line to be
+///    flushed.
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_clflush(void const *__p)
 {
   __builtin_ia32_clflush(__p);
 }
 
+/// \brief Forces strong memory ordering (serialization) between load
+///    instructions preceding this instruction and load instructions following
+///    this instruction, ensuring the system completes all previous loads before
+///    executing subsequent loads.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c LFENCE instruction.
+///
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_lfence(void)
 {
   __builtin_ia32_lfence();
 }
 
+/// \brief Forces strong memory ordering (serialization) between load and store
+///    instructions preceding this instruction and load and store instructions
+///    following this instruction, ensuring that the system completes all
+///    previous memory accesses before executing subsequent memory accesses.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c MFENCE instruction.
+///
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_mfence(void)
 {
   __builtin_ia32_mfence();
 }
 
+/// \brief Converts 16-bit signed integers from both 128-bit integer vector
+///    operands into 8-bit signed integers, and packs the results into the
+///    destination. Positive values greater than 0x7F are saturated to 0x7F.
+///    Negative values less than 0x80 are saturated to 0x80.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPACKSSWB / PACKSSWB instruction.
+///
+/// \param __a
+///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///   a signed integer and is converted to a 8-bit signed integer with
+///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+///   written to the lower 64 bits of the result.
+/// \param __b
+///   A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///   a signed integer and is converted to a 8-bit signed integer with
+///   saturation. Values greater than 0x7F are saturated to 0x7F. Values less
+///   than 0x80 are saturated to 0x80. The converted [8 x i8] values are
+///   written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packs_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Converts 32-bit signed integers from both 128-bit integer vector
+///    operands into 16-bit signed integers, and packs the results into the
+///    destination. Positive values greater than 0x7FFF are saturated to 0x7FFF.
+///    Negative values less than 0x8000 are saturated to 0x8000.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPACKSSDW / PACKSSDW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+///    a signed integer and is converted to a 16-bit signed integer with
+///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+///    are written to the lower 64 bits of the result.
+/// \param __b
+///    A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as
+///    a signed integer and is converted to a 16-bit signed integer with
+///    saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values
+///    less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values
+///    are written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packs_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
 }
 
+/// \brief Converts 16-bit signed integers from both 128-bit integer vector
+///    operands into 8-bit unsigned integers, and packs the results into the
+///    destination. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0x00 are saturated to 0x00.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPACKUSWB / PACKUSWB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///    a signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+///    written to the lower 64 bits of the result.
+/// \param __b
+///    A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as
+///    a signed integer and is converted to an 8-bit unsigned integer with
+///    saturation. Values greater than 0xFF are saturated to 0xFF. Values less
+///    than 0x00 are saturated to 0x00. The converted [8 x i8] values are
+///    written to the higher 64 bits of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the converted values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_packus_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
 }
 
+/// \brief Extracts 16 bits from a 128-bit integer vector of [8 x i16], using
+///    the immediate-value parameter as a selector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPEXTRW / PEXTRW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \param __imm
+///    An immediate value. Bits [3:0] selects values from __a to be assigned to
+///    bits[15:0] of the result.
+///    000: assign values from bits [15:0] of __a.
+///    001: assign values from bits [31:16] of __a.
+///    010: assign values from bits [47:32] of __a.
+///    011: assign values from bits [63:48] of __a.
+///    100: assign values from bits [79:64] of __a.
+///    101: assign values from bits [95:80] of __a.
+///    110: assign values from bits [111:96] of __a.
+///    111: assign values from bits [127:112] of __a.
+/// \returns An integer, whose lower 16 bits are selected from the 128-bit
+///    integer vector parameter and the remaining bits are assigned zeros.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_extract_epi16(__m128i __a, int __imm)
 {
@@ -2283,6 +2606,26 @@ _mm_extract_epi16(__m128i __a, int __imm
   return (unsigned short)__b[__imm & 7];
 }
 
+/// \brief Constructs a 128-bit integer vector by first making a copy of the
+///    128-bit integer vector parameter, and then inserting the lower 16 bits
+///    of an integer parameter into an offset specified by the immediate-value
+///    parameter.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPINSRW / PINSRW instruction.
+///
+/// \param __a
+///    A 128-bit integer vector of [8 x i16]. This vector is copied to the
+///    result and then one of the eight elements in the result is replaced by
+///    the lower 16 bits of __b.
+/// \param __b
+///    An integer. The lower 16 bits of this parameter are written to the
+///    result beginning at an offset specified by __imm.
+/// \param __imm
+///    An immediate value specifying the bit offset in the result at which the
+///    lower 16 bits of__b are written.
+/// \returns A 128-bit integer vector containing the constructed values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_insert_epi16(__m128i __a, int __b, int __imm)
 {
@@ -2291,18 +2634,85 @@ _mm_insert_epi16(__m128i __a, int __b, i
   return (__m128i)__c;
 }
 
+/// \brief Copies the values of the most significant bits from each 8-bit
+///    element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask
+///    value, zero-extends the value, and writes it to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPMOVMSKB / PMOVMSKB instruction.
+///
+/// \param __a
+///    A 128-bit integer vector containing the values with bits to be extracted.
+/// \returns The most significant bits from each 8-bit element in __a, written
+///    to bits [15:0]. The other bits are assigned zeros. 
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_epi8(__m128i __a)
 {
   return __builtin_ia32_pmovmskb128((__v16qi)__a);
 }
 
+/// \brief Constructs a 128-bit integer vector by shuffling four 32-bit
+///    elements of a 128-bit integer vector parameter, using the immediate-value
+///    parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shuffle_epi32(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPSHUFD / PSHUFD instruction.
+///
+/// \param a
+///    A 128-bit integer vector containing the values to be copied.
+/// \param imm
+///    An immediate value containing an 8-bit value specifying which elements to
+///    copy from a. The destinations within the 128-bit destination are assigned
+///    values as follows:
+///    Bits [1:0] are used to assign values to bits [31:0] of the result.
+///    Bits [3:2] are used to assign values to bits [63:32] of the result.
+///    Bits [5:4] are used to assign values to bits [95:64] of the result.
+///    Bits [7:6] are used to assign values to bits [127:96] of the result.
+///    Bit value assignments:
+///    00: assign values from bits [31:0] of a.
+///    01: assign values from bits [63:32] of a.
+///    10: assign values from bits [95:64] of a.
+///    11: assign values from bits [127:96] of a.
+/// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shuffle_epi32(a, imm) __extension__ ({ \
   (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
                                    (__v4si)_mm_undefined_si128(), \
                                    ((imm) >> 0) & 0x3, ((imm) >> 2) & 0x3, \
                                    ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); })
 
+/// \brief Constructs a 128-bit integer vector by shuffling four lower 16-bit
+///    elements of a 128-bit integer vector of [8 x i16], using the immediate
+///    value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPSHUFLW / PSHUFLW instruction.
+///
+/// \param a
+///    A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits
+///    [127:64] of the result.
+/// \param imm
+///    An 8-bit immediate value specifying which elements to copy from a.
+///    Bits[1:0] are used to assign values to bits [15:0] of the result.
+///    Bits[3:2] are used to assign values to bits [31:16] of the result.
+///    Bits[5:4] are used to assign values to bits [47:32] of the result.
+///    Bits[7:6] are used to assign values to bits [63:48] of the result.
+///    Bit value assignments:
+///    00: assign values from bits [15:0] of a.
+///    01: assign values from bits [31:16] of a.
+///    10: assign values from bits [47:32] of a.
+///    11: assign values from bits [63:48] of a.
+/// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                    (__v8hi)_mm_undefined_si128(), \
@@ -2310,6 +2720,33 @@ _mm_movemask_epi8(__m128i __a)
                                    ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3, \
                                    4, 5, 6, 7); })
 
+/// \brief Constructs a 128-bit integer vector by shuffling four upper 16-bit
+///    elements of a 128-bit integer vector of [8 x i16], using the immediate
+///    value parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPSHUFHW / PSHUFHW instruction.
+///
+/// \param a
+///    A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits
+///    [63:0] of the result.
+/// \param imm
+///    An 8-bit immediate value specifying which elements to copy from a.
+///    Bits[1:0] are used to assign values to bits [79:64] of the result.
+///    Bits[3:2] are used to assign values to bits [95:80] of the result.
+///    Bits[5:4] are used to assign values to bits [111:96] of the result.
+///    Bits[7:6] are used to assign values to bits [127:112] of the result.
+///    Bit value assignments:
+///    00: assign values from bits [79:64] of a.
+///    01: assign values from bits [95:80] of a.
+///    10: assign values from bits [111:96] of a.
+///    11: assign values from bits [127:112] of a.
+/// \returns A 128-bit integer vector containing the shuffled values.
 #define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
   (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                    (__v8hi)_mm_undefined_si128(), \
@@ -2319,131 +2756,463 @@ _mm_movemask_epi8(__m128i __a)
                                    4 + (((imm) >> 4) & 0x3), \
                                    4 + (((imm) >> 6) & 0x3)); })
 
+/// \brief Unpacks the high-order (index 8-15) values from two 128-bit vectors
+///    of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKHBW / PUNPCKHBW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8].
+///    Bits [71:64] are written to bits [7:0] of the result
+///    Bits [79:72] are written to bits [23:16] of the result.
+///    Bits [87:80] are written to bits [39:32] of the result.
+///    Bits [95:88] are written to bits [55:48] of the result.
+///    Bits [103:96] are written to bits [71:64] of the result.
+///    Bits [111:104] are written to bits [87:80] of the result.
+///    Bits [119:112] are written to bits [103:96] of the result.
+///    Bits [127:120] are written to bits [119:112] of the result.
+/// \param __b
+///    A 128-bit vector of [16 x i8].
+///    Bits [71:64] are written to bits [15:8] of the result.
+///    Bits [79:72] are written to bits [31:24] of the result.
+///    Bits [87:80] are written to bits [47:40] of the result.
+///    Bits [95:88] are written to bits [63:56] of the result.
+///    Bits [103:96] are written to bits [79:72] of the result.
+///    Bits [111:104] are written to bits [95:88] of the result.
+///    Bits [119:112] are written to bits [111:104] of the result.
+///    Bits [127:120] are written to bits [127:120] of the destination.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
 }
 
+/// \brief Unpacks the high-order (index 4-7) values from two 128-bit vectors of
+///    [8 x i16] and interleaves them into a 128-bit vector of [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKHWD / PUNPCKHWD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+///    Bits [79:64] are written to bits [15:0] of the result.
+///    Bits [95:80] are written to bits [47:32] of the result.
+///    Bits [111:96] are written to bits [79:64] of the result.
+///    Bits [127:112] are written to bits [111:96] of the result.
+/// \param __b
+///    A 128-bit vector of [8 x i16].
+///    Bits [79:64] are written to bits [31:16] of the result.
+///    Bits [95:80] are written to bits [63:48] of the result.
+///    Bits [111:96] are written to bits [95:80] of the result.
+///    Bits [127:112] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
 }
 
+/// \brief Unpacks the high-order (index 2,3) values from two 128-bit vectors of
+///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKHDQ / PUNPCKHDQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32].
+///    Bits [95:64] are written to bits [31:0] of the destination.
+///    Bits [127:96] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32].
+///    Bits [95:64] are written to bits [64:32] of the destination.
+///    Bits [127:96] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
 }
 
+/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
+///    of [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKHQDQ / PUNPCKHQDQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64].
+///    Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x i64].
+///    Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpackhi_epi64(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1);
 }
 
+/// \brief Unpacks the low-order (index 0-7) values from two 128-bit vectors of
+///    [16 x i8] and interleaves them into a 128-bit vector of [16 x i8].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKLBW / PUNPCKLBW instruction.
+///
+/// \param __a
+///    A 128-bit vector of [16 x i8].
+///    Bits [7:0] are written to bits [7:0] of the result.
+///    Bits [15:8] are written to bits [23:16] of the result.
+///    Bits [23:16] are written to bits [39:32] of the result.
+///    Bits [31:24] are written to bits [55:48] of the result.
+///    Bits [39:32] are written to bits [71:64] of the result.
+///    Bits [47:40] are written to bits [87:80] of the result.
+///    Bits [55:48] are written to bits [103:96] of the result.
+///    Bits [63:56] are written to bits [119:112] of the destination.
+/// \param __b
+///    A 128-bit vector of [16 x i8].
+///    Bits [7:0] are written to bits [15:8] of the result.
+///    Bits [15:8] are written to bits [31:24] of the result.
+///    Bits [23:16] are written to bits [47:40] of the result.
+///    Bits [31:24] are written to bits [63:56] of the result.
+///    Bits [39:32] are written to bits [79:72] of the result.
+///    Bits [47:40] are written to bits [95:88] of the result.
+///    Bits [55:48] are written to bits [111:104] of the result.
+///    Bits [63:56] are written to bits [127:120] of the result.
+/// \returns A 128-bit vector of [16 x i8] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi8(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
 }
 
+/// \brief Unpacks the low-order (index 0-3) values from each of the two 128-bit
+///    vectors of [8 x i16] and interleaves them into a 128-bit vector of
+///    [8 x i16].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKLWD / PUNPCKLWD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [8 x i16].
+///    Bits [15:0] are written to bits [15:0] of the result.
+///    Bits [31:16] are written to bits [47:32] of the result.
+///    Bits [47:32] are written to bits [79:64] of the result.
+///    Bits [63:48] are written to bits [111:96] of the result.
+/// \param __b
+///    A 128-bit vector of [8 x i16].
+///    Bits [15:0] are written to bits [31:16] of the result.
+///    Bits [31:16] are written to bits [63:48] of the result.
+///    Bits [47:32] are written to bits [95:80] of the result.
+///    Bits [63:48] are written to bits [127:112] of the result.
+/// \returns A 128-bit vector of [8 x i16] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi16(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
 }
 
+/// \brief Unpacks the low-order (index 0,1) values from two 128-bit vectors of
+///    [4 x i32] and interleaves them into a 128-bit vector of [4 x i32].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKLDQ / PUNPCKLDQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [4 x i32].
+///    Bits [31:0] are written to bits [31:0] of the destination.
+///    Bits [63:32] are written to bits [95:64] of the destination.
+/// \param __b
+///    A 128-bit vector of [4 x i32].
+///    Bits [31:0] are written to bits [64:32] of the destination.
+///    Bits [63:32] are written to bits [127:96] of the destination.
+/// \returns A 128-bit vector of [4 x i32] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi32(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
 }
 
+/// \brief Unpacks the low-order 64-bit elements from two 128-bit vectors of
+///    [2 x i64] and interleaves them into a 128-bit vector of [2 x i64].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VPUNPCKLQDQ / PUNPCKLQDQ instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x i64].
+///    Bits [63:0] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x i64].
+///    Bits [63:0] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x i64] containing the interleaved values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_unpacklo_epi64(__m128i __a, __m128i __b)
 {
   return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0);
 }
 
+/// \brief Returns the lower 64 bits of a 128-bit integer vector as a 64-bit
+///    integer. 
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector operand. The lower 64 bits are moved to the
+///    destination.
+/// \returns A 64-bit integer containing the lower 64 bits of the parameter.
 static __inline__ __m64 __DEFAULT_FN_ATTRS
 _mm_movepi64_pi64(__m128i __a)
 {
   return (__m64)__a[0];
 }
 
+/// \brief Moves the 64-bit operand to a 128-bit integer vector, zeroing the
+///    upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVQ / MOVQ / MOVD instruction.
+///
+/// \param __a
+///    A 64-bit value.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+///    the operand. The upper 64 bits are assigned zeros.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_movpi64_epi64(__m64 __a)
 {
   return (__m128i){ (long long)__a, 0 };
 }
 
+/// \brief Moves the lower 64 bits of a 128-bit integer vector to a 128-bit
+///    integer vector, zeroing the upper bits.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVQ / MOVQ instruction.
+///
+/// \param __a
+///    A 128-bit integer vector operand. The lower 64 bits are moved to the
+///    destination.
+/// \returns A 128-bit integer vector. The lower 64 bits contain the value from
+///    the operand. The upper 64 bits are assigned zeros.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_move_epi64(__m128i __a)
 {
   return __builtin_shufflevector((__v2di)__a, (__m128i){ 0 }, 0, 2);
 }
 
+/// \brief Unpacks the high-order (odd-indexed) values from two 128-bit vectors
+///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
+///    double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VUNPCKHPD / UNPCKHPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+///    Bits [127:64] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double].
+///    Bits [127:64] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_unpackhi_pd(__m128d __a, __m128d __b)
 {
   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1);
 }
 
+/// \brief Unpacks the low-order (even-indexed) values from two 128-bit vectors
+///    of [2 x double] and interleaves them into a 128-bit vector of [2 x
+///    double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VUNPCKLPD / UNPCKLPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double].
+///    Bits [63:0] are written to bits [63:0] of the destination.
+/// \param __b
+///    A 128-bit vector of [2 x double].
+///    Bits [63:0] are written to bits [127:64] of the destination.
+/// \returns A 128-bit vector of [2 x double] containing the interleaved values.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_unpacklo_pd(__m128d __a, __m128d __b)
 {
   return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0);
 }
 
+/// \brief Extracts the sign bits of the double-precision values in the 128-bit
+///    vector of [2 x double], zero-extends the value, and writes it to the
+///    low-order bits of the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c VMOVMSKPD / MOVMSKPD instruction.
+///
+/// \param __a
+///    A 128-bit vector of [2 x double] containing the values with sign bits to
+///    be extracted.
+/// \returns The sign bits from each of the double-precision elements in __a,
+///    written to bits [1:0]. The remaining bits are assigned values of zero.
 static __inline__ int __DEFAULT_FN_ATTRS
 _mm_movemask_pd(__m128d __a)
 {
   return __builtin_ia32_movmskpd((__v2df)__a);
 }
 
+
+/// \brief Constructs a 128-bit floating-point vector of [2 x double] from two
+///    128-bit vector parameters of [2 x double], using the immediate-value
+///     parameter as a specifier.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VSHUFPD / SHUFPD instruction.
+///
+/// \param a
+///    A 128-bit vector of [2 x double].
+/// \param b
+///    A 128-bit vector of [2 x double].
+/// \param i
+///    An 8-bit immediate value. The least significant two bits specify which
+///    elements to copy from a and b:
+///    Bit[0] = 0: lower element of a copied to lower element of result. 
+///    Bit[0] = 1: upper element of a copied to lower element of result.
+///    Bit[1] = 0: lower element of b copied to upper element of result.
+///    Bit[1] = 1: upper element of b copied to upper element of result.
+/// \returns A 128-bit vector of [2 x double] containing the shuffled values.
 #define _mm_shuffle_pd(a, b, i) __extension__ ({ \
   (__m128d)__builtin_shufflevector((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \
                                    0 + (((i) >> 0) & 0x1), \
                                    2 + (((i) >> 1) & 0x1)); })
 
+/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+///    floating-point vector of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+///    bitwise pattern as the parameter.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_castpd_ps(__m128d __a)
 {
   return (__m128)__a;
 }
 
+/// \brief Casts a 128-bit floating-point vector of [2 x double] into a 128-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [2 x double].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+///    parameter.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_castpd_si128(__m128d __a)
 {
   return (__m128i)__a;
 }
 
+/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+///    floating-point vector of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+///    bitwise pattern as the parameter.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_castps_pd(__m128 __a)
 {
   return (__m128d)__a;
 }
 
+/// \brief Casts a 128-bit floating-point vector of [4 x float] into a 128-bit
+///    integer vector.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit floating-point vector of [4 x float].
+/// \returns A 128-bit integer vector containing the same bitwise pattern as the
+///    parameter.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_castps_si128(__m128 __a)
 {
   return (__m128i)__a;
 }
 
+/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
+///    of [4 x float].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [4 x float] containing the same
+///    bitwise pattern as the parameter.
 static __inline__ __m128 __DEFAULT_FN_ATTRS
 _mm_castsi128_ps(__m128i __a)
 {
   return (__m128)__a;
 }
 
+/// \brief Casts a 128-bit integer vector into a 128-bit floating-point vector
+///    of [2 x double].
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic has no corresponding instruction.
+///
+/// \param __a
+///    A 128-bit integer vector.
+/// \returns A 128-bit floating-point vector of [2 x double] containing the same
+///    bitwise pattern as the parameter.
 static __inline__ __m128d __DEFAULT_FN_ATTRS
 _mm_castsi128_pd(__m128i __a)
 {
   return (__m128d)__a;
 }
 
+/// \brief Indicates that a spin loop is being executed for the purposes of
+///    optimizing power consumption during the loop.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PAUSE instruction.
+///
 static __inline__ void __DEFAULT_FN_ATTRS
 _mm_pause(void)
 {