[clang] Update SSE/AVX integer comparison intrinsics to be used in constexpr (PR #155656)

via cfe-commits cfe-commits at lists.llvm.org
Thu Aug 28 07:28:32 PDT 2025


https://github.com/smoke-y updated https://github.com/llvm/llvm-project/pull/155656

From 032c6310682a4dc535fad7d94f07dd585bc24df1 Mon Sep 17 00:00:00 2001
From: smoke-y <SnowSneeze at protonmail.com>
Date: Wed, 27 Aug 2025 22:08:02 +0530
Subject: [PATCH 1/8] Update SSE/AVX integer comparison intrinsics to be used
 in constexpr

---
 clang/lib/Headers/avx2intrin.h          | 16 ++++++++--------
 clang/lib/Headers/emmintrin.h           | 18 +++++++++---------
 clang/lib/Headers/smmintrin.h           |  4 ++--
 clang/test/CodeGen/X86/avx2-builtins.c  |  8 ++++++++
 clang/test/CodeGen/X86/sse2-builtins.c  |  9 +++++++++
 clang/test/CodeGen/X86/sse41-builtins.c |  1 +
 clang/test/CodeGen/X86/sse42-builtins.c |  2 ++
 7 files changed, 39 insertions(+), 19 deletions(-)
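The net effect of this patch is that the listed comparison intrinsics become usable in constant expressions. As a rough usage sketch (not part of the patch; it assumes a clang build carrying these header changes, compiled as C++ for x86 with SSE2 enabled), the element-wise result can then be checked entirely at compile time:

    #include <immintrin.h>

    // Equal lanes yield all-ones (-1), unequal lanes yield 0.
    constexpr __m128i lhs = (__m128i)(__v4si){1, 2, 3, 4};
    constexpr __m128i rhs = (__m128i)(__v4si){1, 0, 3, 0};
    constexpr __m128i eq  = _mm_cmpeq_epi32(lhs, rhs);

    static_assert(((__v4si)eq)[0] == -1 && ((__v4si)eq)[1] == 0 &&
                  ((__v4si)eq)[2] == -1 && ((__v4si)eq)[3] == 0, "");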

diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index ce5b2b7544d8c..05bd15385d149 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -637,7 +637,7 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
 /// \param __b
 ///    A 256-bit integer vector containing one of the inputs.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v32qi)__a == (__v32qi)__b);
@@ -663,7 +663,7 @@ _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v16hi)__a == (__v16hi)__b);
@@ -689,7 +689,7 @@ _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v8si)__a == (__v8si)__b);
@@ -715,7 +715,7 @@ _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v4di)__a == (__v4di)__b);
@@ -741,7 +741,7 @@ _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit integer vector containing one of the inputs.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
 {
   /* This function always performs a signed comparison, but __v32qi is a char
@@ -769,7 +769,7 @@ _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v16hi)__a > (__v16hi)__b);
@@ -795,7 +795,7 @@ _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v8si)__a > (__v8si)__b);
@@ -821,7 +821,7 @@ _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
 {
   return (__m256i)((__v4di)__a > (__v4di)__b);
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 8b6b62458dac1..7f69019e01b06 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -3090,7 +3090,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi8(__m128i __a,
                                                             __m128i __b) {
   return (__m128i)((__v16qi)__a == (__v16qi)__b);
 }
@@ -3109,7 +3109,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi8(__m128i __a,
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi16(__m128i __a,
                                                              __m128i __b) {
   return (__m128i)((__v8hi)__a == (__v8hi)__b);
 }
@@ -3128,7 +3128,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi16(__m128i __a,
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi32(__m128i __a,
                                                              __m128i __b) {
   return (__m128i)((__v4si)__a == (__v4si)__b);
 }
@@ -3148,7 +3148,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi32(__m128i __a,
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi8(__m128i __a,
                                                             __m128i __b) {
   /* This function always performs a signed comparison, but __v16qi is a char
      which may be signed or unsigned, so use __v16qs. */
@@ -3170,7 +3170,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi8(__m128i __a,
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi16(__m128i __a,
                                                              __m128i __b) {
   return (__m128i)((__v8hi)__a > (__v8hi)__b);
 }
@@ -3190,7 +3190,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi16(__m128i __a,
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi32(__m128i __a,
                                                              __m128i __b) {
   return (__m128i)((__v4si)__a > (__v4si)__b);
 }
@@ -3210,7 +3210,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi32(__m128i __a,
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmplt_epi8(__m128i __a,
                                                             __m128i __b) {
   return _mm_cmpgt_epi8(__b, __a);
 }
@@ -3230,7 +3230,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi8(__m128i __a,
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmplt_epi16(__m128i __a,
                                                              __m128i __b) {
   return _mm_cmpgt_epi16(__b, __a);
 }
@@ -3250,7 +3250,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi16(__m128i __a,
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmplt_epi32(__m128i __a,
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmplt_epi32(__m128i __a,
                                                              __m128i __b) {
   return _mm_cmpgt_epi32(__b, __a);
 }
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index 57d0d329312af..3f44c786fb75f 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -1211,7 +1211,7 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
 /// \param __V2
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpeq_epi64(__m128i __V1,
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi64(__m128i __V1,
                                                              __m128i __V2) {
   return (__m128i)((__v2di)__V1 == (__v2di)__V2);
 }
@@ -2338,7 +2338,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
 /// \param __V2
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cmpgt_epi64(__m128i __V1,
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi64(__m128i __V1,
                                                              __m128i __V2) {
   return (__m128i)((__v2di)__V1 > (__v2di)__V2);
 }
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 49e35230ba225..84a4db9695b88 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -287,48 +287,56 @@ __m256i test_mm256_cmpeq_epi8(__m256i a, __m256i b) {
   // CHECK: icmp eq <32 x i8>
   return _mm256_cmpeq_epi8(a, b);
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_cmpeq_epi8((__m256i)(__v32qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}, (__m256i)(__v32qi){10,2,6,4,5,12,14,8,9,20,22,12,26,14,30,16,17,36,38,20,42,22,46,24,50,26,54,28,58,30,62,32}), 0,-1,0,-1,-1,0,0,-1,-1,0,0,-1,0,-1,0,-1,-1,0,0,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0,-1));
 
 __m256i test_mm256_cmpeq_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_cmpeq_epi16
   // CHECK: icmp eq <16 x i16>
   return _mm256_cmpeq_epi16(a, b);
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_cmpeq_epi16((__m256i)(__v16hi){+1, -2, +3, -4, +5, -6, +7, -8, +9, -10, +11, -12, +13, -14, +15, -16}, (__m256i)(__v16hi){-10, -2, +6, -4, +5, -12, +14, -8, +9, -20, +22, -12, +26, -14, +30, -16}), 0, -1, 0, -1, -1, 0, 0, -1, -1, 0, 0, -1, 0, -1, 0, -1));
 
 __m256i test_mm256_cmpeq_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_cmpeq_epi32
   // CHECK: icmp eq <8 x i32>
   return _mm256_cmpeq_epi32(a, b);
 }
+TEST_CONSTEXPR(match_v8si(_mm256_cmpeq_epi32((__m256i)(__v8si){+1, -2, +3, -4, +5, -6, +7, -8}, (__m256i)(__v8si){-10, -2, +6, -4, +5, -12, +14, -8}), 0, -1, 0, -1, -1, 0, 0, -1));
 
 __m256i test_mm256_cmpeq_epi64(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_cmpeq_epi64
   // CHECK: icmp eq <4 x i64>
   return _mm256_cmpeq_epi64(a, b);
 }
+TEST_CONSTEXPR(match_v4di(_mm256_cmpeq_epi64((__m256i)(__v4di){+1, -2, +3, -4}, (__m256i)(__v4di){-10, -2, +6, -4}), 0, -1, 0, -1));
 
 __m256i test_mm256_cmpgt_epi8(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_cmpgt_epi8
   // CHECK: icmp sgt <32 x i8>
   return _mm256_cmpgt_epi8(a, b);
 }
+TEST_CONSTEXPR(match_v32qi(_mm256_cmpgt_epi8((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, (__m256i)(__v32qi){10, 2, 6, 5, 30, 7, 8, 1, 20, 3, 12, 8, 25, 10, 9, 2, 10, 2, 6, 5, 30, 7, 8, 1, 20, 3, 12, 8, 25, 10, 9, 2}), 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, -1, -1));
 
 __m256i test_mm256_cmpgt_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_cmpgt_epi16
   // CHECK: icmp sgt <16 x i16>
   return _mm256_cmpgt_epi16(a, b);
 }
+TEST_CONSTEXPR(match_v16hi(_mm256_cmpgt_epi16((__m256i)(__v16hi){+1, -2, +3, -4, +5, -6, +7, -8, +1, -2, +3, -4, +5, -6, +7, -8}, (__m256i)(__v16hi){-10, -2, +6, -5, +30, -7, +8, -1, -10, -2, +6, -5, +30, -7, +8, -1}), -1, 0, 0, -1, 0, -1, 0, 0, -1, 0, 0, -1, 0, -1, 0, 0));
 
 __m256i test_mm256_cmpgt_epi32(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_cmpgt_epi32
   // CHECK: icmp sgt <8 x i32>
   return _mm256_cmpgt_epi32(a, b);
 }
+TEST_CONSTEXPR(match_v8si(_mm256_cmpgt_epi32((__m256i)(__v8si){+1, -2, +3, -4, +5, -6, +7, -8}, (__m256i)(__v8si){-10, -2, +6, -5, +30, -7, +8, -1}), -1, 0, 0, -1, 0, -1, 0, 0));
 
 __m256i test_mm256_cmpgt_epi64(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_cmpgt_epi64
   // CHECK: icmp sgt <4 x i64>
   return _mm256_cmpgt_epi64(a, b);
 }
+TEST_CONSTEXPR(match_v4di(_mm256_cmpgt_epi64((__m256i)(__v4di){+1, -2, +3, -4}, (__m256i)(__v4di){-10, -2, +6, -5}), -1, 0, 0, -1));
 
 __m256i test_mm256_cvtepi8_epi16(__m128i a) {
   // CHECK-LABEL: test_mm256_cvtepi8_epi16
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index 38d5e877a5036..a578f04c97c71 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -243,18 +243,21 @@ __m128i test_mm_cmpeq_epi8(__m128i A, __m128i B) {
   // CHECK: icmp eq <16 x i8>
   return _mm_cmpeq_epi8(A, B);
 }
+TEST_CONSTEXPR(match_v16qi(_mm_cmpeq_epi8((__m128i)(__v16qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}, (__m128i)(__v16qi){10,2,6,4,5,12,14,8,9,20,22,12,26,14,30,16}), 0,-1,0,-1,-1,0,0,-1,-1,0,0,-1,0,-1,0,-1));
 
 __m128i test_mm_cmpeq_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpeq_epi16
   // CHECK: icmp eq <8 x i16>
   return _mm_cmpeq_epi16(A, B);
 }
+TEST_CONSTEXPR(match_v8hi(_mm_cmpeq_epi16((__m128i)(__v8hi){+1, -2, +3, -4, +5, -6, +7, -8}, (__m128i)(__v8hi){-10, -2, +6, -4, +5, -12, +14, -8}), 0, -1, 0, -1, -1, 0, 0, -1));
 
 __m128i test_mm_cmpeq_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpeq_epi32
   // CHECK: icmp eq <4 x i32>
   return _mm_cmpeq_epi32(A, B);
 }
+TEST_CONSTEXPR(match_v4si(_mm_cmpeq_epi32((__m128i)(__v4si){+1, -2, +3, -4}, (__m128i)(__v4si){-10, -2, +6, -4}), 0, -1, 0, -1));
 
 __m128d test_mm_cmpeq_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpeq_pd
@@ -293,18 +296,21 @@ __m128i test_mm_cmpgt_epi8(__m128i A, __m128i B) {
   // CHECK: icmp sgt <16 x i8>
   return _mm_cmpgt_epi8(A, B);
 }
+TEST_CONSTEXPR(match_v16qi(_mm_cmpgt_epi8((__m128i)(__v16qi){15,2,8,4,12,6,20,8,25,10,30,12,35,14,40,16}, (__m128i)(__v16qi){10,2,6,4,5,12,14,8,9,20,22,12,26,14,30,16}), -1,0,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0));
 
 __m128i test_mm_cmpgt_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpgt_epi16
   // CHECK: icmp sgt <8 x i16>
   return _mm_cmpgt_epi16(A, B);
 }
+TEST_CONSTEXPR(match_v8hi(_mm_cmpgt_epi16((__m128i)(__v8hi){15,2,8,4,12,6,20,8}, (__m128i)(__v8hi){10,2,6,4,5,12,14,8}), -1,0,-1,0,-1,0,-1,0));
 
 __m128i test_mm_cmpgt_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpgt_epi32
   // CHECK: icmp sgt <4 x i32>
   return _mm_cmpgt_epi32(A, B);
 }
+TEST_CONSTEXPR(match_v4si(_mm_cmpgt_epi32((__m128i)(__v4si){15,2,8,4}, (__m128i)(__v4si){10,2,6,4}), -1,0,-1,0));
 
 __m128d test_mm_cmpgt_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmpgt_pd
@@ -343,18 +349,21 @@ __m128i test_mm_cmplt_epi8(__m128i A, __m128i B) {
   // CHECK: icmp sgt <16 x i8>
   return _mm_cmplt_epi8(A, B);
 }
+TEST_CONSTEXPR(match_v16qi(_mm_cmplt_epi8((__m128i)(__v16qi){5,2,3,4,1,6,7,8,9,5,11,12,13,10,15,8}, (__m128i)(__v16qi){10,2,6,4,5,12,14,8,9,20,22,12,26,14,30,16}), -1, 0, -1, 0, -1, -1, -1, 0, 0, -1, -1, 0, -1, -1, -1, -1));
 
 __m128i test_mm_cmplt_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmplt_epi16
   // CHECK: icmp sgt <8 x i16>
   return _mm_cmplt_epi16(A, B);
 }
+TEST_CONSTEXPR(match_v8hi(_mm_cmplt_epi16((__m128i)(__v8hi){5,2,3,4,1,6,7,8}, (__m128i)(__v8hi){10,2,6,4,5,12,14,8}), -1, 0, -1, 0, -1, -1, -1, 0));
 
 __m128i test_mm_cmplt_epi32(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmplt_epi32
   // CHECK: icmp sgt <4 x i32>
   return _mm_cmplt_epi32(A, B);
 }
+TEST_CONSTEXPR(match_v4si(_mm_cmplt_epi32((__m128i)(__v4si){5,2,3,4}, (__m128i)(__v4si){10,2,6,4}), -1,0,-1,0));
 
 __m128d test_mm_cmplt_pd(__m128d A, __m128d B) {
   // CHECK-LABEL: test_mm_cmplt_pd
diff --git a/clang/test/CodeGen/X86/sse41-builtins.c b/clang/test/CodeGen/X86/sse41-builtins.c
index 500b780d49057..4f9f1530ce427 100644
--- a/clang/test/CodeGen/X86/sse41-builtins.c
+++ b/clang/test/CodeGen/X86/sse41-builtins.c
@@ -79,6 +79,7 @@ __m128i test_mm_cmpeq_epi64(__m128i A, __m128i B) {
   // CHECK: sext <2 x i1> %{{.*}} to <2 x i64>
   return _mm_cmpeq_epi64(A, B);
 }
+TEST_CONSTEXPR(match_v2di(_mm_cmpeq_epi64((__m128i)(__v2di){+1, -8}, (__m128i)(__v2di){-10, -8}), 0, -1));
 
 __m128i test_mm_cvtepi8_epi16(__m128i a) {
   // CHECK-LABEL: test_mm_cvtepi8_epi16
diff --git a/clang/test/CodeGen/X86/sse42-builtins.c b/clang/test/CodeGen/X86/sse42-builtins.c
index d0c0cce33e1d0..aa598b8f78069 100644
--- a/clang/test/CodeGen/X86/sse42-builtins.c
+++ b/clang/test/CodeGen/X86/sse42-builtins.c
@@ -9,6 +9,7 @@
 
 
 #include <immintrin.h>
+#include "builtin_test_helpers.h"
 
 // NOTE: This should match the tests in llvm/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
 
@@ -59,6 +60,7 @@ __m128i test_mm_cmpgt_epi64(__m128i A, __m128i B) {
   // CHECK: icmp sgt <2 x i64>
   return _mm_cmpgt_epi64(A, B);
 }
+TEST_CONSTEXPR(match_v2di(_mm_cmpgt_epi64((__m128i)(__v2di){+1, -8}, (__m128i)(__v2di){-10, -8}), -1, 0));
 
 int test_mm_cmpistra(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpistra

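(For reference between the patches: the TEST_CONSTEXPR/match_* checks added above come from clang/test/CodeGen/X86/builtin_test_helpers.h. A hedged approximation of their shape, with illustrative names, is a constexpr element-wise comparison wrapped in a static_assert:

    #include <immintrin.h>

    // Illustrative only; the real helpers live in builtin_test_helpers.h.
    constexpr bool match_v4si_like(__m128i v, int a, int b, int c, int d) {
      return ((__v4si)v)[0] == a && ((__v4si)v)[1] == b &&
             ((__v4si)v)[2] == c && ((__v4si)v)[3] == d;
    }

    static_assert(match_v4si_like(_mm_cmplt_epi32((__m128i)(__v4si){5, 2, 3, 4},
                                                  (__m128i)(__v4si){10, 2, 6, 4}),
                                  -1, 0, -1, 0), "");

so a regression in the constexpr evaluation path fails the build rather than only the FileCheck run.)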
From 75fc7f029a1d8e1825a1b81365f947fcff36d07b Mon Sep 17 00:00:00 2001
From: smoke-y <SnowSneeze at protonmail.com>
Date: Thu, 28 Aug 2025 16:41:55 +0530
Subject: [PATCH 2/8] qi -> qs

---
 clang/test/CodeGen/X86/avx2-builtins.c | 5 ++++-
 clang/test/CodeGen/X86/sse2-builtins.c | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)
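Why the byte tests switch from __v16qi to __v16qs: __v16qi is based on plain char, whose signedness is target dependent, while __v16qs is explicitly signed char, which is what the epi8 comparisons operate on internally. A hedged illustration (the _like typedef names are mine, mirroring the ones in the clang headers):

    typedef char        v16qi_like __attribute__((__vector_size__(16))); // plain char: signedness varies by target
    typedef signed char v16qs_like __attribute__((__vector_size__(16))); // always signed, so an initializer
                                                                         // like -2 means -2 on every target

With the signed-char vectors, the test inputs can include negative bytes and still describe the same values everywhere.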

diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 84a4db9695b88..7e25da35020c5 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -287,7 +287,10 @@ __m256i test_mm256_cmpeq_epi8(__m256i a, __m256i b) {
   // CHECK: icmp eq <32 x i8>
   return _mm256_cmpeq_epi8(a, b);
 }
-TEST_CONSTEXPR(match_v32qi(_mm256_cmpeq_epi8((__m256i)(__v32qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}, (__m256i)(__v32qi){10,2,6,4,5,12,14,8,9,20,22,12,26,14,30,16,17,36,38,20,42,22,46,24,50,26,54,28,58,30,62,32}), 0,-1,0,-1,-1,0,0,-1,-1,0,0,-1,0,-1,0,-1,-1,0,0,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0,-1));
+TEST_CONSTEXPR(match_v16qi(_mm_cmpeq_epi8(
+    (__m128i)(__v16qs){1,-2,3,-4,-5,6,-7,8,-9,10,-11,12,-13,14,-15,16},
+    (__m128i)(__v16qs){10,-2,6,-4,-5,12,-14,8,-9,20,-22,12,-26,14,-30,16}),
+    0,-1,0,-1,-1,0,0,-1,-1,0,0,-1,0,-1,0,-1));
 
 __m256i test_mm256_cmpeq_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_cmpeq_epi16
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index a578f04c97c71..17030d0b7920e 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -243,7 +243,7 @@ __m128i test_mm_cmpeq_epi8(__m128i A, __m128i B) {
   // CHECK: icmp eq <16 x i8>
   return _mm_cmpeq_epi8(A, B);
 }
-TEST_CONSTEXPR(match_v16qi(_mm_cmpeq_epi8((__m128i)(__v16qi){1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16}, (__m128i)(__v16qi){10,2,6,4,5,12,14,8,9,20,22,12,26,14,30,16}), 0,-1,0,-1,-1,0,0,-1,-1,0,0,-1,0,-1,0,-1));
+TEST_CONSTEXPR(match_v16qi(_mm_cmpeq_epi8((__m128i)(__v16qs){1,-2,3,-4,-5,6,-7,8,-9,10,-11,12,-13,14,-15,16}, (__m128i)(__v16qs){10,-2,6,-4,-5,12,-14,8,-9,20,-22,12,-26,14,-30,16}), 0,-1,0,-1,-1,0,0,-1,-1,0,0,-1,0,-1,0,-1));
 
 __m128i test_mm_cmpeq_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpeq_epi16

From 5b8a7f11ee2f454ca52c47ad3c19267ff58748ee Mon Sep 17 00:00:00 2001
From: smoke-y <SnowSneeze at protonmail.com>
Date: Thu, 28 Aug 2025 16:53:55 +0530
Subject: [PATCH 3/8] clang-format on avx2, emmintrin, smmintrin

---
 clang/lib/Headers/avx2intrin.h | 852 +++++++++++++++------------------
 clang/lib/Headers/emmintrin.h  |  36 +-
 clang/lib/Headers/smmintrin.h  |   8 +-
 3 files changed, 396 insertions(+), 500 deletions(-)

diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 05bd15385d149..7d617e519d7b9 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -97,8 +97,8 @@
 ///     An unsigned immediate value specifying the starting positions of the
 ///     bytes to operate on.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-#define _mm256_mpsadbw_epu8(X, Y, M) \
-  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
+#define _mm256_mpsadbw_epu8(X, Y, M)                                           \
+  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X),                   \
                                       (__v32qi)(__m256i)(Y), (int)(M)))
 
 /// Computes the absolute value of each signed byte in the 256-bit integer
@@ -112,10 +112,8 @@
 /// \param __a
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_abs_epi8(__m256i __a)
-{
-    return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi8(__m256i __a) {
+  return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
 }
 
 /// Computes the absolute value of each signed 16-bit element in the 256-bit
@@ -129,10 +127,8 @@ _mm256_abs_epi8(__m256i __a)
 /// \param __a
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_abs_epi16(__m256i __a)
-{
-    return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi16(__m256i __a) {
+  return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
 }
 
 /// Computes the absolute value of each signed 32-bit element in the 256-bit
@@ -146,10 +142,8 @@ _mm256_abs_epi16(__m256i __a)
 /// \param __a
 ///    A 256-bit vector of [8 x i32].
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_abs_epi32(__m256i __a)
-{
-    return (__m256i)__builtin_elementwise_abs((__v8si)__a);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi32(__m256i __a) {
+  return (__m256i)__builtin_elementwise_abs((__v8si)__a);
 }
 
 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
@@ -178,8 +172,7 @@ _mm256_abs_epi32(__m256i __a)
 ///    result[255:192].
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi16(__m256i __a, __m256i __b)
-{
+_mm256_packs_epi16(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -210,8 +203,7 @@ _mm256_packs_epi16(__m256i __a, __m256i __b)
 ///    result[255:192].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi32(__m256i __a, __m256i __b)
-{
+_mm256_packs_epi32(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
 }
 
@@ -241,8 +233,7 @@ _mm256_packs_epi32(__m256i __a, __m256i __b)
 ///    result[255:192].
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi16(__m256i __a, __m256i __b)
-{
+_mm256_packus_epi16(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -273,9 +264,8 @@ _mm256_packus_epi16(__m256i __a, __m256i __b)
 ///    result[255:192].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi32(__m256i __V1, __m256i __V2)
-{
-  return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
+_mm256_packus_epi32(__m256i __V1, __m256i __V2) {
+  return (__m256i)__builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
 }
 
 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
@@ -291,9 +281,8 @@ _mm256_packus_epi32(__m256i __V1, __m256i __V2)
 /// \param __b
 ///    A 256-bit integer vector containing one of the source operands.
 /// \returns A 256-bit integer vector containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_add_epi8(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi8(__m256i __a,
+                                                                __m256i __b) {
   return (__m256i)((__v32qu)__a + (__v32qu)__b);
 }
 
@@ -310,9 +299,8 @@ _mm256_add_epi8(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_add_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi16(__m256i __a,
+                                                                 __m256i __b) {
   return (__m256i)((__v16hu)__a + (__v16hu)__b);
 }
 
@@ -329,9 +317,8 @@ _mm256_add_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_add_epi32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi32(__m256i __a,
+                                                                 __m256i __b) {
   return (__m256i)((__v8su)__a + (__v8su)__b);
 }
 
@@ -348,9 +335,8 @@ _mm256_add_epi32(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [4 x i64] containing one of the source operands.
 /// \returns A 256-bit vector of [4 x i64] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_add_epi64(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi64(__m256i __a,
+                                                                 __m256i __b) {
   return (__m256i)((__v4du)__a + (__v4du)__b);
 }
 
@@ -448,8 +434,8 @@ _mm256_adds_epu16(__m256i __a, __m256i __b) {
 /// \param n
 ///    An immediate value specifying the number of bytes to shift.
 /// \returns A 256-bit integer vector containing the result.
-#define _mm256_alignr_epi8(a, b, n) \
-  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
+#define _mm256_alignr_epi8(a, b, n)                                            \
+  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a),                   \
                                       (__v32qi)(__m256i)(b), (n)))
 
 /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
@@ -465,8 +451,7 @@ _mm256_adds_epu16(__m256i __a, __m256i __b) {
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_and_si256(__m256i __a, __m256i __b)
-{
+_mm256_and_si256(__m256i __a, __m256i __b) {
   return (__m256i)((__v4du)__a & (__v4du)__b);
 }
 
@@ -483,8 +468,7 @@ _mm256_and_si256(__m256i __a, __m256i __b)
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_andnot_si256(__m256i __a, __m256i __b)
-{
+_mm256_andnot_si256(__m256i __a, __m256i __b) {
   return (__m256i)(~(__v4du)__a & (__v4du)__b);
 }
 
@@ -508,9 +492,8 @@ _mm256_andnot_si256(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_avg_epu8(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu8(__m256i __a,
+                                                                __m256i __b) {
   return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
 }
 
@@ -534,9 +517,8 @@ _mm256_avg_epu8(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_avg_epu16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu16(__m256i __a,
+                                                                 __m256i __b) {
   return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -570,10 +552,9 @@ _mm256_avg_epu16(__m256i __a, __m256i __b)
 ///    \a __V2.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
-{
+_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) {
   return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
-                                              (__v32qi)__M);
+                                             (__v32qi)__M);
 }
 
 /// Merges 16-bit integer values from either of the two 256-bit vectors
@@ -613,8 +594,8 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
 ///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
 ///    elements 1 and 9, and so forth.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-#define _mm256_blend_epi16(V1, V2, M) \
-  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
+#define _mm256_blend_epi16(V1, V2, M)                                          \
+  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1),                  \
                                       (__v16hi)(__m256i)(V2), (int)(M)))
 
 /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
@@ -638,8 +619,7 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
 ///    A 256-bit integer vector containing one of the inputs.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
-{
+_mm256_cmpeq_epi8(__m256i __a, __m256i __b) {
   return (__m256i)((__v32qi)__a == (__v32qi)__b);
 }
 
@@ -664,8 +644,7 @@ _mm256_cmpeq_epi8(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
-{
+_mm256_cmpeq_epi16(__m256i __a, __m256i __b) {
   return (__m256i)((__v16hi)__a == (__v16hi)__b);
 }
 
@@ -690,8 +669,7 @@ _mm256_cmpeq_epi16(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
-{
+_mm256_cmpeq_epi32(__m256i __a, __m256i __b) {
   return (__m256i)((__v8si)__a == (__v8si)__b);
 }
 
@@ -716,8 +694,7 @@ _mm256_cmpeq_epi32(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
-{
+_mm256_cmpeq_epi64(__m256i __a, __m256i __b) {
   return (__m256i)((__v4di)__a == (__v4di)__b);
 }
 
@@ -742,8 +719,7 @@ _mm256_cmpeq_epi64(__m256i __a, __m256i __b)
 ///    A 256-bit integer vector containing one of the inputs.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
-{
+_mm256_cmpgt_epi8(__m256i __a, __m256i __b) {
   /* This function always performs a signed comparison, but __v32qi is a char
      which may be signed or unsigned, so use __v32qs. */
   return (__m256i)((__v32qs)__a > (__v32qs)__b);
@@ -770,8 +746,7 @@ _mm256_cmpgt_epi8(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
-{
+_mm256_cmpgt_epi16(__m256i __a, __m256i __b) {
   return (__m256i)((__v16hi)__a > (__v16hi)__b);
 }
 
@@ -796,8 +771,7 @@ _mm256_cmpgt_epi16(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
-{
+_mm256_cmpgt_epi32(__m256i __a, __m256i __b) {
   return (__m256i)((__v8si)__a > (__v8si)__b);
 }
 
@@ -822,8 +796,7 @@ _mm256_cmpgt_epi32(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
-{
+_mm256_cmpgt_epi64(__m256i __a, __m256i __b) {
   return (__m256i)((__v4di)__a > (__v4di)__b);
 }
 
@@ -857,10 +830,9 @@ _mm256_cmpgt_epi64(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadd_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi16(__m256i __a,
+                                                                  __m256i __b) {
+  return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
@@ -889,10 +861,9 @@ _mm256_hadd_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadd_epi32(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi32(__m256i __a,
+                                                                  __m256i __b) {
+  return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
 }
 
 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
@@ -925,9 +896,8 @@ _mm256_hadd_epi32(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the sums.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadds_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
+_mm256_hadds_epi16(__m256i __a, __m256i __b) {
+  return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
@@ -960,10 +930,9 @@ _mm256_hadds_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hsub_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi16(__m256i __a,
+                                                                  __m256i __b) {
+  return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
@@ -992,10 +961,9 @@ _mm256_hsub_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hsub_epi32(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi32(__m256i __a,
+                                                                  __m256i __b) {
+  return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
 }
 
 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
@@ -1029,9 +997,8 @@ _mm256_hsub_epi32(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the differences.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hsubs_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
+_mm256_hsubs_epi16(__m256i __a, __m256i __b) {
+  return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
@@ -1059,9 +1026,8 @@ _mm256_hsubs_epi16(__m256i __a, __m256i __b)
 ///    A 256-bit vector containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maddubs_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
+_mm256_maddubs_epi16(__m256i __a, __m256i __b) {
+  return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
 }
 
 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
@@ -1090,9 +1056,8 @@ _mm256_maddubs_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_madd_epi16(__m256i __a,
+                                                                  __m256i __b) {
   return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -1109,9 +1074,8 @@ _mm256_madd_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epi8(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi8(__m256i __a,
+                                                                __m256i __b) {
   return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
 }
 
@@ -1128,9 +1092,8 @@ _mm256_max_epi8(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi16(__m256i __a,
+                                                                 __m256i __b) {
   return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -1147,9 +1110,8 @@ _mm256_max_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32].
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epi32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi32(__m256i __a,
+                                                                 __m256i __b) {
   return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
 }
 
@@ -1166,9 +1128,8 @@ _mm256_max_epi32(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epu8(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu8(__m256i __a,
+                                                                __m256i __b) {
   return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
 }
 
@@ -1185,9 +1146,8 @@ _mm256_max_epu8(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epu16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu16(__m256i __a,
+                                                                 __m256i __b) {
   return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
 }
 
@@ -1204,9 +1164,8 @@ _mm256_max_epu16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32].
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_max_epu32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu32(__m256i __a,
+                                                                 __m256i __b) {
   return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
 }
 
@@ -1223,9 +1182,8 @@ _mm256_max_epu32(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epi8(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi8(__m256i __a,
+                                                                __m256i __b) {
   return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
 }
 
@@ -1242,9 +1200,8 @@ _mm256_min_epi8(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi16(__m256i __a,
+                                                                 __m256i __b) {
   return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -1261,9 +1218,8 @@ _mm256_min_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32].
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epi32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi32(__m256i __a,
+                                                                 __m256i __b) {
   return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
 }
 
@@ -1280,9 +1236,8 @@ _mm256_min_epi32(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epu8(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu8(__m256i __a,
+                                                                __m256i __b) {
   return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
 }
 
@@ -1299,9 +1254,8 @@ _mm256_min_epu8(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epu16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu16(__m256i __a,
+                                                                 __m256i __b) {
   return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
 }
 
@@ -1318,9 +1272,8 @@ _mm256_min_epu16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32].
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_min_epu32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu32(__m256i __a,
+                                                                 __m256i __b) {
   return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
 }
 
@@ -1341,9 +1294,7 @@ _mm256_min_epu32(__m256i __a, __m256i __b)
 /// \param __a
 ///    A 256-bit integer vector containing the source bytes.
 /// \returns The 32-bit integer mask.
-static __inline__ int __DEFAULT_FN_ATTRS256
-_mm256_movemask_epi8(__m256i __a)
-{
+static __inline__ int __DEFAULT_FN_ATTRS256 _mm256_movemask_epi8(__m256i __a) {
   return __builtin_ia32_pmovmskb256((__v32qi)__a);
 }
 
@@ -1371,7 +1322,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepi8_epi16(__m128i __V) {
   /* This function always performs a signed extension, but __v16qi is a char
      which may be signed or unsigned, so use __v16qs. */
-  return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
+  return (__m256i) __builtin_convertvector((__v16qs)__V, __v16hi);
 }
 
 /// Sign-extends bytes from the lower half of the 128-bit integer vector in
@@ -1398,7 +1349,10 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepi8_epi32(__m128i __V) {
   /* This function always performs a signed extension, but __v16qi is a char
      which may be signed or unsigned, so use __v16qs. */
-  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
+  return (__m256i) __builtin_convertvector(
+      __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
+                              7),
+      __v8si);
 }
 
 /// Sign-extends the first four bytes from the 128-bit integer vector in
@@ -1424,7 +1378,8 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepi8_epi64(__m128i __V) {
   /* This function always performs a signed extension, but __v16qi is a char
      which may be signed or unsigned, so use __v16qs. */
-  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
+  return (__m256i) __builtin_convertvector(
+      __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
 }
 
 /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
@@ -1449,7 +1404,7 @@ _mm256_cvtepi8_epi64(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepi16_epi32(__m128i __V) {
-  return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
+  return (__m256i) __builtin_convertvector((__v8hi)__V, __v8si);
 }
 
 /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
@@ -1473,7 +1428,8 @@ _mm256_cvtepi16_epi32(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepi16_epi64(__m128i __V) {
-  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
+  return (__m256i) __builtin_convertvector(
+      __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
 }
 
 /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
@@ -1497,7 +1453,7 @@ _mm256_cvtepi16_epi64(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepi32_epi64(__m128i __V) {
-  return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
+  return (__m256i) __builtin_convertvector((__v4si)__V, __v4di);
 }
 
 /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
@@ -1522,7 +1478,7 @@ _mm256_cvtepi32_epi64(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepu8_epi16(__m128i __V) {
-  return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
+  return (__m256i) __builtin_convertvector((__v16qu)__V, __v16hi);
 }
 
 /// Zero-extends bytes from the lower half of the 128-bit integer vector in
@@ -1547,7 +1503,10 @@ _mm256_cvtepu8_epi16(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepu8_epi32(__m128i __V) {
-  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
+  return (__m256i) __builtin_convertvector(
+      __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6,
+                              7),
+      __v8si);
 }
 
 /// Zero-extends the first four bytes from the 128-bit integer vector in
@@ -1571,7 +1530,8 @@ _mm256_cvtepu8_epi32(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepu8_epi64(__m128i __V) {
-  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
+  return (__m256i) __builtin_convertvector(
+      __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
 }
 
 /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
@@ -1596,7 +1556,7 @@ _mm256_cvtepu8_epi64(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepu16_epi32(__m128i __V) {
-  return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
+  return (__m256i) __builtin_convertvector((__v8hu)__V, __v8si);
 }
 
 /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
@@ -1620,7 +1580,8 @@ _mm256_cvtepu16_epi32(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepu16_epi64(__m128i __V) {
-  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
+  return (__m256i) __builtin_convertvector(
+      __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
 }
 
 /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
@@ -1644,7 +1605,7 @@ _mm256_cvtepu16_epi64(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepu32_epi64(__m128i __V) {
-  return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
+  return (__m256i) __builtin_convertvector((__v4su)__V, __v4di);
 }
 
 /// Multiplies signed 32-bit integers from even-numbered elements of two
@@ -1694,8 +1655,7 @@ _mm256_mul_epi32(__m256i __a, __m256i __b) {
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
-{
+_mm256_mulhrs_epi16(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -1713,8 +1673,7 @@ _mm256_mulhrs_epi16(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the products.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_mulhi_epu16(__m256i __a, __m256i __b)
-{
+_mm256_mulhi_epu16(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_pmulhuw256((__v16hu)__a, (__v16hu)__b);
 }
 
@@ -1732,8 +1691,7 @@ _mm256_mulhi_epu16(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the products.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_mulhi_epi16(__m256i __a, __m256i __b)
-{
+_mm256_mulhi_epi16(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -1751,8 +1709,7 @@ _mm256_mulhi_epi16(__m256i __a, __m256i __b)
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the products.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_mullo_epi16(__m256i __a, __m256i __b)
-{
+_mm256_mullo_epi16(__m256i __a, __m256i __b) {
   return (__m256i)((__v16hu)__a * (__v16hu)__b);
 }
 
@@ -1769,9 +1726,8 @@ _mm256_mullo_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the products.
-static __inline__  __m256i __DEFAULT_FN_ATTRS256
-_mm256_mullo_epi32 (__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mullo_epi32(__m256i __a, __m256i __b) {
   return (__m256i)((__v8su)__a * (__v8su)__b);
 }
 
@@ -1813,8 +1769,7 @@ _mm256_mul_epu32(__m256i __a, __m256i __b) {
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_or_si256(__m256i __a, __m256i __b)
-{
+_mm256_or_si256(__m256i __a, __m256i __b) {
   return (__m256i)((__v4du)__a | (__v4du)__b);
 }
 
@@ -1857,9 +1812,8 @@ _mm256_or_si256(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sad_epu8(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sad_epu8(__m256i __a,
+                                                                __m256i __b) {
   return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
 }
 
@@ -1897,8 +1851,7 @@ _mm256_sad_epu8(__m256i __a, __m256i __b)
 ///    to copy to the result byte.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shuffle_epi8(__m256i __a, __m256i __b)
-{
+_mm256_shuffle_epi8(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
 }
 
@@ -1932,7 +1885,7 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b)
 ///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
 ///    forth.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-#define _mm256_shuffle_epi32(a, imm) \
+#define _mm256_shuffle_epi32(a, imm)                                           \
   ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
 
 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
@@ -1968,7 +1921,7 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b)
 ///    result, \a imm[3:2] specifies the index for elements 5 and 9, and so
 ///    forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-#define _mm256_shufflehi_epi16(a, imm) \
+#define _mm256_shufflehi_epi16(a, imm)                                         \
   ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
 
 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
@@ -2005,7 +1958,7 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b)
 ///    result, \a imm[3:2] specifies the index for elements 1 and 9, and so
 ///    forth.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-#define _mm256_shufflelo_epi16(a, imm) \
+#define _mm256_shufflelo_epi16(a, imm)                                         \
   ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
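
For these immediate-controlled shuffles the same 8-bit selector is applied to
both 128-bit lanes. A rough example (illustrative; assumes an AVX2 target):

  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    __m256i v = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    /* _MM_SHUFFLE(0, 1, 2, 3) == 0x1B: reverse the four 32-bit elements
       within each 128-bit lane. */
    __m256i r = _mm256_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 8; ++i)
      printf("%d ", out[i]); /* 3 2 1 0 7 6 5 4 */
    printf("\n");
    return 0;
  }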
 
 /// Sets each byte of the result to the corresponding byte of the 256-bit
@@ -2023,10 +1976,9 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sign_epi8(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi8(__m256i __a,
+                                                                 __m256i __b) {
+  return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
 }
 
 /// Sets each element of the result to the corresponding element of the
@@ -2044,10 +1996,9 @@ _mm256_sign_epi8(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sign_epi16(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi16(__m256i __a,
+                                                                  __m256i __b) {
+  return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Sets each element of the result to the corresponding element of the
@@ -2065,10 +2016,9 @@ _mm256_sign_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32].
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sign_epi32(__m256i __a, __m256i __b)
-{
-    return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi32(__m256i __a,
+                                                                  __m256i __b) {
+  return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
 }
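
A quick sketch of the psign family (illustrative; assumes an AVX2 target).
Each result element keeps, negates, or zeroes the corresponding element of
the first operand according to the sign of the second:

  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    __m256i vals  = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
    __m256i signs = _mm256_setr_epi32(1, -1, 0, 2, -5, 0, 7, -7);
    __m256i r = _mm256_sign_epi32(vals, signs);
    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 8; ++i)
      printf("%d ", out[i]); /* 10 -20 0 40 -50 0 70 -80 */
    printf("\n");
    return 0;
  }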
 
 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
@@ -2088,8 +2038,9 @@ _mm256_sign_epi32(__m256i __a, __m256i __b)
 /// \param imm
 ///     An unsigned immediate value specifying the shift count (in bytes).
 /// \returns A 256-bit integer vector containing the result.
-#define _mm256_slli_si256(a, imm) \
-  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
+#define _mm256_slli_si256(a, imm)                                              \
+  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),          \
+                                                (int)(imm)))
 
 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
 ///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
@@ -2108,8 +2059,9 @@ _mm256_sign_epi32(__m256i __a, __m256i __b)
 /// \param imm
 ///    An unsigned immediate value specifying the shift count (in bytes).
 /// \returns A 256-bit integer vector containing the result.
-#define _mm256_bslli_epi128(a, imm) \
-  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
+#define _mm256_bslli_epi128(a, imm)                                            \
+  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),          \
+                                                (int)(imm)))
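
The byte shifts operate on each 128-bit half independently. A minimal sketch
(illustrative; assumes an AVX2 target and made-up data):

  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    __m256i v = _mm256_setr_epi8(
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32);
    /* Shift each half left by 4 bytes, filling with zeros: byte 4 of the
       low half becomes 1 and byte 4 of the high half becomes 17. */
    __m256i r = _mm256_slli_si256(v, 4);
    unsigned char out[32];
    _mm256_storeu_si256((__m256i *)out, r);
    printf("%d %d %d\n", out[0], out[4], out[20]); /* 0 1 17 */
    return 0;
  }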
 
 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
 ///    left by \a __count bits, shifting in zero bits, and returns the result.
@@ -2124,9 +2076,8 @@ _mm256_sign_epi32(__m256i __a, __m256i __b)
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_slli_epi16(__m256i __a, int __count)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi16(__m256i __a,
+                                                                  int __count) {
   return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
 }
 
@@ -2146,8 +2097,7 @@ _mm256_slli_epi16(__m256i __a, int __count)
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sll_epi16(__m256i __a, __m128i __count)
-{
+_mm256_sll_epi16(__m256i __a, __m128i __count) {
   return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
 }
 
@@ -2164,9 +2114,8 @@ _mm256_sll_epi16(__m256i __a, __m128i __count)
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_slli_epi32(__m256i __a, int __count)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi32(__m256i __a,
+                                                                  int __count) {
   return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
 }
 
@@ -2186,8 +2135,7 @@ _mm256_slli_epi32(__m256i __a, int __count)
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sll_epi32(__m256i __a, __m128i __count)
-{
+_mm256_sll_epi32(__m256i __a, __m128i __count) {
   return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
 }
 
@@ -2204,9 +2152,8 @@ _mm256_sll_epi32(__m256i __a, __m128i __count)
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_slli_epi64(__m256i __a, int __count)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi64(__m256i __a,
+                                                                  int __count) {
   return __builtin_ia32_psllqi256((__v4di)__a, __count);
 }
 
@@ -2226,8 +2173,7 @@ _mm256_slli_epi64(__m256i __a, int __count)
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sll_epi64(__m256i __a, __m128i __count)
-{
+_mm256_sll_epi64(__m256i __a, __m128i __count) {
   return __builtin_ia32_psllq256((__v4di)__a, __count);
 }
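
The immediate-count (slli) and vector-count (sll) forms compute the same
logical left shift; only the way the count is supplied differs. A rough
sketch (illustrative; assumes an AVX2 target):

  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    __m256i v = _mm256_set1_epi32(3);
    __m256i a = _mm256_slli_epi32(v, 4);
    /* The count vector uses only its low 64 bits. */
    __m256i b = _mm256_sll_epi32(v, _mm_cvtsi32_si128(4));
    int out_a[8], out_b[8];
    _mm256_storeu_si256((__m256i *)out_a, a);
    _mm256_storeu_si256((__m256i *)out_b, b);
    printf("%d %d\n", out_a[0], out_b[0]); /* 48 48 */
    return 0;
  }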
 
@@ -2245,9 +2191,8 @@ _mm256_sll_epi64(__m256i __a, __m128i __count)
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srai_epi16(__m256i __a, int __count)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi16(__m256i __a,
+                                                                  int __count) {
   return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
 }
 
@@ -2268,8 +2213,7 @@ _mm256_srai_epi16(__m256i __a, int __count)
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sra_epi16(__m256i __a, __m128i __count)
-{
+_mm256_sra_epi16(__m256i __a, __m128i __count) {
   return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
 }
 
@@ -2287,9 +2231,8 @@ _mm256_sra_epi16(__m256i __a, __m128i __count)
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srai_epi32(__m256i __a, int __count)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi32(__m256i __a,
+                                                                  int __count) {
   return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
 }
 
@@ -2310,8 +2253,7 @@ _mm256_srai_epi32(__m256i __a, int __count)
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sra_epi32(__m256i __a, __m128i __count)
-{
+_mm256_sra_epi32(__m256i __a, __m128i __count) {
   return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
 }
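
The difference between the arithmetic shifts here and the logical shifts
further down is what gets shifted in at the top: sign bits versus zeros. A
small sketch (illustrative; assumes an AVX2 target):

  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    __m256i v = _mm256_set1_epi32(-64);  /* 0xFFFFFFC0 */
    __m256i a = _mm256_srai_epi32(v, 2); /* arithmetic: stays negative */
    __m256i l = _mm256_srli_epi32(v, 2); /* logical: zero-filled */
    int out_a[8];
    unsigned out_l[8];
    _mm256_storeu_si256((__m256i *)out_a, a);
    _mm256_storeu_si256((__m256i *)out_l, l);
    printf("%d %#x\n", out_a[0], out_l[0]); /* -16 0x3ffffff0 */
    return 0;
  }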
 
@@ -2332,7 +2274,7 @@ _mm256_sra_epi32(__m256i __a, __m128i __count)
 /// \param imm
 ///    An unsigned immediate value specifying the shift count (in bytes).
 /// \returns A 256-bit integer vector containing the result.
-#define _mm256_srli_si256(a, imm) \
+#define _mm256_srli_si256(a, imm)                                              \
   ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
 
 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
@@ -2352,7 +2294,7 @@ _mm256_sra_epi32(__m256i __a, __m128i __count)
 /// \param imm
 ///     An unsigned immediate value specifying the shift count (in bytes).
 /// \returns A 256-bit integer vector containing the result.
-#define _mm256_bsrli_epi128(a, imm) \
+#define _mm256_bsrli_epi128(a, imm)                                            \
   ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
 
 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
@@ -2368,9 +2310,8 @@ _mm256_sra_epi32(__m256i __a, __m128i __count)
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srli_epi16(__m256i __a, int __count)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi16(__m256i __a,
+                                                                  int __count) {
   return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
 }
 
@@ -2390,8 +2331,7 @@ _mm256_srli_epi16(__m256i __a, int __count)
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srl_epi16(__m256i __a, __m128i __count)
-{
+_mm256_srl_epi16(__m256i __a, __m128i __count) {
   return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
 }
 
@@ -2408,9 +2348,8 @@ _mm256_srl_epi16(__m256i __a, __m128i __count)
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srli_epi32(__m256i __a, int __count)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi32(__m256i __a,
+                                                                  int __count) {
   return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
 }
 
@@ -2430,8 +2369,7 @@ _mm256_srli_epi32(__m256i __a, int __count)
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srl_epi32(__m256i __a, __m128i __count)
-{
+_mm256_srl_epi32(__m256i __a, __m128i __count) {
   return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
 }
 
@@ -2448,9 +2386,8 @@ _mm256_srl_epi32(__m256i __a, __m128i __count)
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srli_epi64(__m256i __a, int __count)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi64(__m256i __a,
+                                                                  int __count) {
   return __builtin_ia32_psrlqi256((__v4di)__a, __count);
 }
 
@@ -2470,8 +2407,7 @@ _mm256_srli_epi64(__m256i __a, int __count)
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srl_epi64(__m256i __a, __m128i __count)
-{
+_mm256_srl_epi64(__m256i __a, __m128i __count) {
   return __builtin_ia32_psrlq256((__v4di)__a, __count);
 }
 
@@ -2496,9 +2432,8 @@ _mm256_srl_epi64(__m256i __a, __m128i __count)
 /// \param __b
 ///    A 256-bit integer vector containing the subtrahends.
 /// \returns A 256-bit integer vector containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sub_epi8(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi8(__m256i __a,
+                                                                __m256i __b) {
   return (__m256i)((__v32qu)__a - (__v32qu)__b);
 }
 
@@ -2523,9 +2458,8 @@ _mm256_sub_epi8(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
 /// \returns A 256-bit vector of [16 x i16] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sub_epi16(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi16(__m256i __a,
+                                                                 __m256i __b) {
   return (__m256i)((__v16hu)__a - (__v16hu)__b);
 }
 
@@ -2549,9 +2483,8 @@ _mm256_sub_epi16(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing the subtrahends.
 /// \returns A 256-bit vector of [8 x i32] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sub_epi32(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi32(__m256i __a,
+                                                                 __m256i __b) {
   return (__m256i)((__v8su)__a - (__v8su)__b);
 }
 
@@ -2575,9 +2508,8 @@ _mm256_sub_epi32(__m256i __a, __m256i __b)
 /// \param __b
 ///    A 256-bit vector of [4 x i64] containing the subtrahends.
 /// \returns A 256-bit vector of [4 x i64] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sub_epi64(__m256i __a, __m256i __b)
-{
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi64(__m256i __a,
+                                                                 __m256i __b) {
   return (__m256i)((__v4du)__a - (__v4du)__b);
 }
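
These subtractions wrap on overflow rather than saturate (contrast the subs
forms in the next hunk). A tiny sketch (illustrative; assumes an AVX2
target):

  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    __m256i a = _mm256_set1_epi8(100);
    __m256i b = _mm256_set1_epi8(-100);
    /* 100 - (-100) = 200, which wraps to -56 in a signed 8-bit lane;
       _mm256_subs_epi8 would saturate to 127 instead. */
    __m256i d = _mm256_sub_epi8(a, b);
    signed char out[32];
    _mm256_storeu_si256((__m256i *)out, d);
    printf("%d\n", out[0]); /* -56 */
    return 0;
  }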
 
@@ -2712,7 +2644,11 @@ _mm256_subs_epu16(__m256i __a, __m256i __b) {
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpackhi_epi8(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
+  return (__m256i)__builtin_shufflevector(
+      (__v32qi)__a, (__v32qi)__b, 8, 32 + 8, 9, 32 + 9, 10, 32 + 10, 11,
+      32 + 11, 12, 32 + 12, 13, 32 + 13, 14, 32 + 14, 15, 32 + 15, 24, 32 + 24,
+      25, 32 + 25, 26, 32 + 26, 27, 32 + 27, 28, 32 + 28, 29, 32 + 29, 30,
+      32 + 30, 31, 32 + 31);
 }
 
 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
@@ -2746,7 +2682,9 @@ _mm256_unpackhi_epi8(__m256i __a, __m256i __b) {
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpackhi_epi16(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
+  return (__m256i)__builtin_shufflevector(
+      (__v16hi)__a, (__v16hi)__b, 4, 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7,
+      12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
 }
 
 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
@@ -2779,7 +2717,8 @@ _mm256_unpackhi_epi16(__m256i __a, __m256i __b) {
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpackhi_epi32(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8 + 2, 3,
+                                          8 + 3, 6, 8 + 6, 7, 8 + 7);
 }
 
 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
@@ -2808,7 +2747,8 @@ _mm256_unpackhi_epi32(__m256i __a, __m256i __b) {
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpackhi_epi64(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
+  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4 + 1, 3,
+                                          4 + 3);
 }
 
 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
@@ -2841,7 +2781,10 @@ _mm256_unpackhi_epi64(__m256i __a, __m256i __b) {
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpacklo_epi8(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
+  return (__m256i)__builtin_shufflevector(
+      (__v32qi)__a, (__v32qi)__b, 0, 32 + 0, 1, 32 + 1, 2, 32 + 2, 3, 32 + 3, 4,
+      32 + 4, 5, 32 + 5, 6, 32 + 6, 7, 32 + 7, 16, 32 + 16, 17, 32 + 17, 18,
+      32 + 18, 19, 32 + 19, 20, 32 + 20, 21, 32 + 21, 22, 32 + 22, 23, 32 + 23);
 }
 
 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
@@ -2875,7 +2818,9 @@ _mm256_unpacklo_epi8(__m256i __a, __m256i __b) {
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpacklo_epi16(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
+  return (__m256i)__builtin_shufflevector(
+      (__v16hi)__a, (__v16hi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 8,
+      16 + 8, 9, 16 + 9, 10, 16 + 10, 11, 16 + 11);
 }
 
 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
@@ -2908,7 +2853,8 @@ _mm256_unpacklo_epi16(__m256i __a, __m256i __b) {
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpacklo_epi32(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8 + 0, 1,
+                                          8 + 1, 4, 8 + 4, 5, 8 + 5);
 }
 
 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
@@ -2937,7 +2883,8 @@ _mm256_unpacklo_epi32(__m256i __a, __m256i __b) {
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpacklo_epi64(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
+  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4 + 0, 2,
+                                          4 + 2);
 }
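
The unpack family interleaves within each 128-bit half, which is why the
shuffle indices above come in two groups. A short sketch (illustrative;
assumes an AVX2 target):

  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    __m256i a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i b = _mm256_setr_epi32(10, 11, 12, 13, 14, 15, 16, 17);
    /* Low halves interleave to 0,10,1,11; high halves to 4,14,5,15. */
    __m256i lo = _mm256_unpacklo_epi32(a, b);
    int out[8];
    _mm256_storeu_si256((__m256i *)out, lo);
    for (int i = 0; i < 8; ++i)
      printf("%d ", out[i]); /* 0 10 1 11 4 14 5 15 */
    printf("\n");
    return 0;
  }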
 
 /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
@@ -2953,8 +2900,7 @@ _mm256_unpacklo_epi64(__m256i __a, __m256i __b) {
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_xor_si256(__m256i __a, __m256i __b)
-{
+_mm256_xor_si256(__m256i __a, __m256i __b) {
   return (__m256i)((__v4du)__a ^ (__v4du)__b);
 }
 
@@ -2970,8 +2916,7 @@ _mm256_xor_si256(__m256i __a, __m256i __b)
 ///    A pointer to the 32-byte aligned memory containing the vector to load.
 /// \returns A 256-bit integer vector loaded from memory.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_stream_load_si256(const void *__V)
-{
+_mm256_stream_load_si256(const void *__V) {
   typedef __v4di __v4di_aligned __attribute__((aligned(32)));
   return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
 }
@@ -3021,7 +2966,8 @@ _mm_broadcastsd_pd(__m128d __a) {
 /// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_broadcastss_ps(__m128 __X) {
-  return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+  return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0,
+                                         0, 0, 0, 0);
 }
 
 /// Broadcasts the 64-bit floating-point value from the low element of the
@@ -3090,8 +3036,8 @@ _mm256_broadcastsi128_si256(__m128i __X) {
 ///    corresponds to the index of a copied value. When a mask bit is 0, the
 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
 /// \returns A 128-bit vector of [4 x i32] containing the result.
-#define _mm_blend_epi32(V1, V2, M) \
-  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
+#define _mm_blend_epi32(V1, V2, M)                                             \
+  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1),                   \
                                       (__v4si)(__m128i)(V2), (int)(M)))
 
 /// Merges 32-bit integer elements from either of the two 256-bit vectors of
@@ -3127,8 +3073,8 @@ _mm256_broadcastsi128_si256(__m128i __X) {
 ///    corresponds to the index of a copied value. When a mask bit is 0, the
 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-#define _mm256_blend_epi32(V1, V2, M) \
-  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
+#define _mm256_blend_epi32(V1, V2, M)                                          \
+  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1),                   \
                                       (__v8si)(__m256i)(V2), (int)(M)))
 
 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
@@ -3143,7 +3089,9 @@ _mm256_broadcastsi128_si256(__m128i __X) {
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_broadcastb_epi8(__m128i __X) {
-  return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  return (__m256i)__builtin_shufflevector(
+      (__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
@@ -3158,7 +3106,8 @@ _mm256_broadcastb_epi8(__m128i __X) {
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_broadcastw_epi16(__m128i __X) {
-  return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0,
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
@@ -3173,7 +3122,8 @@ _mm256_broadcastw_epi16(__m128i __X) {
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_broadcastd_epi32(__m128i __X) {
-  return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+  return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0,
+                                          0, 0, 0, 0);
 }
 
 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
@@ -3203,7 +3153,9 @@ _mm256_broadcastq_epi64(__m128i __X) {
 /// \returns A 128-bit integer vector containing the result.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_broadcastb_epi8(__m128i __X) {
-  return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0,
+                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                                          0);
 }
 
 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in
@@ -3218,7 +3170,8 @@ _mm_broadcastb_epi8(__m128i __X) {
 /// \returns A 128-bit vector of [8 x i16] containing the result.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_broadcastw_epi16(__m128i __X) {
-  return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
+  return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0,
+                                          0, 0, 0, 0);
 }
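
A minimal broadcast sketch (illustrative; assumes an AVX2 target): the low
element of a 128-bit vector is replicated across every lane of the result.

  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    __m128i x = _mm_setr_epi32(42, 1, 2, 3);
    __m256i r = _mm256_broadcastd_epi32(x);
    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    printf("%d %d\n", out[0], out[7]); /* 42 42 */
    return 0;
  }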
 
 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
@@ -3274,8 +3227,7 @@ _mm_broadcastq_epi64(__m128i __X) {
 ///    \a __a.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
-{
+_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) {
   return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
 }
 
@@ -3306,7 +3258,7 @@ _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
 ///    \a M[1:0] specifies the index in \a a for element 0 of the result,
 ///    \a M[3:2] specifies the index for element 1, and so forth.
 /// \returns A 256-bit vector of [4 x double] containing the result.
-#define _mm256_permute4x64_pd(V, M) \
+#define _mm256_permute4x64_pd(V, M)                                            \
   ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
 
 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of
@@ -3332,8 +3284,7 @@ _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
 ///    \a __a.
 /// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
-{
+_mm256_permutevar8x32_ps(__m256 __a, __m256i __b) {
   return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
 }
 
@@ -3364,7 +3315,7 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
 ///    \a M[1:0] specifies the index in \a a for element 0 of the result,
 ///    \a M[3:2] specifies the index for element 1, and so forth.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
-#define _mm256_permute4x64_epi64(V, M) \
+#define _mm256_permute4x64_epi64(V, M)                                         \
   ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
 
 /// Sets each half of the 256-bit result either to zero or to one of the
@@ -3410,7 +3361,7 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
 ///    2: the lower half of \a V2 \n
 ///    3: the upper half of \a V2
 /// \returns A 256-bit integer vector containing the result.
-#define _mm256_permute2x128_si256(V1, V2, M) \
+#define _mm256_permute2x128_si256(V1, V2, M)                                   \
   ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
 
 /// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
@@ -3430,7 +3381,7 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
 /// \param M
 ///    An immediate value specifying which half of \a V to extract.
 /// \returns A 128-bit integer vector containing the result.
-#define _mm256_extracti128_si256(V, M) \
+#define _mm256_extracti128_si256(V, M)                                         \
   ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
 
 /// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
@@ -3453,8 +3404,8 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
 /// \param M
 ///    An immediate value specifying where to put \a V2 in the result.
 /// \returns A 256-bit integer vector containing the result.
-#define _mm256_inserti128_si256(V1, V2, M) \
-  ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
+#define _mm256_inserti128_si256(V1, V2, M)                                     \
+  ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1),                \
                                          (__v2di)(__m128i)(V2), (int)(M)))
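
Unlike the in-lane shuffles earlier, these permutes can move elements across
the two 128-bit halves. A rough sketch combining a variable permute with a
half extraction (illustrative; assumes an AVX2 target):

  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    __m256i v   = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
    __m256i idx = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0);
    __m256i r = _mm256_permutevar8x32_epi32(v, idx); /* v reversed */
    __m128i hi = _mm256_extracti128_si256(r, 1);     /* upper half */
    int out[4];
    _mm_storeu_si128((__m128i *)out, hi);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); /* 3 2 1 0 */
    return 0;
  }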
 
 /// Conditionally loads eight 32-bit integer elements from memory \a __X, if
@@ -3484,8 +3435,7 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
 /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
 ///    elements.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskload_epi32(int const *__X, __m256i __M)
-{
+_mm256_maskload_epi32(int const *__X, __m256i __M) {
   return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
 }
 
@@ -3516,8 +3466,7 @@ _mm256_maskload_epi32(int const *__X, __m256i __M)
 /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
 ///    elements.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskload_epi64(long long const *__X, __m256i __M)
-{
+_mm256_maskload_epi64(long long const *__X, __m256i __M) {
   return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
 }
 
@@ -3548,8 +3497,7 @@ _mm256_maskload_epi64(long long const *__X, __m256i __M)
 /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
 ///    elements.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskload_epi32(int const *__X, __m128i __M)
-{
+_mm_maskload_epi32(int const *__X, __m128i __M) {
   return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
 }
 
@@ -3580,8 +3528,7 @@ _mm_maskload_epi32(int const *__X, __m128i __M)
 /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
 ///    elements.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskload_epi64(long long const *__X, __m128i __M)
-{
+_mm_maskload_epi64(long long const *__X, __m128i __M) {
   return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
 }
 
@@ -3610,8 +3557,7 @@ _mm_maskload_epi64(long long const *__X, __m128i __M)
 /// \param __Y
 ///    A 256-bit vector of [8 x i32] containing the values to store.
 static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
-{
+_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) {
   __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
 }
 
@@ -3640,8 +3586,7 @@ _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
 /// \param __Y
 ///    A 256-bit vector of [4 x i64] containing the values to store.
 static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
-{
+_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) {
   __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
 }
 
@@ -3669,9 +3614,9 @@ _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
 ///    A 128-bit vector of [4 x i32] containing the mask bits.
 /// \param __Y
 ///    A 128-bit vector of [4 x i32] containing the values to store.
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
-{
+static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi32(int *__X,
+                                                                 __m128i __M,
+                                                                 __m128i __Y) {
   __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
 }
 
@@ -3699,10 +3644,10 @@ _mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
 ///    A 128-bit vector of [2 x i64] containing the mask bits.
 /// \param __Y
 ///    A 128-bit vector of [2 x i64] containing the values to store.
-static __inline__ void __DEFAULT_FN_ATTRS128
-_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
-{
-  __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y);
+static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi64(long long *__X,
+                                                                 __m128i __M,
+                                                                 __m128i __Y) {
+  __builtin_ia32_maskstoreq((__v2di *)__X, (__v2di)__M, (__v2di)__Y);
 }
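
The masked loads and stores only touch lanes whose mask element has its most
significant bit set; masked-off loaded lanes read as zero and masked-off
stored lanes are left untouched in memory. A small sketch (illustrative;
assumes an AVX2 target and made-up arrays):

  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    int src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    int dst[8] = {0};
    __m256i mask = _mm256_setr_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
    __m256i v = _mm256_maskload_epi32(src, mask); /* odd lanes become 0 */
    _mm256_maskstore_epi32(dst, mask, v);         /* odd lanes untouched */
    for (int i = 0; i < 8; ++i)
      printf("%d ", dst[i]); /* 1 0 3 0 5 0 7 0 */
    printf("\n");
    return 0;
  }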
 
 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
@@ -3722,8 +3667,7 @@ _mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
 ///    bits).
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_sllv_epi32(__m256i __X, __m256i __Y)
-{
+_mm256_sllv_epi32(__m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
 }
 
@@ -3744,8 +3688,7 @@ _mm256_sllv_epi32(__m256i __X, __m256i __Y)
 ///    bits).
 /// \returns A 128-bit vector of [4 x i32] containing the result.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_sllv_epi32(__m128i __X, __m128i __Y)
-{
+_mm_sllv_epi32(__m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
 }
 
@@ -3766,8 +3709,7 @@ _mm_sllv_epi32(__m128i __X, __m128i __Y)
 ///    bits).
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_sllv_epi64(__m256i __X, __m256i __Y)
-{
+_mm256_sllv_epi64(__m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
 }
 
@@ -3788,8 +3730,7 @@ _mm256_sllv_epi64(__m256i __X, __m256i __Y)
 ///    bits).
 /// \returns A 128-bit vector of [2 x i64] containing the result.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_sllv_epi64(__m128i __X, __m128i __Y)
-{
+_mm_sllv_epi64(__m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
 }
 
@@ -3811,8 +3752,7 @@ _mm_sllv_epi64(__m128i __X, __m128i __Y)
 ///    bits).
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_srav_epi32(__m256i __X, __m256i __Y)
-{
+_mm256_srav_epi32(__m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
 }
 
@@ -3834,8 +3774,7 @@ _mm256_srav_epi32(__m256i __X, __m256i __Y)
 ///    bits).
 /// \returns A 128-bit vector of [4 x i32] containing the result.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_srav_epi32(__m128i __X, __m128i __Y)
-{
+_mm_srav_epi32(__m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
 }
 
@@ -3856,8 +3795,7 @@ _mm_srav_epi32(__m128i __X, __m128i __Y)
 ///    bits).
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_srlv_epi32(__m256i __X, __m256i __Y)
-{
+_mm256_srlv_epi32(__m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
 }
 
@@ -3878,8 +3816,7 @@ _mm256_srlv_epi32(__m256i __X, __m256i __Y)
 ///    bits).
 /// \returns A 128-bit vector of [4 x i32] containing the result.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_srlv_epi32(__m128i __X, __m128i __Y)
-{
+_mm_srlv_epi32(__m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
 }
 
@@ -3900,8 +3837,7 @@ _mm_srlv_epi32(__m128i __X, __m128i __Y)
 ///    bits).
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_srlv_epi64(__m256i __X, __m256i __Y)
-{
+_mm256_srlv_epi64(__m256i __X, __m256i __Y) {
   return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
 }
 
@@ -3922,8 +3858,7 @@ _mm256_srlv_epi64(__m256i __X, __m256i __Y)
 ///    bits).
 /// \returns A 128-bit vector of [2 x i64] containing the result.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_srlv_epi64(__m128i __X, __m128i __Y)
-{
+_mm_srlv_epi64(__m128i __X, __m128i __Y) {
   return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
 }
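
The variable shifts take a per-element count rather than a single count. A
minimal sketch (illustrative; assumes an AVX2 target); counts of 32 or more
produce 0 for the logical forms:

  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    __m256i v = _mm256_set1_epi32(16);
    __m256i counts = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 32, 33);
    __m256i r = _mm256_srlv_epi32(v, counts);
    int out[8];
    _mm256_storeu_si256((__m256i *)out, r);
    for (int i = 0; i < 8; ++i)
      printf("%d ", out[i]); /* 16 8 4 2 1 0 0 0 */
    printf("\n");
    return 0;
  }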
 
@@ -3970,11 +3905,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
-#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
-  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
-                                      (double const *)(m), \
-                                      (__v4si)(__m128i)(i), \
-                                      (__v2df)(__m128d)(mask), (s)))
+#define _mm_mask_i32gather_pd(a, m, i, mask, s)                                \
+  ((__m128d)__builtin_ia32_gatherd_pd(                                         \
+      (__v2df)(__m128i)(a), (double const *)(m), (__v4si)(__m128i)(i),         \
+      (__v2df)(__m128d)(mask), (s)))
 
 /// Conditionally gathers four 64-bit floating-point values, either from the
 ///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
@@ -4018,11 +3952,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
-#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
-  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
-                                         (double const *)(m), \
-                                         (__v4si)(__m128i)(i), \
-                                         (__v4df)(__m256d)(mask), (s)))
+#define _mm256_mask_i32gather_pd(a, m, i, mask, s)                             \
+  ((__m256d)__builtin_ia32_gatherd_pd256(                                      \
+      (__v4df)(__m256d)(a), (double const *)(m), (__v4si)(__m128i)(i),         \
+      (__v4df)(__m256d)(mask), (s)))
 
 /// Conditionally gathers two 64-bit floating-point values, either from the
 ///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
@@ -4066,11 +3999,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
-#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
-  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
-                                      (double const *)(m), \
-                                      (__v2di)(__m128i)(i), \
-                                      (__v2df)(__m128d)(mask), (s)))
+#define _mm_mask_i64gather_pd(a, m, i, mask, s)                                \
+  ((__m128d)__builtin_ia32_gatherq_pd(                                         \
+      (__v2df)(__m128d)(a), (double const *)(m), (__v2di)(__m128i)(i),         \
+      (__v2df)(__m128d)(mask), (s)))
 
 /// Conditionally gathers four 64-bit floating-point values, either from the
 ///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
@@ -4114,11 +4046,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
-#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
-  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
-                                         (double const *)(m), \
-                                         (__v4di)(__m256i)(i), \
-                                         (__v4df)(__m256d)(mask), (s)))
+#define _mm256_mask_i64gather_pd(a, m, i, mask, s)                             \
+  ((__m256d)__builtin_ia32_gatherq_pd256(                                      \
+      (__v4df)(__m256d)(a), (double const *)(m), (__v4di)(__m256i)(i),         \
+      (__v4df)(__m256d)(mask), (s)))
 
 /// Conditionally gathers four 32-bit floating-point values, either from the
 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
@@ -4162,10 +4093,9 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
-  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
-                                     (float const *)(m), \
-                                     (__v4si)(__m128i)(i), \
+#define _mm_mask_i32gather_ps(a, m, i, mask, s)                                \
+  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), (float const *)(m),  \
+                                     (__v4si)(__m128i)(i),                     \
                                      (__v4sf)(__m128)(mask), (s)))
 
 /// Conditionally gathers eight 32-bit floating-point values, either from the
@@ -4210,11 +4140,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
-#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
-  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
-                                        (float const *)(m), \
-                                        (__v8si)(__m256i)(i), \
-                                        (__v8sf)(__m256)(mask), (s)))
+#define _mm256_mask_i32gather_ps(a, m, i, mask, s)                             \
+  ((__m256)__builtin_ia32_gatherd_ps256(                                       \
+      (__v8sf)(__m256)(a), (float const *)(m), (__v8si)(__m256i)(i),           \
+      (__v8sf)(__m256)(mask), (s)))
 
 /// Conditionally gathers two 32-bit floating-point values, either from the
 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
@@ -4261,10 +4190,9 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
-  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
-                                     (float const *)(m), \
-                                     (__v2di)(__m128i)(i), \
+#define _mm_mask_i64gather_ps(a, m, i, mask, s)                                \
+  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), (float const *)(m),  \
+                                     (__v2di)(__m128i)(i),                     \
                                      (__v4sf)(__m128)(mask), (s)))
 
 /// Conditionally gathers four 32-bit floating-point values, either from the
@@ -4309,11 +4237,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
-  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
-                                        (float const *)(m), \
-                                        (__v4di)(__m256i)(i), \
-                                        (__v4sf)(__m128)(mask), (s)))
+#define _mm256_mask_i64gather_ps(a, m, i, mask, s)                             \
+  ((__m128)__builtin_ia32_gatherq_ps256(                                       \
+      (__v4sf)(__m128)(a), (float const *)(m), (__v4di)(__m256i)(i),           \
+      (__v4sf)(__m128)(mask), (s)))
 
 /// Conditionally gathers four 32-bit integer values, either from the
 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
@@ -4357,10 +4284,9 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
-  ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
-                                     (int const *)(m), \
-                                     (__v4si)(__m128i)(i), \
+#define _mm_mask_i32gather_epi32(a, m, i, mask, s)                             \
+  ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), (int const *)(m),   \
+                                     (__v4si)(__m128i)(i),                     \
                                      (__v4si)(__m128i)(mask), (s)))
 
 /// Conditionally gathers eight 32-bit integer values, either from the
@@ -4405,11 +4331,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
-#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
-  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
-                                        (int const *)(m), \
-                                        (__v8si)(__m256i)(i), \
-                                        (__v8si)(__m256i)(mask), (s)))
+#define _mm256_mask_i32gather_epi32(a, m, i, mask, s)                          \
+  ((__m256i)__builtin_ia32_gatherd_d256(                                       \
+      (__v8si)(__m256i)(a), (int const *)(m), (__v8si)(__m256i)(i),            \
+      (__v8si)(__m256i)(mask), (s)))
 
 /// Conditionally gathers two 32-bit integer values, either from the
 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
@@ -4456,10 +4381,9 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
-  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
-                                     (int const *)(m), \
-                                     (__v2di)(__m128i)(i), \
+#define _mm_mask_i64gather_epi32(a, m, i, mask, s)                             \
+  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), (int const *)(m),   \
+                                     (__v2di)(__m128i)(i),                     \
                                      (__v4si)(__m128i)(mask), (s)))
 
 /// Conditionally gathers four 32-bit integer values, either from the
@@ -4504,11 +4428,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
-  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
-                                        (int const *)(m), \
-                                        (__v4di)(__m256i)(i), \
-                                        (__v4si)(__m128i)(mask), (s)))
+#define _mm256_mask_i64gather_epi32(a, m, i, mask, s)                          \
+  ((__m128i)__builtin_ia32_gatherq_d256(                                       \
+      (__v4si)(__m128i)(a), (int const *)(m), (__v4di)(__m256i)(i),            \
+      (__v4si)(__m128i)(mask), (s)))
 
 /// Conditionally gathers two 64-bit integer values, either from the
 ///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
@@ -4553,11 +4476,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
-#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
-  ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
-                                     (long long const *)(m), \
-                                     (__v4si)(__m128i)(i), \
-                                     (__v2di)(__m128i)(mask), (s)))
+#define _mm_mask_i32gather_epi64(a, m, i, mask, s)                             \
+  ((__m128i)__builtin_ia32_gatherd_q(                                          \
+      (__v2di)(__m128i)(a), (long long const *)(m), (__v4si)(__m128i)(i),      \
+      (__v2di)(__m128i)(mask), (s)))
 
 /// Conditionally gathers four 64-bit integer values, either from the
 ///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
@@ -4601,11 +4523,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
-#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
-  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
-                                        (long long const *)(m), \
-                                        (__v4si)(__m128i)(i), \
-                                        (__v4di)(__m256i)(mask), (s)))
+#define _mm256_mask_i32gather_epi64(a, m, i, mask, s)                          \
+  ((__m256i)__builtin_ia32_gatherd_q256(                                       \
+      (__v4di)(__m256i)(a), (long long const *)(m), (__v4si)(__m128i)(i),      \
+      (__v4di)(__m256i)(mask), (s)))
 
 /// Conditionally gathers two 64-bit integer values, either from the
 ///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
@@ -4649,11 +4570,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
-#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
-  ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
-                                     (long long const *)(m), \
-                                     (__v2di)(__m128i)(i), \
-                                     (__v2di)(__m128i)(mask), (s)))
+#define _mm_mask_i64gather_epi64(a, m, i, mask, s)                             \
+  ((__m128i)__builtin_ia32_gatherq_q(                                          \
+      (__v2di)(__m128i)(a), (long long const *)(m), (__v2di)(__m128i)(i),      \
+      (__v2di)(__m128i)(mask), (s)))
 
 /// Conditionally gathers four 64-bit integer values, either from the
 ///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
@@ -4697,11 +4617,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
-#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
-  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
-                                        (long long const *)(m), \
-                                        (__v4di)(__m256i)(i), \
-                                        (__v4di)(__m256i)(mask), (s)))
+#define _mm256_mask_i64gather_epi64(a, m, i, mask, s)                          \
+  ((__m256i)__builtin_ia32_gatherq_q256(                                       \
+      (__v4di)(__m256i)(a), (long long const *)(m), (__v4di)(__m256i)(i),      \
+      (__v4di)(__m256i)(mask), (s)))
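
For the masked gathers, a lane is loaded from memory only when the sign bit
of its mask element is set; otherwise it keeps the value from the source
operand. A rough sketch using the 32-bit integer form (illustrative; assumes
an AVX2 target, with made-up table and index data):

  #include <immintrin.h>
  #include <stdio.h>

  int main(void) {
    int table[16];
    for (int i = 0; i < 16; ++i)
      table[i] = 100 + i;
    __m128i src  = _mm_set1_epi32(-1);
    __m128i idx  = _mm_setr_epi32(0, 3, 6, 9);
    __m128i mask = _mm_setr_epi32(-1, -1, 0, -1); /* lane 2 keeps src */
    __m128i r = _mm_mask_i32gather_epi32(src, table, idx, mask, 4);
    int out[4];
    _mm_storeu_si128((__m128i *)out, r);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
    /* 100 103 -1 109 */
    return 0;
  }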
 
 /// Gathers two 64-bit floating-point values from memory \a m using scaled
 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
@@ -4731,13 +4650,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
-#define _mm_i32gather_pd(m, i, s) \
-  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
-                                      (double const *)(m), \
-                                      (__v4si)(__m128i)(i), \
-                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
-                                                           _mm_setzero_pd()), \
-                                      (s)))
+#define _mm_i32gather_pd(m, i, s)                                              \
+  ((__m128d)__builtin_ia32_gatherd_pd(                                         \
+      (__v2df)_mm_undefined_pd(), (double const *)(m), (__v4si)(__m128i)(i),   \
+      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))
 
 /// Gathers four 64-bit floating-point values from memory \a m using scaled
 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
@@ -4766,14 +4682,13 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
-#define _mm256_i32gather_pd(m, i, s) \
-  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
-                                         (double const *)(m), \
-                                         (__v4si)(__m128i)(i), \
-                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
-                                                               _mm256_setzero_pd(), \
-                                                               _CMP_EQ_OQ), \
-                                         (s)))
+#define _mm256_i32gather_pd(m, i, s)                                           \
+  ((__m256d)__builtin_ia32_gatherd_pd256(                                      \
+      (__v4df)_mm256_undefined_pd(), (double const *)(m),                      \
+      (__v4si)(__m128i)(i),                                                    \
+      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(),          \
+                            _CMP_EQ_OQ),                                       \
+      (s)))
 
 /// Gathers two 64-bit floating-point values from memory \a m using scaled
 ///    indexes from the 128-bit vector of [2 x i64] in \a i.
@@ -4802,13 +4717,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
-#define _mm_i64gather_pd(m, i, s) \
-  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
-                                      (double const *)(m), \
-                                      (__v2di)(__m128i)(i), \
-                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
-                                                           _mm_setzero_pd()), \
-                                      (s)))
+#define _mm_i64gather_pd(m, i, s)                                              \
+  ((__m128d)__builtin_ia32_gatherq_pd(                                         \
+      (__v2df)_mm_undefined_pd(), (double const *)(m), (__v2di)(__m128i)(i),   \
+      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))
 
 /// Gathers four 64-bit floating-point values from memory \a m using scaled
 ///    indexes from the 256-bit vector of [4 x i64] in \a i.
@@ -4837,14 +4749,13 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
-#define _mm256_i64gather_pd(m, i, s) \
-  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
-                                         (double const *)(m), \
-                                         (__v4di)(__m256i)(i), \
-                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
-                                                               _mm256_setzero_pd(), \
-                                                               _CMP_EQ_OQ), \
-                                         (s)))
+#define _mm256_i64gather_pd(m, i, s)                                           \
+  ((__m256d)__builtin_ia32_gatherq_pd256(                                      \
+      (__v4df)_mm256_undefined_pd(), (double const *)(m),                      \
+      (__v4di)(__m256i)(i),                                                    \
+      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(),          \
+                            _CMP_EQ_OQ),                                       \
+      (s)))
 
 /// Gathers four 32-bit floating-point values from memory \a m using scaled
 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
@@ -4873,13 +4784,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm_i32gather_ps(m, i, s) \
-  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
-                                     (float const *)(m), \
-                                     (__v4si)(__m128i)(i), \
-                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
-                                                          _mm_setzero_ps()), \
-                                     (s)))
+#define _mm_i32gather_ps(m, i, s)                                              \
+  ((__m128)__builtin_ia32_gatherd_ps(                                          \
+      (__v4sf)_mm_undefined_ps(), (float const *)(m), (__v4si)(__m128i)(i),    \
+      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
 
 /// Gathers eight 32-bit floating-point values from memory \a m using scaled
 ///    indexes from the 256-bit vector of [8 x i32] in \a i.
@@ -4908,14 +4816,12 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
-#define _mm256_i32gather_ps(m, i, s) \
-  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
-                                        (float const *)(m), \
-                                        (__v8si)(__m256i)(i), \
-                                        (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
-                                                              _mm256_setzero_ps(), \
-                                                              _CMP_EQ_OQ), \
-                                        (s)))
+#define _mm256_i32gather_ps(m, i, s)                                           \
+  ((__m256)__builtin_ia32_gatherd_ps256(                                       \
+      (__v8sf)_mm256_undefined_ps(), (float const *)(m), (__v8si)(__m256i)(i), \
+      (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(),          \
+                            _CMP_EQ_OQ),                                       \
+      (s)))
 
 /// Gathers two 32-bit floating-point values from memory \a m using scaled
 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
@@ -4946,13 +4852,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm_i64gather_ps(m, i, s) \
-  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
-                                     (float const *)(m), \
-                                     (__v2di)(__m128i)(i), \
-                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
-                                                          _mm_setzero_ps()), \
-                                     (s)))
+#define _mm_i64gather_ps(m, i, s)                                              \
+  ((__m128)__builtin_ia32_gatherq_ps(                                          \
+      (__v4sf)_mm_undefined_ps(), (float const *)(m), (__v2di)(__m128i)(i),    \
+      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
 
 /// Gathers four 32-bit floating-point values from memory \a m using scaled
 ///    indexes from the 256-bit vector of [4 x i64] in \a i.
@@ -4981,13 +4884,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm256_i64gather_ps(m, i, s) \
-  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
-                                        (float const *)(m), \
-                                        (__v4di)(__m256i)(i), \
-                                        (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
-                                                             _mm_setzero_ps()), \
-                                        (s)))
+#define _mm256_i64gather_ps(m, i, s)                                           \
+  ((__m128)__builtin_ia32_gatherq_ps256(                                       \
+      (__v4sf)_mm_undefined_ps(), (float const *)(m), (__v4di)(__m256i)(i),    \
+      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
 
 /// Gathers four 32-bit floating-point values from memory \a m using scaled
 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
@@ -5016,9 +4916,9 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm_i32gather_epi32(m, i, s) \
-  ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
-                                     (int const *)(m), (__v4si)(__m128i)(i), \
+#define _mm_i32gather_epi32(m, i, s)                                           \
+  ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(),            \
+                                     (int const *)(m), (__v4si)(__m128i)(i),   \
                                      (__v4si)_mm_set1_epi32(-1), (s)))
 
 /// Gathers eight 32-bit floating-point values from memory \a m using scaled
@@ -5048,10 +4948,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
-#define _mm256_i32gather_epi32(m, i, s) \
-  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
-                                        (int const *)(m), (__v8si)(__m256i)(i), \
-                                        (__v8si)_mm256_set1_epi32(-1), (s)))
+#define _mm256_i32gather_epi32(m, i, s)                                        \
+  ((__m256i)__builtin_ia32_gatherd_d256(                                       \
+      (__v8si)_mm256_undefined_si256(), (int const *)(m),                      \
+      (__v8si)(__m256i)(i), (__v8si)_mm256_set1_epi32(-1), (s)))
 
 /// Gathers two 32-bit integer values from memory \a m using scaled indexes
 ///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
@@ -5082,9 +4982,9 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm_i64gather_epi32(m, i, s) \
-  ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
-                                     (int const *)(m), (__v2di)(__m128i)(i), \
+#define _mm_i64gather_epi32(m, i, s)                                           \
+  ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(),            \
+                                     (int const *)(m), (__v2di)(__m128i)(i),   \
                                      (__v4si)_mm_set1_epi32(-1), (s)))
 
 /// Gathers four 32-bit integer values from memory \a m using scaled indexes
@@ -5114,10 +5014,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm256_i64gather_epi32(m, i, s) \
-  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
-                                        (int const *)(m), (__v4di)(__m256i)(i), \
-                                        (__v4si)_mm_set1_epi32(-1), (s)))
+#define _mm256_i64gather_epi32(m, i, s)                                        \
+  ((__m128i)__builtin_ia32_gatherq_d256(                                       \
+      (__v4si)_mm_undefined_si128(), (int const *)(m), (__v4di)(__m256i)(i),   \
+      (__v4si)_mm_set1_epi32(-1), (s)))
 
 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
 ///    from the 128-bit vector of [4 x i32] in \a i.
@@ -5147,11 +5047,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
-#define _mm_i32gather_epi64(m, i, s) \
-  ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
-                                     (long long const *)(m), \
-                                     (__v4si)(__m128i)(i), \
-                                     (__v2di)_mm_set1_epi64x(-1), (s)))
+#define _mm_i32gather_epi64(m, i, s)                                           \
+  ((__m128i)__builtin_ia32_gatherd_q(                                          \
+      (__v2di)_mm_undefined_si128(), (long long const *)(m),                   \
+      (__v4si)(__m128i)(i), (__v2di)_mm_set1_epi64x(-1), (s)))
 
 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
 ///    from the 128-bit vector of [4 x i32] in \a i.
@@ -5180,11 +5079,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
-#define _mm256_i32gather_epi64(m, i, s) \
-  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
-                                        (long long const *)(m), \
-                                        (__v4si)(__m128i)(i), \
-                                        (__v4di)_mm256_set1_epi64x(-1), (s)))
+#define _mm256_i32gather_epi64(m, i, s)                                        \
+  ((__m256i)__builtin_ia32_gatherd_q256(                                       \
+      (__v4di)_mm256_undefined_si256(), (long long const *)(m),                \
+      (__v4si)(__m128i)(i), (__v4di)_mm256_set1_epi64x(-1), (s)))
 
 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
 ///    from the 128-bit vector of [2 x i64] in \a i.
@@ -5213,11 +5111,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
-#define _mm_i64gather_epi64(m, i, s) \
-  ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
-                                     (long long const *)(m), \
-                                     (__v2di)(__m128i)(i), \
-                                     (__v2di)_mm_set1_epi64x(-1), (s)))
+#define _mm_i64gather_epi64(m, i, s)                                           \
+  ((__m128i)__builtin_ia32_gatherq_q(                                          \
+      (__v2di)_mm_undefined_si128(), (long long const *)(m),                   \
+      (__v2di)(__m128i)(i), (__v2di)_mm_set1_epi64x(-1), (s)))
 
 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
 ///    from the 256-bit vector of [4 x i64] in \a i.
@@ -5246,11 +5143,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y)
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
-#define _mm256_i64gather_epi64(m, i, s) \
-  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
-                                        (long long const *)(m), \
-                                        (__v4di)(__m256i)(i), \
-                                        (__v4di)_mm256_set1_epi64x(-1), (s)))
+#define _mm256_i64gather_epi64(m, i, s)                                        \
+  ((__m256i)__builtin_ia32_gatherq_q256(                                       \
+      (__v4di)_mm256_undefined_si256(), (long long const *)(m),                \
+      (__v4di)(__m256i)(i), (__v4di)_mm256_set1_epi64x(-1), (s)))
 
 #undef __DEFAULT_FN_ATTRS256
 #undef __DEFAULT_FN_ATTRS128
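
(For context, a minimal usage sketch of one of the gather macros reformatted above. This is not part of the patch; the table contents, index values, and helper name are made up for illustration, and it needs an AVX2 target, e.g. -mavx2.)

  #include <immintrin.h>

  /* Gather table[0], table[2], table[4] and table[6] into one vector.
     The scale argument is sizeof(int) == 4 and must be a literal 1, 2, 4 or 8. */
  static inline __m128i gather_even_ints(const int *table) {
    __m128i idx = _mm_set_epi32(6, 4, 2, 0); /* lane 0 holds index 0 */
    return _mm_i32gather_epi32(table, idx, 4);
  }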
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index 7f69019e01b06..dbdd4898d900e 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -3090,8 +3090,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_srl_epi64(__m128i __a,
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi8(__m128i __a,
-                                                            __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cmpeq_epi8(__m128i __a, __m128i __b) {
   return (__m128i)((__v16qi)__a == (__v16qi)__b);
 }
 
@@ -3109,8 +3109,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi8(__m128i __
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi16(__m128i __a,
-                                                             __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cmpeq_epi16(__m128i __a, __m128i __b) {
   return (__m128i)((__v8hi)__a == (__v8hi)__b);
 }
 
@@ -3128,8 +3128,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi16(__m128i _
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi32(__m128i __a,
-                                                             __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cmpeq_epi32(__m128i __a, __m128i __b) {
   return (__m128i)((__v4si)__a == (__v4si)__b);
 }
 
@@ -3148,8 +3148,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi32(__m128i _
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi8(__m128i __a,
-                                                            __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cmpgt_epi8(__m128i __a, __m128i __b) {
   /* This function always performs a signed comparison, but __v16qi is a char
      which may be signed or unsigned, so use __v16qs. */
   return (__m128i)((__v16qs)__a > (__v16qs)__b);
@@ -3170,8 +3170,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi8(__m128i __
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi16(__m128i __a,
-                                                             __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cmpgt_epi16(__m128i __a, __m128i __b) {
   return (__m128i)((__v8hi)__a > (__v8hi)__b);
 }
 
@@ -3190,8 +3190,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi16(__m128i _
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi32(__m128i __a,
-                                                             __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cmpgt_epi32(__m128i __a, __m128i __b) {
   return (__m128i)((__v4si)__a > (__v4si)__b);
 }
 
@@ -3210,8 +3210,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi32(__m128i _
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmplt_epi8(__m128i __a,
-                                                            __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cmplt_epi8(__m128i __a, __m128i __b) {
   return _mm_cmpgt_epi8(__b, __a);
 }
 
@@ -3230,8 +3230,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmplt_epi8(__m128i __
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmplt_epi16(__m128i __a,
-                                                             __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cmplt_epi16(__m128i __a, __m128i __b) {
   return _mm_cmpgt_epi16(__b, __a);
 }
 
@@ -3250,8 +3250,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmplt_epi16(__m128i _
 /// \param __b
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmplt_epi32(__m128i __a,
-                                                             __m128i __b) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cmplt_epi32(__m128i __a, __m128i __b) {
   return _mm_cmpgt_epi32(__b, __a);
 }
 
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index 3f44c786fb75f..f68dd7ed2bcc9 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -1211,8 +1211,8 @@ static __inline__ int __DEFAULT_FN_ATTRS _mm_testnzc_si128(__m128i __M,
 /// \param __V2
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpeq_epi64(__m128i __V1,
-                                                             __m128i __V2) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cmpeq_epi64(__m128i __V1, __m128i __V2) {
   return (__m128i)((__v2di)__V1 == (__v2di)__V2);
 }
 
@@ -2338,8 +2338,8 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_minpos_epu16(__m128i __V) {
 /// \param __V2
 ///    A 128-bit integer vector.
 /// \returns A 128-bit integer vector containing the comparison results.
-static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR _mm_cmpgt_epi64(__m128i __V1,
-                                                             __m128i __V2) {
+static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR
+_mm_cmpgt_epi64(__m128i __V1, __m128i __V2) {
   return (__m128i)((__v2di)__V1 > (__v2di)__V2);
 }
 

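(Illustration only, not part of the patch: with the constexpr attribute on these comparisons, and assuming a clang recent enough to constant-evaluate vector literals and element access in C++, a compile-time check along these lines becomes possible.)

  #include <emmintrin.h>

  #if defined(__cplusplus) && defined(__clang__)
  // A lane compared against itself is equal, so every lane of the result is
  // all-ones; viewed as the two i64 lanes of __m128i, each lane reads as -1.
  constexpr __m128i kA =
      (__m128i)(__v16qi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
  constexpr __m128i kMask = _mm_cmpeq_epi8(kA, kA);
  static_assert(kMask[0] == -1 && kMask[1] == -1,
                "_mm_cmpeq_epi8 folds in a constant expression");
  #endif
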
>From d6c72caab363cf254e8ea899ed76f9f0a024c0ff Mon Sep 17 00:00:00 2001
From: smoke-y <SnowSneeze at protonmail.com>
Date: Thu, 28 Aug 2025 16:57:38 +0530
Subject: [PATCH 4/8] resolved conflict: added CONSTEXPR support back to abs

---
 clang/lib/Headers/avx2intrin.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 7d617e519d7b9..35ca5be1d8cdd 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -112,7 +112,8 @@
 /// \param __a
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi8(__m256i __a) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_abs_epi8(__m256i __a) {
   return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
 }
 

>From 21fee1bcda3777c16d3e018766fd211322849f45 Mon Sep 17 00:00:00 2001
From: smoke-y <SnowSneeze at protonmail.com>
Date: Thu, 28 Aug 2025 16:59:59 +0530
Subject: [PATCH 5/8] resolved conflict: added CONSTEXPR support back to abs

---
 clang/lib/Headers/avx2intrin.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 35ca5be1d8cdd..26c665eea0c99 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -128,7 +128,7 @@ _mm256_abs_epi8(__m256i __a) {
 /// \param __a
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi16(__m256i __a) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi16(__m256i __a) {
   return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
 }
 
@@ -143,7 +143,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi16(__m256i __a) {
 /// \param __a
 ///    A 256-bit vector of [8 x i32].
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_abs_epi32(__m256i __a) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi32(__m256i __a) {
   return (__m256i)__builtin_elementwise_abs((__v8si)__a);
 }
 

>From 399b18277beffcef85a5bb742284d7e90c3a654e Mon Sep 17 00:00:00 2001
From: smoke-y <SnowSneeze at protonmail.com>
Date: Thu, 28 Aug 2025 17:00:55 +0530
Subject: [PATCH 6/8] formatted resolved conflicts

---
 clang/lib/Headers/avx2intrin.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 26c665eea0c99..4b698bf0c6135 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -128,7 +128,8 @@ _mm256_abs_epi8(__m256i __a) {
 /// \param __a
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi16(__m256i __a) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_abs_epi16(__m256i __a) {
   return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
 }
 
@@ -143,7 +144,8 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi16(__m25
 /// \param __a
 ///    A 256-bit vector of [8 x i32].
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR _mm256_abs_epi32(__m256i __a) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
+_mm256_abs_epi32(__m256i __a) {
   return (__m256i)__builtin_elementwise_abs((__v8si)__a);
 }
 

>From 65004c2c687adafa9d1fb0a9685c647c59585f80 Mon Sep 17 00:00:00 2001
From: smoke-y <SnowSneeze at protonmail.com>
Date: Thu, 28 Aug 2025 17:34:46 +0530
Subject: [PATCH 7/8] formatted and changed i->s

---
 clang/lib/Headers/avx2intrin.h         |  5 ++---
 clang/test/CodeGen/X86/avx2-builtins.c |  5 ++++-
 clang/test/CodeGen/X86/sse2-builtins.c | 10 ++++++++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 4b698bf0c6135..58297bde14184 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -97,9 +97,8 @@
 ///     An unsigned immediate value specifying the starting positions of the
 ///     bytes to operate on.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-#define _mm256_mpsadbw_epu8(X, Y, M)                                           \
-  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X),                   \
-                                      (__v32qi)(__m256i)(Y), (int)(M)))
+#define _mm256_mpsadbw_epu8(X, Y, M) \
+  ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
 
 /// Computes the absolute value of each signed byte in the 256-bit integer
 ///    vector \a __a and returns each value in the corresponding byte of
diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c
index 7e25da35020c5..1629c20659597 100644
--- a/clang/test/CodeGen/X86/avx2-builtins.c
+++ b/clang/test/CodeGen/X86/avx2-builtins.c
@@ -318,7 +318,10 @@ __m256i test_mm256_cmpgt_epi8(__m256i a, __m256i b) {
   // CHECK: icmp sgt <32 x i8>
   return _mm256_cmpgt_epi8(a, b);
 }
-TEST_CONSTEXPR(match_v32qi(_mm256_cmpgt_epi8((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, (__m256i)(__v32qi){10, 2, 6, 5, 30, 7, 8, 1, 20, 3, 12, 8, 25, 10, 9, 2, 10, 2, 6, 5, 30, 7, 8, 1, 20, 3, 12, 8, 25, 10, 9, 2}), 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, -1, -1));
+TEST_CONSTEXPR(match_v32qi(_mm256_cmpgt_epi8(
+    (__m256i)(__v32qs){1, -2, 3, -4, 5, -6, 7, -8, 9, -10, 11, -12, 13, -14, 15, -16, -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15, 16},
+    (__m256i)(__v32qs){10, -2, 6, -5, 30, -7, 8, -1, 20, -3, 12, -8, 25, -10, 9, -2, -10, 2, -6, 5, -30, 7, -8, 1, -20, 3, -12, 8, -25, 10, -9, 2}),
+            0, 0, 0, -1, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, -1, -1, -1, -1, -1, -1, -1, 0, -1));
 
 __m256i test_mm256_cmpgt_epi16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_cmpgt_epi16
diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c
index 17030d0b7920e..81fe8136220c8 100644
--- a/clang/test/CodeGen/X86/sse2-builtins.c
+++ b/clang/test/CodeGen/X86/sse2-builtins.c
@@ -296,7 +296,10 @@ __m128i test_mm_cmpgt_epi8(__m128i A, __m128i B) {
   // CHECK: icmp sgt <16 x i8>
   return _mm_cmpgt_epi8(A, B);
 }
-TEST_CONSTEXPR(match_v16qi(_mm_cmpgt_epi8((__m128i)(__v16qi){15,2,8,4,12,6,20,8,25,10,30,12,35,14,40,16}, (__m128i)(__v16qi){10,2,6,4,5,12,14,8,9,20,22,12,26,14,30,16}), -1,0,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0,-1,0));
+TEST_CONSTEXPR(match_v16qi(_mm_cmpgt_epi8(
+    (__m128i)(__v16qs){15,-2,8,-4,12,6,-20,8,25,-10,30,12,-35,14,40,-16},
+    (__m128i)(__v16qs){10,-2,6,-4,5,12,-14,8,9,-20,22,12,-26,14,30,-16}),
+            -1, 0, -1, 0, -1, 0, 0, 0, -1, -1, -1, 0, 0, 0, -1, 0));
 
 __m128i test_mm_cmpgt_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmpgt_epi16
@@ -349,7 +352,10 @@ __m128i test_mm_cmplt_epi8(__m128i A, __m128i B) {
   // CHECK: icmp sgt <16 x i8>
   return _mm_cmplt_epi8(A, B);
 }
-TEST_CONSTEXPR(match_v16qi(_mm_cmplt_epi8((__m128i)(__v16qi){5,2,3,4,1,6,7,8,9,5,11,12,13,10,15,8}, (__m128i)(__v16qi){10,2,6,4,5,12,14,8,9,20,22,12,26,14,30,16}), -1, 0, -1, 0, -1, -1, -1, 0, 0, -1, -1, 0, -1, -1, -1, -1));
+TEST_CONSTEXPR(match_v16qi(_mm_cmplt_epi8(
+    (__m128i)(__v16qs){15,-2,8,-4,12,6,-20,8,25,-10,30,12,-35,14,40,-16},
+    (__m128i)(__v16qs){10,-2,6,-4,5,12,-14,8,9,-20,22,12,-26,14,30,-16}),
+            0, 0, 0, 0, 0, -1, -1, 0, 0, 0, 0, 0, -1, 0, 0, 0));
 
 __m128i test_mm_cmplt_epi16(__m128i A, __m128i B) {
   // CHECK-LABEL: test_mm_cmplt_epi16

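(Hand-checked sketch of what the updated expectations above encode, illustrative only; the real tests go through the TEST_CONSTEXPR/match_v16qi helpers. _mm_cmpgt_epi8 always compares lanes as signed bytes, hence the switch from __v16qi to __v16qs initializers.)

  #include <emmintrin.h>

  #if defined(__cplusplus) && defined(__clang__)
  constexpr __m128i kA = (__m128i)(__v16qs){15, -2, 8, -4, 12, 6, -20, 8,
                                            25, -10, 30, 12, -35, 14, 40, -16};
  constexpr __m128i kB = (__m128i)(__v16qs){10, -2, 6, -4, 5, 12, -14, 8,
                                            9, -20, 22, 12, -26, 14, 30, -16};
  // Lane 0: 15 > 10 holds, so the byte is -1 (0xff); lane 1: -2 > -2 fails, so 0.
  constexpr __v16qs kGt = (__v16qs)_mm_cmpgt_epi8(kA, kB);
  static_assert(kGt[0] == -1 && kGt[1] == 0, "signed per-lane comparison");
  #endif
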
>From 4226edf9ceee9fd745fef14ca1862e8d92b0808e Mon Sep 17 00:00:00 2001
From: smoke-y <SnowSneeze at protonmail.com>
Date: Thu, 28 Aug 2025 19:56:34 +0530
Subject: [PATCH 8/8] cleaned avx2intrin.h

---
 clang/lib/Headers/avx2intrin.h | 852 ++++++++++++++++++---------------
 1 file changed, 477 insertions(+), 375 deletions(-)

diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h
index 58297bde14184..05bd15385d149 100644
--- a/clang/lib/Headers/avx2intrin.h
+++ b/clang/lib/Headers/avx2intrin.h
@@ -99,6 +99,7 @@
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 #define _mm256_mpsadbw_epu8(X, Y, M) \
   ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \
+                                      (__v32qi)(__m256i)(Y), (int)(M)))
 
 /// Computes the absolute value of each signed byte in the 256-bit integer
 ///    vector \a __a and returns each value in the corresponding byte of
@@ -111,9 +112,10 @@
 /// \param __a
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_abs_epi8(__m256i __a) {
-  return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_abs_epi8(__m256i __a)
+{
+    return (__m256i)__builtin_elementwise_abs((__v32qs)__a);
 }
 
 /// Computes the absolute value of each signed 16-bit element in the 256-bit
@@ -127,9 +129,10 @@ _mm256_abs_epi8(__m256i __a) {
 /// \param __a
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_abs_epi16(__m256i __a) {
-  return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_abs_epi16(__m256i __a)
+{
+    return (__m256i)__builtin_elementwise_abs((__v16hi)__a);
 }
 
 /// Computes the absolute value of each signed 32-bit element in the 256-bit
@@ -143,9 +146,10 @@ _mm256_abs_epi16(__m256i __a) {
 /// \param __a
 ///    A 256-bit vector of [8 x i32].
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_abs_epi32(__m256i __a) {
-  return (__m256i)__builtin_elementwise_abs((__v8si)__a);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_abs_epi32(__m256i __a)
+{
+    return (__m256i)__builtin_elementwise_abs((__v8si)__a);
 }
 
 /// Converts the elements of two 256-bit vectors of [16 x i16] to 8-bit
@@ -174,7 +178,8 @@ _mm256_abs_epi32(__m256i __a) {
 ///    result[255:192].
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi16(__m256i __a, __m256i __b) {
+_mm256_packs_epi16(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -205,7 +210,8 @@ _mm256_packs_epi16(__m256i __a, __m256i __b) {
 ///    result[255:192].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packs_epi32(__m256i __a, __m256i __b) {
+_mm256_packs_epi32(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b);
 }
 
@@ -235,7 +241,8 @@ _mm256_packs_epi32(__m256i __a, __m256i __b) {
 ///    result[255:192].
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi16(__m256i __a, __m256i __b) {
+_mm256_packus_epi16(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -266,8 +273,9 @@ _mm256_packus_epi16(__m256i __a, __m256i __b) {
 ///    result[255:192].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_packus_epi32(__m256i __V1, __m256i __V2) {
-  return (__m256i)__builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
+_mm256_packus_epi32(__m256i __V1, __m256i __V2)
+{
+  return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2);
 }
 
 /// Adds 8-bit integers from corresponding bytes of two 256-bit integer
@@ -283,8 +291,9 @@ _mm256_packus_epi32(__m256i __V1, __m256i __V2) {
 /// \param __b
 ///    A 256-bit integer vector containing one of the source operands.
 /// \returns A 256-bit integer vector containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi8(__m256i __a,
-                                                                __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_add_epi8(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v32qu)__a + (__v32qu)__b);
 }
 
@@ -301,8 +310,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi8(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi16(__m256i __a,
-                                                                 __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_add_epi16(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v16hu)__a + (__v16hu)__b);
 }
 
@@ -319,8 +329,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi16(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi32(__m256i __a,
-                                                                 __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_add_epi32(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v8su)__a + (__v8su)__b);
 }
 
@@ -337,8 +348,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi32(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [4 x i64] containing one of the source operands.
 /// \returns A 256-bit vector of [4 x i64] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_add_epi64(__m256i __a,
-                                                                 __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_add_epi64(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v4du)__a + (__v4du)__b);
 }
 
@@ -436,8 +448,8 @@ _mm256_adds_epu16(__m256i __a, __m256i __b) {
 /// \param n
 ///    An immediate value specifying the number of bytes to shift.
 /// \returns A 256-bit integer vector containing the result.
-#define _mm256_alignr_epi8(a, b, n)                                            \
-  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a),                   \
+#define _mm256_alignr_epi8(a, b, n) \
+  ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \
                                       (__v32qi)(__m256i)(b), (n)))
 
 /// Computes the bitwise AND of the 256-bit integer vectors in \a __a and
@@ -453,7 +465,8 @@ _mm256_adds_epu16(__m256i __a, __m256i __b) {
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_and_si256(__m256i __a, __m256i __b) {
+_mm256_and_si256(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v4du)__a & (__v4du)__b);
 }
 
@@ -470,7 +483,8 @@ _mm256_and_si256(__m256i __a, __m256i __b) {
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_andnot_si256(__m256i __a, __m256i __b) {
+_mm256_andnot_si256(__m256i __a, __m256i __b)
+{
   return (__m256i)(~(__v4du)__a & (__v4du)__b);
 }
 
@@ -494,8 +508,9 @@ _mm256_andnot_si256(__m256i __a, __m256i __b) {
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu8(__m256i __a,
-                                                                __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_avg_epu8(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b);
 }
 
@@ -519,8 +534,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu8(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu16(__m256i __a,
-                                                                 __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_avg_epu16(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -554,9 +570,10 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu16(__m256i __a,
 ///    \a __V2.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) {
+_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M)
+{
   return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2,
-                                             (__v32qi)__M);
+                                              (__v32qi)__M);
 }
 
 /// Merges 16-bit integer values from either of the two 256-bit vectors
@@ -596,8 +613,8 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) {
 ///    \a M[0] determines the source for elements 0 and 8, \a M[1] for
 ///    elements 1 and 9, and so forth.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-#define _mm256_blend_epi16(V1, V2, M)                                          \
-  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1),                  \
+#define _mm256_blend_epi16(V1, V2, M) \
+  ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \
                                       (__v16hi)(__m256i)(V2), (int)(M)))
 
 /// Compares corresponding bytes in the 256-bit integer vectors in \a __a and
@@ -621,7 +638,8 @@ _mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) {
 ///    A 256-bit integer vector containing one of the inputs.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpeq_epi8(__m256i __a, __m256i __b) {
+_mm256_cmpeq_epi8(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v32qi)__a == (__v32qi)__b);
 }
 
@@ -646,7 +664,8 @@ _mm256_cmpeq_epi8(__m256i __a, __m256i __b) {
 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpeq_epi16(__m256i __a, __m256i __b) {
+_mm256_cmpeq_epi16(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v16hi)__a == (__v16hi)__b);
 }
 
@@ -671,7 +690,8 @@ _mm256_cmpeq_epi16(__m256i __a, __m256i __b) {
 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpeq_epi32(__m256i __a, __m256i __b) {
+_mm256_cmpeq_epi32(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v8si)__a == (__v8si)__b);
 }
 
@@ -696,7 +716,8 @@ _mm256_cmpeq_epi32(__m256i __a, __m256i __b) {
 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpeq_epi64(__m256i __a, __m256i __b) {
+_mm256_cmpeq_epi64(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v4di)__a == (__v4di)__b);
 }
 
@@ -721,7 +742,8 @@ _mm256_cmpeq_epi64(__m256i __a, __m256i __b) {
 ///    A 256-bit integer vector containing one of the inputs.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpgt_epi8(__m256i __a, __m256i __b) {
+_mm256_cmpgt_epi8(__m256i __a, __m256i __b)
+{
   /* This function always performs a signed comparison, but __v32qi is a char
      which may be signed or unsigned, so use __v32qs. */
   return (__m256i)((__v32qs)__a > (__v32qs)__b);
@@ -748,7 +770,8 @@ _mm256_cmpgt_epi8(__m256i __a, __m256i __b) {
 ///    A 256-bit vector of [16 x i16] containing one of the inputs.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpgt_epi16(__m256i __a, __m256i __b) {
+_mm256_cmpgt_epi16(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v16hi)__a > (__v16hi)__b);
 }
 
@@ -773,7 +796,8 @@ _mm256_cmpgt_epi16(__m256i __a, __m256i __b) {
 ///    A 256-bit vector of [8 x i32] containing one of the inputs.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpgt_epi32(__m256i __a, __m256i __b) {
+_mm256_cmpgt_epi32(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v8si)__a > (__v8si)__b);
 }
 
@@ -798,7 +822,8 @@ _mm256_cmpgt_epi32(__m256i __a, __m256i __b) {
 ///    A 256-bit vector of [4 x i64] containing one of the inputs.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_cmpgt_epi64(__m256i __a, __m256i __b) {
+_mm256_cmpgt_epi64(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v4di)__a > (__v4di)__b);
 }
 
@@ -832,9 +857,10 @@ _mm256_cmpgt_epi64(__m256i __a, __m256i __b) {
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi16(__m256i __a,
-                                                                  __m256i __b) {
-  return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_hadd_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Horizontally adds the adjacent pairs of 32-bit integers from two 256-bit
@@ -863,9 +889,10 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi16(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the sums.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi32(__m256i __a,
-                                                                  __m256i __b) {
-  return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_hadd_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b);
 }
 
 /// Horizontally adds the adjacent pairs of 16-bit integers from two 256-bit
@@ -898,8 +925,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hadd_epi32(__m256i __a,
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the sums.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hadds_epi16(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
+_mm256_hadds_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
@@ -932,9 +960,10 @@ _mm256_hadds_epi16(__m256i __a, __m256i __b) {
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi16(__m256i __a,
-                                                                  __m256i __b) {
-  return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_hsub_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Horizontally subtracts adjacent pairs of 32-bit integers from two 256-bit
@@ -963,9 +992,10 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi16(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi32(__m256i __a,
-                                                                  __m256i __b) {
-  return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_hsub_epi32(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b);
 }
 
 /// Horizontally subtracts adjacent pairs of 16-bit integers from two 256-bit
@@ -999,8 +1029,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_hsub_epi32(__m256i __a,
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the differences.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_hsubs_epi16(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
+_mm256_hsubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Multiplies each unsigned byte from the 256-bit integer vector in \a __a
@@ -1028,8 +1059,9 @@ _mm256_hsubs_epi16(__m256i __a, __m256i __b) {
 ///    A 256-bit vector containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maddubs_epi16(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
+_mm256_maddubs_epi16(__m256i __a, __m256i __b)
+{
+    return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b);
 }
 
 /// Multiplies corresponding 16-bit elements of two 256-bit vectors of
@@ -1058,8 +1090,9 @@ _mm256_maddubs_epi16(__m256i __a, __m256i __b) {
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_madd_epi16(__m256i __a,
-                                                                  __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_madd_epi16(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -1076,8 +1109,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_madd_epi16(__m256i __a,
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi8(__m256i __a,
-                                                                __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_max_epi8(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b);
 }
 
@@ -1094,8 +1128,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi8(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi16(__m256i __a,
-                                                                 __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_max_epi16(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -1112,8 +1147,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi16(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [8 x i32].
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi32(__m256i __a,
-                                                                 __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_max_epi32(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b);
 }
 
@@ -1130,8 +1166,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epi32(__m256i __a,
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu8(__m256i __a,
-                                                                __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_max_epu8(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b);
 }
 
@@ -1148,8 +1185,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu8(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu16(__m256i __a,
-                                                                 __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_max_epu16(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b);
 }
 
@@ -1166,8 +1204,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu16(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [8 x i32].
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu32(__m256i __a,
-                                                                 __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_max_epu32(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b);
 }
 
@@ -1184,8 +1223,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_max_epu32(__m256i __a,
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi8(__m256i __a,
-                                                                __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_min_epi8(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b);
 }
 
@@ -1202,8 +1242,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi8(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi16(__m256i __a,
-                                                                 __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_min_epi16(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -1220,8 +1261,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi16(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [8 x i32].
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi32(__m256i __a,
-                                                                 __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_min_epi32(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b);
 }
 
@@ -1238,8 +1280,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epi32(__m256i __a,
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu8(__m256i __a,
-                                                                __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_min_epu8(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b);
 }
 
@@ -1256,8 +1299,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu8(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu16(__m256i __a,
-                                                                 __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_min_epu16(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b);
 }
 
@@ -1274,8 +1318,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu16(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [8 x i32].
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu32(__m256i __a,
-                                                                 __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_min_epu32(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b);
 }
 
@@ -1296,7 +1341,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_min_epu32(__m256i __a,
 /// \param __a
 ///    A 256-bit integer vector containing the source bytes.
 /// \returns The 32-bit integer mask.
-static __inline__ int __DEFAULT_FN_ATTRS256 _mm256_movemask_epi8(__m256i __a) {
+static __inline__ int __DEFAULT_FN_ATTRS256
+_mm256_movemask_epi8(__m256i __a)
+{
   return __builtin_ia32_pmovmskb256((__v32qi)__a);
 }
 
@@ -1324,7 +1371,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepi8_epi16(__m128i __V) {
   /* This function always performs a signed extension, but __v16qi is a char
      which may be signed or unsigned, so use __v16qs. */
-  return (__m256i) __builtin_convertvector((__v16qs)__V, __v16hi);
+  return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi);
 }
 
 /// Sign-extends bytes from the lower half of the 128-bit integer vector in
@@ -1351,10 +1398,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepi8_epi32(__m128i __V) {
   /* This function always performs a signed extension, but __v16qi is a char
      which may be signed or unsigned, so use __v16qs. */
-  return (__m256i) __builtin_convertvector(
-      __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6,
-                              7),
-      __v8si);
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
 }
 
 /// Sign-extends the first four bytes from the 128-bit integer vector in
@@ -1380,8 +1424,7 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepi8_epi64(__m128i __V) {
   /* This function always performs a signed extension, but __v16qi is a char
      which may be signed or unsigned, so use __v16qs. */
-  return (__m256i) __builtin_convertvector(
-      __builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di);
 }
 
 /// Sign-extends 16-bit elements from the 128-bit vector of [8 x i16] in
@@ -1406,7 +1449,7 @@ _mm256_cvtepi8_epi64(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepi16_epi32(__m128i __V) {
-  return (__m256i) __builtin_convertvector((__v8hi)__V, __v8si);
+  return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si);
 }
 
 /// Sign-extends 16-bit elements from the lower half of the 128-bit vector of
@@ -1430,8 +1473,7 @@ _mm256_cvtepi16_epi32(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepi16_epi64(__m128i __V) {
-  return (__m256i) __builtin_convertvector(
-      __builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di);
 }
 
 /// Sign-extends 32-bit elements from the 128-bit vector of [4 x i32] in
@@ -1455,7 +1497,7 @@ _mm256_cvtepi16_epi64(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepi32_epi64(__m128i __V) {
-  return (__m256i) __builtin_convertvector((__v4si)__V, __v4di);
+  return (__m256i)__builtin_convertvector((__v4si)__V, __v4di);
 }
 
 /// Zero-extends bytes from the 128-bit integer vector in \a __V and returns
@@ -1480,7 +1522,7 @@ _mm256_cvtepi32_epi64(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepu8_epi16(__m128i __V) {
-  return (__m256i) __builtin_convertvector((__v16qu)__V, __v16hi);
+  return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi);
 }
 
 /// Zero-extends bytes from the lower half of the 128-bit integer vector in
@@ -1505,10 +1547,7 @@ _mm256_cvtepu8_epi16(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepu8_epi32(__m128i __V) {
-  return (__m256i) __builtin_convertvector(
-      __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6,
-                              7),
-      __v8si);
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si);
 }
 
 /// Zero-extends the first four bytes from the 128-bit integer vector in
@@ -1532,8 +1571,7 @@ _mm256_cvtepu8_epi32(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepu8_epi64(__m128i __V) {
-  return (__m256i) __builtin_convertvector(
-      __builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di);
 }
 
 /// Zero-extends 16-bit elements from the 128-bit vector of [8 x i16] in
@@ -1558,7 +1596,7 @@ _mm256_cvtepu8_epi64(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepu16_epi32(__m128i __V) {
-  return (__m256i) __builtin_convertvector((__v8hu)__V, __v8si);
+  return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si);
 }
 
 /// Zero-extends 16-bit elements from the lower half of the 128-bit vector of
@@ -1582,8 +1620,7 @@ _mm256_cvtepu16_epi32(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepu16_epi64(__m128i __V) {
-  return (__m256i) __builtin_convertvector(
-      __builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
+  return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di);
 }
 
 /// Zero-extends 32-bit elements from the 128-bit vector of [4 x i32] in
@@ -1607,7 +1644,7 @@ _mm256_cvtepu16_epi64(__m128i __V) {
 ///    values.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_cvtepu32_epi64(__m128i __V) {
-  return (__m256i) __builtin_convertvector((__v4su)__V, __v4di);
+  return (__m256i)__builtin_convertvector((__v4su)__V, __v4di);
 }
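
(Not part of the patch: a minimal compile-time sketch of what the constexpr attribute on these widening conversions allows. It assumes a recent clang in C++ mode with -mavx2, and that _mm_set1_epi32 is already constexpr, which is not shown in this hunk.)

    #include <immintrin.h>

    // Each unsigned 32-bit lane (7) is zero-extended to a 64-bit lane at compile time.
    constexpr __m256i widened = _mm256_cvtepu32_epi64(_mm_set1_epi32(7));
    static_assert(widened[0] == 7 && widened[3] == 7, "zero-extension folds");
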
 
 /// Multiplies signed 32-bit integers from even-numbered elements of two
@@ -1657,7 +1694,8 @@ _mm256_mul_epi32(__m256i __a, __m256i __b) {
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the rounded products.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mulhrs_epi16(__m256i __a, __m256i __b) {
+_mm256_mulhrs_epi16(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -1675,7 +1713,8 @@ _mm256_mulhrs_epi16(__m256i __a, __m256i __b) {
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the products.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_mulhi_epu16(__m256i __a, __m256i __b) {
+_mm256_mulhi_epu16(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_ia32_pmulhuw256((__v16hu)__a, (__v16hu)__b);
 }
 
@@ -1693,7 +1732,8 @@ _mm256_mulhi_epu16(__m256i __a, __m256i __b) {
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the products.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_mulhi_epi16(__m256i __a, __m256i __b) {
+_mm256_mulhi_epi16(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b);
 }
 
@@ -1711,7 +1751,8 @@ _mm256_mulhi_epi16(__m256i __a, __m256i __b) {
 ///    A 256-bit vector of [16 x i16] containing one of the source operands.
 /// \returns A 256-bit vector of [16 x i16] containing the products.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_mullo_epi16(__m256i __a, __m256i __b) {
+_mm256_mullo_epi16(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v16hu)__a * (__v16hu)__b);
 }
 
@@ -1728,8 +1769,9 @@ _mm256_mullo_epi16(__m256i __a, __m256i __b) {
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing one of the source operands.
 /// \returns A 256-bit vector of [8 x i32] containing the products.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_mullo_epi32(__m256i __a, __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_mullo_epi32(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v8su)__a * (__v8su)__b);
 }
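
(Again outside the patch: a small sketch of the "high 16 bits of the 32-bit product" behaviour documented above, under the same assumptions, plus a constexpr _mm256_set1_epi16.)

    #include <immintrin.h>

    // 0x4000 * 4 = 0x00010000, so the high 16 bits of every product are 1.
    constexpr __m256i hi = _mm256_mulhi_epu16(_mm256_set1_epi16(0x4000),
                                              _mm256_set1_epi16(4));
    static_assert(hi[0] == 0x0001000100010001LL, "each 16-bit lane holds 1");
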
 
@@ -1771,7 +1813,8 @@ _mm256_mul_epu32(__m256i __a, __m256i __b) {
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_or_si256(__m256i __a, __m256i __b) {
+_mm256_or_si256(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v4du)__a | (__v4du)__b);
 }
 
@@ -1814,8 +1857,9 @@ _mm256_or_si256(__m256i __a, __m256i __b) {
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sad_epu8(__m256i __a,
-                                                                __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sad_epu8(__m256i __a, __m256i __b)
+{
   return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b);
 }
 
@@ -1853,7 +1897,8 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sad_epu8(__m256i __a,
 ///    to copy to the result byte.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_shuffle_epi8(__m256i __a, __m256i __b) {
+_mm256_shuffle_epi8(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b);
 }
 
@@ -1887,7 +1932,7 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b) {
 ///    result, \a imm[3:2] specifies the index for elements 1 and 5, and so
 ///    forth.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-#define _mm256_shuffle_epi32(a, imm)                                           \
+#define _mm256_shuffle_epi32(a, imm) \
   ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm)))
 
 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] in \a a
@@ -1923,7 +1968,7 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b) {
 ///    result, \a imm[3:2] specifies the index for elements 5 and 9, and so
 ///    forth. Indexes are offset by 4 (so 0 means index 4, and so forth).
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-#define _mm256_shufflehi_epi16(a, imm)                                         \
+#define _mm256_shufflehi_epi16(a, imm) \
   ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm)))
 
 /// Shuffles 16-bit integers from the 256-bit vector of [16 x i16] \a a
@@ -1960,7 +2005,7 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b) {
 ///    result, \a imm[3:2] specifies the index for elements 1 and 9, and so
 ///    forth.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-#define _mm256_shufflelo_epi16(a, imm)                                         \
+#define _mm256_shufflelo_epi16(a, imm) \
   ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm)))
 
 /// Sets each byte of the result to the corresponding byte of the 256-bit
@@ -1978,9 +2023,10 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b) {
 /// \param __b
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi8(__m256i __a,
-                                                                 __m256i __b) {
-  return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sign_epi8(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b);
 }
 
 /// Sets each element of the result to the corresponding element of the
@@ -1998,9 +2044,10 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi8(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [16 x i16].
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi16(__m256i __a,
-                                                                  __m256i __b) {
-  return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sign_epi16(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b);
 }
 
 /// Sets each element of the result to the corresponding element of the
@@ -2018,9 +2065,10 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi16(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [8 x i32].
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi32(__m256i __a,
-                                                                  __m256i __b) {
-  return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sign_epi32(__m256i __a, __m256i __b)
+{
+  return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b);
 }
 
 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
@@ -2040,9 +2088,8 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi32(__m256i __a,
 /// \param imm
 ///     An unsigned immediate value specifying the shift count (in bytes).
 /// \returns A 256-bit integer vector containing the result.
-#define _mm256_slli_si256(a, imm)                                              \
-  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),          \
-                                                (int)(imm)))
+#define _mm256_slli_si256(a, imm) \
+  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
 
 /// Shifts each 128-bit half of the 256-bit integer vector \a a left by
 ///    \a imm bytes, shifting in zero bytes, and returns the result. If \a imm
@@ -2061,9 +2108,8 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi32(__m256i __a,
 /// \param imm
 ///    An unsigned immediate value specifying the shift count (in bytes).
 /// \returns A 256-bit integer vector containing the result.
-#define _mm256_bslli_epi128(a, imm)                                            \
-  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a),          \
-                                                (int)(imm)))
+#define _mm256_bslli_epi128(a, imm) \
+  ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm)))
 
 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
 ///    left by \a __count bits, shifting in zero bits, and returns the result.
@@ -2078,8 +2124,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sign_epi32(__m256i __a,
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi16(__m256i __a,
-                                                                  int __count) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_slli_epi16(__m256i __a, int __count)
+{
   return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count);
 }
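
(Illustrative only, helper name invented: the two shift flavours documented above differ only in how the count is supplied, as this runtime sketch shows.)

    #include <immintrin.h>

    __m256i shift_both_ways(__m256i v) {
      __m256i by_imm = _mm256_slli_epi16(v, 2);                    // count as an immediate
      __m256i by_vec = _mm256_sll_epi16(v, _mm_cvtsi32_si128(2));  // count in the low 64 bits
      return _mm256_xor_si256(by_imm, by_vec);                     // all zero: both forms agree
    }
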
 
@@ -2099,7 +2146,8 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi16(__m256i __a,
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sll_epi16(__m256i __a, __m128i __count) {
+_mm256_sll_epi16(__m256i __a, __m128i __count)
+{
   return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count);
 }
 
@@ -2116,8 +2164,9 @@ _mm256_sll_epi16(__m256i __a, __m128i __count) {
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi32(__m256i __a,
-                                                                  int __count) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_slli_epi32(__m256i __a, int __count)
+{
   return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count);
 }
 
@@ -2137,7 +2186,8 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi32(__m256i __a,
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sll_epi32(__m256i __a, __m128i __count) {
+_mm256_sll_epi32(__m256i __a, __m128i __count)
+{
   return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count);
 }
 
@@ -2154,8 +2204,9 @@ _mm256_sll_epi32(__m256i __a, __m128i __count) {
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi64(__m256i __a,
-                                                                  int __count) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_slli_epi64(__m256i __a, int __count)
+{
   return __builtin_ia32_psllqi256((__v4di)__a, __count);
 }
 
@@ -2175,7 +2226,8 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_slli_epi64(__m256i __a,
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sll_epi64(__m256i __a, __m128i __count) {
+_mm256_sll_epi64(__m256i __a, __m128i __count)
+{
   return __builtin_ia32_psllq256((__v4di)__a, __count);
 }
 
@@ -2193,8 +2245,9 @@ _mm256_sll_epi64(__m256i __a, __m128i __count) {
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi16(__m256i __a,
-                                                                  int __count) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srai_epi16(__m256i __a, int __count)
+{
   return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count);
 }
 
@@ -2215,7 +2268,8 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi16(__m256i __a,
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sra_epi16(__m256i __a, __m128i __count) {
+_mm256_sra_epi16(__m256i __a, __m128i __count)
+{
   return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count);
 }
 
@@ -2233,8 +2287,9 @@ _mm256_sra_epi16(__m256i __a, __m128i __count) {
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi32(__m256i __a,
-                                                                  int __count) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srai_epi32(__m256i __a, int __count)
+{
   return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count);
 }
 
@@ -2255,7 +2310,8 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srai_epi32(__m256i __a,
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_sra_epi32(__m256i __a, __m128i __count) {
+_mm256_sra_epi32(__m256i __a, __m128i __count)
+{
   return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count);
 }
 
@@ -2276,7 +2332,7 @@ _mm256_sra_epi32(__m256i __a, __m128i __count) {
 /// \param imm
 ///    An unsigned immediate value specifying the shift count (in bytes).
 /// \returns A 256-bit integer vector containing the result.
-#define _mm256_srli_si256(a, imm)                                              \
+#define _mm256_srli_si256(a, imm) \
   ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
 
 /// Shifts each 128-bit half of the 256-bit integer vector in \a a right by
@@ -2296,7 +2352,7 @@ _mm256_sra_epi32(__m256i __a, __m128i __count) {
 /// \param imm
 ///     An unsigned immediate value specifying the shift count (in bytes).
 /// \returns A 256-bit integer vector containing the result.
-#define _mm256_bsrli_epi128(a, imm)                                            \
+#define _mm256_bsrli_epi128(a, imm) \
   ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm)))
 
 /// Shifts each 16-bit element of the 256-bit vector of [16 x i16] in \a __a
@@ -2312,8 +2368,9 @@ _mm256_sra_epi32(__m256i __a, __m128i __count) {
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [16 x i16] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi16(__m256i __a,
-                                                                  int __count) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srli_epi16(__m256i __a, int __count)
+{
   return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count);
 }
 
@@ -2333,7 +2390,8 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi16(__m256i __a,
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srl_epi16(__m256i __a, __m128i __count) {
+_mm256_srl_epi16(__m256i __a, __m128i __count)
+{
   return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count);
 }
 
@@ -2350,8 +2408,9 @@ _mm256_srl_epi16(__m256i __a, __m128i __count) {
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi32(__m256i __a,
-                                                                  int __count) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srli_epi32(__m256i __a, int __count)
+{
   return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count);
 }
 
@@ -2371,7 +2430,8 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi32(__m256i __a,
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srl_epi32(__m256i __a, __m128i __count) {
+_mm256_srl_epi32(__m256i __a, __m128i __count)
+{
   return (__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count);
 }
 
@@ -2388,8 +2448,9 @@ _mm256_srl_epi32(__m256i __a, __m128i __count) {
 /// \param __count
 ///    An unsigned integer value specifying the shift count (in bits).
 /// \returns A 256-bit vector of [4 x i64] containing the result.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi64(__m256i __a,
-                                                                  int __count) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_srli_epi64(__m256i __a, int __count)
+{
   return __builtin_ia32_psrlqi256((__v4di)__a, __count);
 }
 
@@ -2409,7 +2470,8 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_srli_epi64(__m256i __a,
 ///    shift count (in bits). The upper element is ignored.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_srl_epi64(__m256i __a, __m128i __count) {
+_mm256_srl_epi64(__m256i __a, __m128i __count)
+{
   return __builtin_ia32_psrlq256((__v4di)__a, __count);
 }
 
@@ -2434,8 +2496,9 @@ _mm256_srl_epi64(__m256i __a, __m128i __count) {
 /// \param __b
 ///    A 256-bit integer vector containing the subtrahends.
 /// \returns A 256-bit integer vector containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi8(__m256i __a,
-                                                                __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sub_epi8(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v32qu)__a - (__v32qu)__b);
 }
 
@@ -2460,8 +2523,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi8(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [16 x i16] containing the subtrahends.
 /// \returns A 256-bit vector of [16 x i16] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi16(__m256i __a,
-                                                                 __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sub_epi16(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v16hu)__a - (__v16hu)__b);
 }
 
@@ -2485,8 +2549,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi16(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [8 x i32] containing the subtrahends.
 /// \returns A 256-bit vector of [8 x i32] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi32(__m256i __a,
-                                                                 __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sub_epi32(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v8su)__a - (__v8su)__b);
 }
 
@@ -2510,8 +2575,9 @@ static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi32(__m256i __a,
 /// \param __b
 ///    A 256-bit vector of [4 x i64] containing the subtrahends.
 /// \returns A 256-bit vector of [4 x i64] containing the differences.
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_sub_epi64(__m256i __a,
-                                                                 __m256i __b) {
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_sub_epi64(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v4du)__a - (__v4du)__b);
 }
 
@@ -2646,11 +2712,7 @@ _mm256_subs_epu16(__m256i __a, __m256i __b) {
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpackhi_epi8(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector(
-      (__v32qi)__a, (__v32qi)__b, 8, 32 + 8, 9, 32 + 9, 10, 32 + 10, 11,
-      32 + 11, 12, 32 + 12, 13, 32 + 13, 14, 32 + 14, 15, 32 + 15, 24, 32 + 24,
-      25, 32 + 25, 26, 32 + 26, 27, 32 + 27, 28, 32 + 28, 29, 32 + 29, 30,
-      32 + 30, 31, 32 + 31);
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31);
 }
 
 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
@@ -2684,9 +2746,7 @@ _mm256_unpackhi_epi8(__m256i __a, __m256i __b) {
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpackhi_epi16(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector(
-      (__v16hi)__a, (__v16hi)__b, 4, 16 + 4, 5, 16 + 5, 6, 16 + 6, 7, 16 + 7,
-      12, 16 + 12, 13, 16 + 13, 14, 16 + 14, 15, 16 + 15);
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
 }
 
 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
@@ -2719,8 +2779,7 @@ _mm256_unpackhi_epi16(__m256i __a, __m256i __b) {
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpackhi_epi32(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8 + 2, 3,
-                                          8 + 3, 6, 8 + 6, 7, 8 + 7);
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7);
 }
 
 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
@@ -2749,8 +2808,7 @@ _mm256_unpackhi_epi32(__m256i __a, __m256i __b) {
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpackhi_epi64(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4 + 1, 3,
-                                          4 + 3);
+  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3);
 }
 
 /// Unpacks and interleaves 8-bit integers from parts of the 256-bit integer
@@ -2783,10 +2841,7 @@ _mm256_unpackhi_epi64(__m256i __a, __m256i __b) {
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpacklo_epi8(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector(
-      (__v32qi)__a, (__v32qi)__b, 0, 32 + 0, 1, 32 + 1, 2, 32 + 2, 3, 32 + 3, 4,
-      32 + 4, 5, 32 + 5, 6, 32 + 6, 7, 32 + 7, 16, 32 + 16, 17, 32 + 17, 18,
-      32 + 18, 19, 32 + 19, 20, 32 + 20, 21, 32 + 21, 22, 32 + 22, 23, 32 + 23);
+  return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23);
 }
 
 /// Unpacks and interleaves 16-bit integers from parts of the 256-bit vectors
@@ -2820,9 +2875,7 @@ _mm256_unpacklo_epi8(__m256i __a, __m256i __b) {
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpacklo_epi16(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector(
-      (__v16hi)__a, (__v16hi)__b, 0, 16 + 0, 1, 16 + 1, 2, 16 + 2, 3, 16 + 3, 8,
-      16 + 8, 9, 16 + 9, 10, 16 + 10, 11, 16 + 11);
+  return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11);
 }
 
 /// Unpacks and interleaves 32-bit integers from parts of the 256-bit vectors
@@ -2855,8 +2908,7 @@ _mm256_unpacklo_epi16(__m256i __a, __m256i __b) {
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpacklo_epi32(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8 + 0, 1,
-                                          8 + 1, 4, 8 + 4, 5, 8 + 5);
+  return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5);
 }
 
 /// Unpacks and interleaves 64-bit integers from parts of the 256-bit vectors
@@ -2885,8 +2937,7 @@ _mm256_unpacklo_epi32(__m256i __a, __m256i __b) {
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_unpacklo_epi64(__m256i __a, __m256i __b) {
-  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4 + 0, 2,
-                                          4 + 2);
+  return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2);
 }
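
(A hedged compile-time sketch of the a0, b0, a2, b2 interleave pattern described above; assumes _mm256_set1_epi64x is constexpr.)

    #include <immintrin.h>

    // Interleaves the low 64-bit elements of each 128-bit half: {1, 2, 1, 2}.
    constexpr __m256i lo = _mm256_unpacklo_epi64(_mm256_set1_epi64x(1),
                                                 _mm256_set1_epi64x(2));
    static_assert(lo[0] == 1 && lo[1] == 2 && lo[3] == 2, "a0, b0, a2, b2");
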
 
 /// Computes the bitwise XOR of the 256-bit integer vectors in \a __a and
@@ -2902,7 +2953,8 @@ _mm256_unpacklo_epi64(__m256i __a, __m256i __b) {
 ///    A 256-bit integer vector.
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_xor_si256(__m256i __a, __m256i __b) {
+_mm256_xor_si256(__m256i __a, __m256i __b)
+{
   return (__m256i)((__v4du)__a ^ (__v4du)__b);
 }
 
@@ -2918,7 +2970,8 @@ _mm256_xor_si256(__m256i __a, __m256i __b) {
 ///    A pointer to the 32-byte aligned memory containing the vector to load.
 /// \returns A 256-bit integer vector loaded from memory.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_stream_load_si256(const void *__V) {
+_mm256_stream_load_si256(const void *__V)
+{
   typedef __v4di __v4di_aligned __attribute__((aligned(32)));
   return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V);
 }
@@ -2968,8 +3021,7 @@ _mm_broadcastsd_pd(__m128d __a) {
 /// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_broadcastss_ps(__m128 __X) {
-  return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0,
-                                         0, 0, 0, 0);
+  return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
 /// Broadcasts the 64-bit floating-point value from the low element of the
@@ -3038,8 +3090,8 @@ _mm256_broadcastsi128_si256(__m128i __X) {
 ///    corresponds to the index of a copied value. When a mask bit is 0, the
 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
 /// \returns A 128-bit vector of [4 x i32] containing the result.
-#define _mm_blend_epi32(V1, V2, M)                                             \
-  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1),                   \
+#define _mm_blend_epi32(V1, V2, M) \
+  ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \
                                       (__v4si)(__m128i)(V2), (int)(M)))
 
 /// Merges 32-bit integer elements from either of the two 256-bit vectors of
@@ -3075,8 +3127,8 @@ _mm256_broadcastsi128_si256(__m128i __X) {
 ///    corresponds to the index of a copied value. When a mask bit is 0, the
 ///    element is copied from \a V1; otherwise, it is copied from \a V2.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
-#define _mm256_blend_epi32(V1, V2, M)                                          \
-  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1),                   \
+#define _mm256_blend_epi32(V1, V2, M) \
+  ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \
                                       (__v8si)(__m256i)(V2), (int)(M)))
 
 /// Broadcasts the low byte from the 128-bit integer vector in \a __X to all
@@ -3091,9 +3143,7 @@ _mm256_broadcastsi128_si256(__m128i __X) {
 /// \returns A 256-bit integer vector containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_broadcastb_epi8(__m128i __X) {
-  return (__m256i)__builtin_shufflevector(
-      (__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in \a __X
@@ -3108,8 +3158,7 @@ _mm256_broadcastb_epi8(__m128i __X) {
 /// \returns A 256-bit vector of [16 x i16] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_broadcastw_epi16(__m128i __X) {
-  return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0,
-                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+  return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
@@ -3124,8 +3173,7 @@ _mm256_broadcastw_epi16(__m128i __X) {
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
 _mm256_broadcastd_epi32(__m128i __X) {
-  return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0,
-                                          0, 0, 0, 0);
+  return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
 /// Broadcasts the low element from the 128-bit vector of [2 x i64] in \a __X
@@ -3155,9 +3203,7 @@ _mm256_broadcastq_epi64(__m128i __X) {
 /// \returns A 128-bit integer vector containing the result.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_broadcastb_epi8(__m128i __X) {
-  return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0,
-                                          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                                          0);
+  return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
 }
 
 /// Broadcasts the low element from the 128-bit vector of [8 x i16] in
@@ -3172,8 +3218,7 @@ _mm_broadcastb_epi8(__m128i __X) {
 /// \returns A 128-bit vector of [8 x i16] containing the result.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
 _mm_broadcastw_epi16(__m128i __X) {
-  return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0,
-                                          0, 0, 0, 0);
+  return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0);
 }
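
(A short compile-time sketch of the broadcast behaviour, assuming _mm_set_epi32 is constexpr.)

    #include <immintrin.h>

    // Splats the low 32-bit element of __X into every lane of the 256-bit result.
    constexpr __m256i splat = _mm256_broadcastd_epi32(_mm_set_epi32(0, 0, 0, 42));
    static_assert(splat[0] == (42LL | (42LL << 32)), "every 32-bit lane holds 42");
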
 
 /// Broadcasts the low element from the 128-bit vector of [4 x i32] in \a __X
@@ -3229,7 +3274,8 @@ _mm_broadcastq_epi64(__m128i __X) {
 ///    \a __a.
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) {
+_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b)
+{
   return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b);
 }
 
@@ -3260,7 +3306,7 @@ _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) {
 ///    \a M[1:0] specifies the index in \a a for element 0 of the result,
 ///    \a M[3:2] specifies the index for element 1, and so forth.
 /// \returns A 256-bit vector of [4 x double] containing the result.
-#define _mm256_permute4x64_pd(V, M)                                            \
+#define _mm256_permute4x64_pd(V, M) \
   ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M)))
 
 /// Sets the result's 256-bit vector of [8 x float] to copies of elements of
@@ -3286,7 +3332,8 @@ _mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) {
 ///    \a __a.
 /// \returns A 256-bit vector of [8 x float] containing the result.
 static __inline__ __m256 __DEFAULT_FN_ATTRS256
-_mm256_permutevar8x32_ps(__m256 __a, __m256i __b) {
+_mm256_permutevar8x32_ps(__m256 __a, __m256i __b)
+{
   return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b);
 }
 
@@ -3317,7 +3364,7 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b) {
 ///    \a M[1:0] specifies the index in \a a for element 0 of the result,
 ///    \a M[3:2] specifies the index for element 1, and so forth.
 /// \returns A 256-bit vector of [4 x i64] containing the result.
-#define _mm256_permute4x64_epi64(V, M)                                         \
+#define _mm256_permute4x64_epi64(V, M) \
   ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M)))
 
 /// Sets each half of the 256-bit result either to zero or to one of the
@@ -3363,7 +3410,7 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b) {
 ///    2: the lower half of \a V2 \n
 ///    3: the upper half of \a V2
 /// \returns A 256-bit integer vector containing the result.
-#define _mm256_permute2x128_si256(V1, V2, M)                                   \
+#define _mm256_permute2x128_si256(V1, V2, M) \
   ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M)))
 
 /// Extracts half of the 256-bit vector \a V to the 128-bit result. If bit 0
@@ -3383,7 +3430,7 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b) {
 /// \param M
 ///    An immediate value specifying which half of \a V to extract.
 /// \returns A 128-bit integer vector containing the result.
-#define _mm256_extracti128_si256(V, M)                                         \
+#define _mm256_extracti128_si256(V, M) \
   ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M)))
 
 /// Copies the 256-bit vector \a V1 to the result, then overwrites half of the
@@ -3406,8 +3453,8 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b) {
 /// \param M
 ///    An immediate value specifying where to put \a V2 in the result.
 /// \returns A 256-bit integer vector containing the result.
-#define _mm256_inserti128_si256(V1, V2, M)                                     \
-  ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1),                \
+#define _mm256_inserti128_si256(V1, V2, M) \
+  ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \
                                          (__v2di)(__m128i)(V2), (int)(M)))
 
 /// Conditionally loads eight 32-bit integer elements from memory \a __X, if
@@ -3437,7 +3484,8 @@ _mm256_permutevar8x32_ps(__m256 __a, __m256i __b) {
 /// \returns A 256-bit vector of [8 x i32] containing the loaded or zeroed
 ///    elements.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskload_epi32(int const *__X, __m256i __M) {
+_mm256_maskload_epi32(int const *__X, __m256i __M)
+{
   return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M);
 }
 
@@ -3468,7 +3516,8 @@ _mm256_maskload_epi32(int const *__X, __m256i __M) {
 /// \returns A 256-bit vector of [4 x i64] containing the loaded or zeroed
 ///    elements.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_maskload_epi64(long long const *__X, __m256i __M) {
+_mm256_maskload_epi64(long long const *__X, __m256i __M)
+{
   return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M);
 }
 
@@ -3499,7 +3548,8 @@ _mm256_maskload_epi64(long long const *__X, __m256i __M) {
 /// \returns A 128-bit vector of [4 x i32] containing the loaded or zeroed
 ///    elements.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskload_epi32(int const *__X, __m128i __M) {
+_mm_maskload_epi32(int const *__X, __m128i __M)
+{
   return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M);
 }
 
@@ -3530,7 +3580,8 @@ _mm_maskload_epi32(int const *__X, __m128i __M) {
 /// \returns A 128-bit vector of [2 x i64] containing the loaded or zeroed
 ///    elements.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_maskload_epi64(long long const *__X, __m128i __M) {
+_mm_maskload_epi64(long long const *__X, __m128i __M)
+{
   return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M);
 }
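
(The masked loads keep their non-constexpr attributes here; this runtime sketch, with an invented helper name, shows the usual sign-bit-selects-lane usage.)

    #include <immintrin.h>

    // Only lanes whose mask element has its sign bit set are read; the rest are zeroed.
    __m256i load_low_four(const int *p) {
      const __m256i mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
      return _mm256_maskload_epi32(p, mask);  // lanes 0..3 from memory, lanes 4..7 zero
    }
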
 
@@ -3559,7 +3610,8 @@ _mm_maskload_epi64(long long const *__X, __m128i __M) {
 /// \param __Y
 ///    A 256-bit vector of [8 x i32] containing the values to store.
 static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) {
+_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y)
+{
   __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y);
 }
 
@@ -3588,7 +3640,8 @@ _mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) {
 /// \param __Y
 ///    A 256-bit vector of [4 x i64] containing the values to store.
 static __inline__ void __DEFAULT_FN_ATTRS256
-_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) {
+_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y)
+{
   __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y);
 }
 
@@ -3616,9 +3669,9 @@ _mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) {
 ///    A 128-bit vector of [4 x i32] containing the mask bits.
 /// \param __Y
 ///    A 128-bit vector of [4 x i32] containing the values to store.
-static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi32(int *__X,
-                                                                 __m128i __M,
-                                                                 __m128i __Y) {
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y)
+{
   __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y);
 }
 
@@ -3646,10 +3699,10 @@ static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi32(int *__X,
 ///    A 128-bit vector of [2 x i64] containing the mask bits.
 /// \param __Y
 ///    A 128-bit vector of [2 x i64] containing the values to store.
-static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi64(long long *__X,
-                                                                 __m128i __M,
-                                                                 __m128i __Y) {
-  __builtin_ia32_maskstoreq((__v2di *)__X, (__v2di)__M, (__v2di)__Y);
+static __inline__ void __DEFAULT_FN_ATTRS128
+_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y)
+{
+  __builtin_ia32_maskstoreq((__v2di *)__X, (__v2di)__M, (__v2di)__Y);
 }
 
 /// Shifts each 32-bit element of the 256-bit vector of [8 x i32] in \a __X
@@ -3669,7 +3722,8 @@ static __inline__ void __DEFAULT_FN_ATTRS128 _mm_maskstore_epi64(long long *__X,
 ///    bits).
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_sllv_epi32(__m256i __X, __m256i __Y) {
+_mm256_sllv_epi32(__m256i __X, __m256i __Y)
+{
   return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y);
 }
 
@@ -3690,7 +3744,8 @@ _mm256_sllv_epi32(__m256i __X, __m256i __Y) {
 ///    bits).
 /// \returns A 128-bit vector of [4 x i32] containing the result.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_sllv_epi32(__m128i __X, __m128i __Y) {
+_mm_sllv_epi32(__m128i __X, __m128i __Y)
+{
   return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y);
 }
 
@@ -3711,7 +3766,8 @@ _mm_sllv_epi32(__m128i __X, __m128i __Y) {
 ///    bits).
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_sllv_epi64(__m256i __X, __m256i __Y) {
+_mm256_sllv_epi64(__m256i __X, __m256i __Y)
+{
   return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y);
 }
 
@@ -3732,7 +3788,8 @@ _mm256_sllv_epi64(__m256i __X, __m256i __Y) {
 ///    bits).
 /// \returns A 128-bit vector of [2 x i64] containing the result.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_sllv_epi64(__m128i __X, __m128i __Y) {
+_mm_sllv_epi64(__m128i __X, __m128i __Y)
+{
   return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y);
 }
 
@@ -3754,7 +3811,8 @@ _mm_sllv_epi64(__m128i __X, __m128i __Y) {
 ///    bits).
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_srav_epi32(__m256i __X, __m256i __Y) {
+_mm256_srav_epi32(__m256i __X, __m256i __Y)
+{
   return (__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y);
 }
 
@@ -3776,7 +3834,8 @@ _mm256_srav_epi32(__m256i __X, __m256i __Y) {
 ///    bits).
 /// \returns A 128-bit vector of [4 x i32] containing the result.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_srav_epi32(__m128i __X, __m128i __Y) {
+_mm_srav_epi32(__m128i __X, __m128i __Y)
+{
   return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y);
 }
 
@@ -3797,7 +3856,8 @@ _mm_srav_epi32(__m128i __X, __m128i __Y) {
 ///    bits).
 /// \returns A 256-bit vector of [8 x i32] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_srlv_epi32(__m256i __X, __m256i __Y) {
+_mm256_srlv_epi32(__m256i __X, __m256i __Y)
+{
   return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y);
 }
 
@@ -3818,7 +3878,8 @@ _mm256_srlv_epi32(__m256i __X, __m256i __Y) {
 ///    bits).
 /// \returns A 128-bit vector of [4 x i32] containing the result.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_srlv_epi32(__m128i __X, __m128i __Y) {
+_mm_srlv_epi32(__m128i __X, __m128i __Y)
+{
   return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y);
 }
 
@@ -3839,7 +3900,8 @@ _mm_srlv_epi32(__m128i __X, __m128i __Y) {
 ///    bits).
 /// \returns A 256-bit vector of [4 x i64] containing the result.
 static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR
-_mm256_srlv_epi64(__m256i __X, __m256i __Y) {
+_mm256_srlv_epi64(__m256i __X, __m256i __Y)
+{
   return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y);
 }
 
@@ -3860,7 +3922,8 @@ _mm256_srlv_epi64(__m256i __X, __m256i __Y) {
 ///    bits).
 /// \returns A 128-bit vector of [2 x i64] containing the result.
 static __inline__ __m128i __DEFAULT_FN_ATTRS128_CONSTEXPR
-_mm_srlv_epi64(__m128i __X, __m128i __Y) {
+_mm_srlv_epi64(__m128i __X, __m128i __Y)
+{
   return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y);
 }
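
(A compile-time sketch of the per-lane variable shifts, which do carry the constexpr attribute in this hunk; assumes _mm256_set_epi32 and _mm256_set1_epi32 are constexpr.)

    #include <immintrin.h>

    // Each 32-bit lane is shifted by its own count; counts of 32 or more produce zero.
    constexpr __m256i counts  = _mm256_set_epi32(32, 7, 6, 5, 4, 3, 2, 1);
    constexpr __m256i shifted = _mm256_sllv_epi32(_mm256_set1_epi32(1), counts);
    static_assert(shifted[0] == (2LL | (4LL << 32)), "lane 0 is 1<<1, lane 1 is 1<<2");
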
 
@@ -3907,10 +3970,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
-#define _mm_mask_i32gather_pd(a, m, i, mask, s)                                \
-  ((__m128d)__builtin_ia32_gatherd_pd(                                         \
-      (__v2df)(__m128i)(a), (double const *)(m), (__v4si)(__m128i)(i),         \
-      (__v2df)(__m128d)(mask), (s)))
+#define _mm_mask_i32gather_pd(a, m, i, mask, s) \
+  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \
+                                      (double const *)(m), \
+                                      (__v4si)(__m128i)(i), \
+                                      (__v2df)(__m128d)(mask), (s)))
 
 /// Conditionally gathers four 64-bit floating-point values, either from the
 ///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
@@ -3954,10 +4018,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
-#define _mm256_mask_i32gather_pd(a, m, i, mask, s)                             \
-  ((__m256d)__builtin_ia32_gatherd_pd256(                                      \
-      (__v4df)(__m256d)(a), (double const *)(m), (__v4si)(__m128i)(i),         \
-      (__v4df)(__m256d)(mask), (s)))
+#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \
+  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \
+                                         (double const *)(m), \
+                                         (__v4si)(__m128i)(i), \
+                                         (__v4df)(__m256d)(mask), (s)))
 
 /// Conditionally gathers two 64-bit floating-point values, either from the
 ///    128-bit vector of [2 x double] in \a a, or from memory \a m using scaled
@@ -4001,10 +4066,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
-#define _mm_mask_i64gather_pd(a, m, i, mask, s)                                \
-  ((__m128d)__builtin_ia32_gatherq_pd(                                         \
-      (__v2df)(__m128d)(a), (double const *)(m), (__v2di)(__m128i)(i),         \
-      (__v2df)(__m128d)(mask), (s)))
+#define _mm_mask_i64gather_pd(a, m, i, mask, s) \
+  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \
+                                      (double const *)(m), \
+                                      (__v2di)(__m128i)(i), \
+                                      (__v2df)(__m128d)(mask), (s)))
 
 /// Conditionally gathers four 64-bit floating-point values, either from the
 ///    256-bit vector of [4 x double] in \a a, or from memory \a m using scaled
@@ -4048,10 +4114,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
-#define _mm256_mask_i64gather_pd(a, m, i, mask, s)                             \
-  ((__m256d)__builtin_ia32_gatherq_pd256(                                      \
-      (__v4df)(__m256d)(a), (double const *)(m), (__v4di)(__m256i)(i),         \
-      (__v4df)(__m256d)(mask), (s)))
+#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \
+  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \
+                                         (double const *)(m), \
+                                         (__v4di)(__m256i)(i), \
+                                         (__v4df)(__m256d)(mask), (s)))
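
(A runtime sketch of how the gather macro above is typically called; the helper name is invented and in-range indexes are assumed.)

    #include <immintrin.h>

    // Gathers table[idx[n]] for every lane (all mask sign bits set); scale 8 = sizeof(double).
    __m256d gather_four(const double *table, __m128i idx) {
      const __m256d src  = _mm256_setzero_pd();
      const __m256d mask = _mm256_castsi256_pd(_mm256_set1_epi64x(-1));
      return _mm256_mask_i32gather_pd(src, table, idx, mask, 8);
    }
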
 
 /// Conditionally gathers four 32-bit floating-point values, either from the
 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
@@ -4095,9 +4162,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm_mask_i32gather_ps(a, m, i, mask, s)                                \
-  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), (float const *)(m),  \
-                                     (__v4si)(__m128i)(i),                     \
+#define _mm_mask_i32gather_ps(a, m, i, mask, s) \
+  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \
+                                     (float const *)(m), \
+                                     (__v4si)(__m128i)(i), \
                                      (__v4sf)(__m128)(mask), (s)))
 
 /// Conditionally gathers eight 32-bit floating-point values, either from the
@@ -4142,10 +4210,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
-#define _mm256_mask_i32gather_ps(a, m, i, mask, s)                             \
-  ((__m256)__builtin_ia32_gatherd_ps256(                                       \
-      (__v8sf)(__m256)(a), (float const *)(m), (__v8si)(__m256i)(i),           \
-      (__v8sf)(__m256)(mask), (s)))
+#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \
+  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \
+                                        (float const *)(m), \
+                                        (__v8si)(__m256i)(i), \
+                                        (__v8sf)(__m256)(mask), (s)))
 
 /// Conditionally gathers two 32-bit floating-point values, either from the
 ///    128-bit vector of [4 x float] in \a a, or from memory \a m using scaled
@@ -4192,9 +4261,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm_mask_i64gather_ps(a, m, i, mask, s)                                \
-  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), (float const *)(m),  \
-                                     (__v2di)(__m128i)(i),                     \
+#define _mm_mask_i64gather_ps(a, m, i, mask, s) \
+  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \
+                                     (float const *)(m), \
+                                     (__v2di)(__m128i)(i), \
                                      (__v4sf)(__m128)(mask), (s)))
 
 /// Conditionally gathers four 32-bit floating-point values, either from the
@@ -4239,10 +4309,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm256_mask_i64gather_ps(a, m, i, mask, s)                             \
-  ((__m128)__builtin_ia32_gatherq_ps256(                                       \
-      (__v4sf)(__m128)(a), (float const *)(m), (__v4di)(__m256i)(i),           \
-      (__v4sf)(__m128)(mask), (s)))
+#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \
+  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \
+                                        (float const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4sf)(__m128)(mask), (s)))
 
 /// Conditionally gathers four 32-bit integer values, either from the
 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
@@ -4286,9 +4357,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm_mask_i32gather_epi32(a, m, i, mask, s)                             \
-  ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), (int const *)(m),   \
-                                     (__v4si)(__m128i)(i),                     \
+#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \
+  ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \
+                                     (int const *)(m), \
+                                     (__v4si)(__m128i)(i), \
                                      (__v4si)(__m128i)(mask), (s)))
 
 /// Conditionally gathers eight 32-bit integer values, either from the
@@ -4333,10 +4405,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
-#define _mm256_mask_i32gather_epi32(a, m, i, mask, s)                          \
-  ((__m256i)__builtin_ia32_gatherd_d256(                                       \
-      (__v8si)(__m256i)(a), (int const *)(m), (__v8si)(__m256i)(i),            \
-      (__v8si)(__m256i)(mask), (s)))
+#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \
+  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \
+                                        (int const *)(m), \
+                                        (__v8si)(__m256i)(i), \
+                                        (__v8si)(__m256i)(mask), (s)))
 
 /// Conditionally gathers two 32-bit integer values, either from the
 ///    128-bit vector of [4 x i32] in \a a, or from memory \a m using scaled
@@ -4383,9 +4456,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm_mask_i64gather_epi32(a, m, i, mask, s)                             \
-  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), (int const *)(m),   \
-                                     (__v2di)(__m128i)(i),                     \
+#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \
+  ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \
+                                     (int const *)(m), \
+                                     (__v2di)(__m128i)(i), \
                                      (__v4si)(__m128i)(mask), (s)))
 
 /// Conditionally gathers four 32-bit integer values, either from the
@@ -4430,10 +4504,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm256_mask_i64gather_epi32(a, m, i, mask, s)                          \
-  ((__m128i)__builtin_ia32_gatherq_d256(                                       \
-      (__v4si)(__m128i)(a), (int const *)(m), (__v4di)(__m256i)(i),            \
-      (__v4si)(__m128i)(mask), (s)))
+#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \
+  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \
+                                        (int const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4si)(__m128i)(mask), (s)))
 
 /// Conditionally gathers two 64-bit integer values, either from the
 ///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
@@ -4478,10 +4553,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
-#define _mm_mask_i32gather_epi64(a, m, i, mask, s)                             \
-  ((__m128i)__builtin_ia32_gatherd_q(                                          \
-      (__v2di)(__m128i)(a), (long long const *)(m), (__v4si)(__m128i)(i),      \
-      (__v2di)(__m128i)(mask), (s)))
+#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \
+  ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \
+                                     (long long const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2di)(__m128i)(mask), (s)))
 
 /// Conditionally gathers four 64-bit integer values, either from the
 ///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
@@ -4525,10 +4601,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
-#define _mm256_mask_i32gather_epi64(a, m, i, mask, s)                          \
-  ((__m256i)__builtin_ia32_gatherd_q256(                                       \
-      (__v4di)(__m256i)(a), (long long const *)(m), (__v4si)(__m128i)(i),      \
-      (__v4di)(__m256i)(mask), (s)))
+#define _mm256_mask_i32gather_epi64(a, m, i, mask, s) \
+  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \
+                                        (long long const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4di)(__m256i)(mask), (s)))
 
 /// Conditionally gathers two 64-bit integer values, either from the
 ///    128-bit vector of [2 x i64] in \a a, or from memory \a m using scaled
@@ -4572,10 +4649,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
-#define _mm_mask_i64gather_epi64(a, m, i, mask, s)                             \
-  ((__m128i)__builtin_ia32_gatherq_q(                                          \
-      (__v2di)(__m128i)(a), (long long const *)(m), (__v2di)(__m128i)(i),      \
-      (__v2di)(__m128i)(mask), (s)))
+#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \
+  ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \
+                                     (long long const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2di)(__m128i)(mask), (s)))
 
 /// Conditionally gathers four 64-bit integer values, either from the
 ///    256-bit vector of [4 x i64] in \a a, or from memory \a m using scaled
@@ -4619,10 +4697,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
-#define _mm256_mask_i64gather_epi64(a, m, i, mask, s)                          \
-  ((__m256i)__builtin_ia32_gatherq_q256(                                       \
-      (__v4di)(__m256i)(a), (long long const *)(m), (__v4di)(__m256i)(i),      \
-      (__v4di)(__m256i)(mask), (s)))
+#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \
+  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \
+                                        (long long const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4di)(__m256i)(mask), (s)))
 
 /// Gathers two 64-bit floating-point values from memory \a m using scaled
 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
@@ -4652,10 +4731,13 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
-#define _mm_i32gather_pd(m, i, s)                                              \
-  ((__m128d)__builtin_ia32_gatherd_pd(                                         \
-      (__v2df)_mm_undefined_pd(), (double const *)(m), (__v4si)(__m128i)(i),   \
-      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))
+#define _mm_i32gather_pd(m, i, s) \
+  ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \
+                                      (double const *)(m), \
+                                      (__v4si)(__m128i)(i), \
+                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                           _mm_setzero_pd()), \
+                                      (s)))
 
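A similar sketch for the unmasked double-precision form, _mm_i32gather_pd, reflowed just above (again illustrative only, not part of the patch). The scale of 8 matches sizeof(double), and only the low two 32-bit indexes are consumed for a [2 x double] gather:

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  double table[8] = {0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5};
  __m128i idx = _mm_setr_epi32(6, 1, 0, 0);   /* upper two indexes are ignored */
  __m128d r = _mm_i32gather_pd(table, idx, 8);
  double out[2];
  _mm_storeu_pd(out, r);
  printf("%g %g\n", out[0], out[1]);  /* prints: 6.5 1.5 */
  return 0;
}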
 /// Gathers four 64-bit floating-point values from memory \a m using scaled
 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
@@ -4684,13 +4766,14 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
-#define _mm256_i32gather_pd(m, i, s)                                           \
-  ((__m256d)__builtin_ia32_gatherd_pd256(                                      \
-      (__v4df)_mm256_undefined_pd(), (double const *)(m),                      \
-      (__v4si)(__m128i)(i),                                                    \
-      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(),          \
-                            _CMP_EQ_OQ),                                       \
-      (s)))
+#define _mm256_i32gather_pd(m, i, s) \
+  ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \
+                                         (double const *)(m), \
+                                         (__v4si)(__m128i)(i), \
+                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                               _mm256_setzero_pd(), \
+                                                               _CMP_EQ_OQ), \
+                                         (s)))
 
 /// Gathers two 64-bit floating-point values from memory \a m using scaled
 ///    indexes from the 128-bit vector of [2 x i64] in \a i.
@@ -4719,10 +4802,13 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x double] containing the gathered values.
-#define _mm_i64gather_pd(m, i, s)                                              \
-  ((__m128d)__builtin_ia32_gatherq_pd(                                         \
-      (__v2df)_mm_undefined_pd(), (double const *)(m), (__v2di)(__m128i)(i),   \
-      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), _mm_setzero_pd()), (s)))
+#define _mm_i64gather_pd(m, i, s) \
+  ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \
+                                      (double const *)(m), \
+                                      (__v2di)(__m128i)(i), \
+                                      (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \
+                                                           _mm_setzero_pd()), \
+                                      (s)))
 
 /// Gathers four 64-bit floating-point values from memory \a m using scaled
 ///    indexes from the 256-bit vector of [4 x i64] in \a i.
@@ -4751,13 +4837,14 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x double] containing the gathered values.
-#define _mm256_i64gather_pd(m, i, s)                                           \
-  ((__m256d)__builtin_ia32_gatherq_pd256(                                      \
-      (__v4df)_mm256_undefined_pd(), (double const *)(m),                      \
-      (__v4di)(__m256i)(i),                                                    \
-      (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), _mm256_setzero_pd(),          \
-                            _CMP_EQ_OQ),                                       \
-      (s)))
+#define _mm256_i64gather_pd(m, i, s) \
+  ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \
+                                         (double const *)(m), \
+                                         (__v4di)(__m256i)(i), \
+                                         (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \
+                                                               _mm256_setzero_pd(), \
+                                                               _CMP_EQ_OQ), \
+                                         (s)))
 
 /// Gathers four 32-bit floating-point values from memory \a m using scaled
 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
@@ -4786,10 +4873,13 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm_i32gather_ps(m, i, s)                                              \
-  ((__m128)__builtin_ia32_gatherd_ps(                                          \
-      (__v4sf)_mm_undefined_ps(), (float const *)(m), (__v4si)(__m128i)(i),    \
-      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
+#define _mm_i32gather_ps(m, i, s) \
+  ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \
+                                     (float const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                          _mm_setzero_ps()), \
+                                     (s)))
 
 /// Gathers eight 32-bit floating-point values from memory \a m using scaled
 ///    indexes from the 256-bit vector of [8 x i32] in \a i.
@@ -4818,12 +4908,14 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [8 x float] containing the gathered values.
-#define _mm256_i32gather_ps(m, i, s)                                           \
-  ((__m256)__builtin_ia32_gatherd_ps256(                                       \
-      (__v8sf)_mm256_undefined_ps(), (float const *)(m), (__v8si)(__m256i)(i), \
-      (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), _mm256_setzero_ps(),          \
-                            _CMP_EQ_OQ),                                       \
-      (s)))
+#define _mm256_i32gather_ps(m, i, s) \
+  ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \
+                                        (float const *)(m), \
+                                        (__v8si)(__m256i)(i), \
+                                        (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \
+                                                              _mm256_setzero_ps(), \
+                                                              _CMP_EQ_OQ), \
+                                        (s)))
 
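The 256-bit single-precision variant, _mm256_i32gather_ps, relies on the all-true mask built from the _mm256_cmp_ps(..., _CMP_EQ_OQ) expression in the macro body above, so every lane is gathered. A small sketch (not part of the patch, assumes -mavx2):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  float table[16];
  for (int k = 0; k < 16; ++k)
    table[k] = (float)k;
  __m256i idx = _mm256_setr_epi32(15, 13, 11, 9, 7, 5, 3, 1);
  __m256 r = _mm256_i32gather_ps(table, idx, 4);  /* scale 4 = sizeof(float) */
  float out[8];
  _mm256_storeu_ps(out, r);
  for (int k = 0; k < 8; ++k)
    printf("%g ", out[k]);  /* prints: 15 13 11 9 7 5 3 1 */
  return 0;
}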
 /// Gathers two 32-bit floating-point values from memory \a m using scaled
 ///    indexes from the 128-bit vector of [2 x i64] in \a i. The upper two
@@ -4854,10 +4946,13 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm_i64gather_ps(m, i, s)                                              \
-  ((__m128)__builtin_ia32_gatherq_ps(                                          \
-      (__v4sf)_mm_undefined_ps(), (float const *)(m), (__v2di)(__m128i)(i),    \
-      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
+#define _mm_i64gather_ps(m, i, s) \
+  ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \
+                                     (float const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                          _mm_setzero_ps()), \
+                                     (s)))
 
 /// Gathers four 32-bit floating-point values from memory \a m using scaled
 ///    indexes from the 256-bit vector of [4 x i64] in \a i.
@@ -4886,10 +4981,13 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x float] containing the gathered values.
-#define _mm256_i64gather_ps(m, i, s)                                           \
-  ((__m128)__builtin_ia32_gatherq_ps256(                                       \
-      (__v4sf)_mm_undefined_ps(), (float const *)(m), (__v4di)(__m256i)(i),    \
-      (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()), (s)))
+#define _mm256_i64gather_ps(m, i, s) \
+  ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \
+                                        (float const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \
+                                                             _mm_setzero_ps()), \
+                                        (s)))
 
 /// Gathers four 32-bit integer values from memory \a m using scaled
 ///    indexes from the 128-bit vector of [4 x i32] in \a i.
@@ -4918,9 +5016,9 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm_i32gather_epi32(m, i, s)                                           \
-  ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(),            \
-                                     (int const *)(m), (__v4si)(__m128i)(i),   \
+#define _mm_i32gather_epi32(m, i, s) \
+  ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \
+                                     (int const *)(m), (__v4si)(__m128i)(i), \
                                      (__v4si)_mm_set1_epi32(-1), (s)))
 
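The plain integer gathers follow the same pattern, with _mm_set1_epi32(-1) serving as the implicit all-lanes mask. For example, for _mm_i32gather_epi32 (illustrative sketch only, not part of the patch):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  int table[8] = {100, 101, 102, 103, 104, 105, 106, 107};
  __m128i idx = _mm_setr_epi32(7, 0, 3, 5);
  __m128i r = _mm_i32gather_epi32(table, idx, 4);  /* scale 4 = sizeof(int) */
  int out[4];
  _mm_storeu_si128((__m128i *)out, r);
  printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  /* prints: 107 100 103 105 */
  return 0;
}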
 /// Gathers eight 32-bit integer values from memory \a m using scaled
@@ -4950,10 +5048,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [8 x i32] containing the gathered values.
-#define _mm256_i32gather_epi32(m, i, s)                                        \
-  ((__m256i)__builtin_ia32_gatherd_d256(                                       \
-      (__v8si)_mm256_undefined_si256(), (int const *)(m),                      \
-      (__v8si)(__m256i)(i), (__v8si)_mm256_set1_epi32(-1), (s)))
+#define _mm256_i32gather_epi32(m, i, s) \
+  ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \
+                                        (int const *)(m), (__v8si)(__m256i)(i), \
+                                        (__v8si)_mm256_set1_epi32(-1), (s)))
 
 /// Gathers two 32-bit integer values from memory \a m using scaled indexes
 ///    from the 128-bit vector of [2 x i64] in \a i. The upper two elements
@@ -4984,9 +5082,9 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm_i64gather_epi32(m, i, s)                                           \
-  ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(),            \
-                                     (int const *)(m), (__v2di)(__m128i)(i),   \
+#define _mm_i64gather_epi32(m, i, s) \
+  ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \
+                                     (int const *)(m), (__v2di)(__m128i)(i), \
                                      (__v4si)_mm_set1_epi32(-1), (s)))
 
 /// Gathers four 32-bit integer values from memory \a m using scaled indexes
@@ -5016,10 +5114,10 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [4 x i32] containing the gathered values.
-#define _mm256_i64gather_epi32(m, i, s)                                        \
-  ((__m128i)__builtin_ia32_gatherq_d256(                                       \
-      (__v4si)_mm_undefined_si128(), (int const *)(m), (__v4di)(__m256i)(i),   \
-      (__v4si)_mm_set1_epi32(-1), (s)))
+#define _mm256_i64gather_epi32(m, i, s) \
+  ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \
+                                        (int const *)(m), (__v4di)(__m256i)(i), \
+                                        (__v4si)_mm_set1_epi32(-1), (s)))
 
 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
 ///    from the 128-bit vector of [4 x i32] in \a i.
@@ -5049,10 +5147,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
-#define _mm_i32gather_epi64(m, i, s)                                           \
-  ((__m128i)__builtin_ia32_gatherd_q(                                          \
-      (__v2di)_mm_undefined_si128(), (long long const *)(m),                   \
-      (__v4si)(__m128i)(i), (__v2di)_mm_set1_epi64x(-1), (s)))
+#define _mm_i32gather_epi64(m, i, s) \
+  ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \
+                                     (long long const *)(m), \
+                                     (__v4si)(__m128i)(i), \
+                                     (__v2di)_mm_set1_epi64x(-1), (s)))
 
 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
 ///    from the 128-bit vector of [4 x i32] in \a i.
@@ -5081,10 +5180,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
-#define _mm256_i32gather_epi64(m, i, s)                                        \
-  ((__m256i)__builtin_ia32_gatherd_q256(                                       \
-      (__v4di)_mm256_undefined_si256(), (long long const *)(m),                \
-      (__v4si)(__m128i)(i), (__v4di)_mm256_set1_epi64x(-1), (s)))
+#define _mm256_i32gather_epi64(m, i, s) \
+  ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \
+                                        (long long const *)(m), \
+                                        (__v4si)(__m128i)(i), \
+                                        (__v4di)_mm256_set1_epi64x(-1), (s)))
 
 /// Gathers two 64-bit integer values from memory \a m using scaled indexes
 ///    from the 128-bit vector of [2 x i64] in \a i.
@@ -5113,10 +5213,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 128-bit vector of [2 x i64] containing the gathered values.
-#define _mm_i64gather_epi64(m, i, s)                                           \
-  ((__m128i)__builtin_ia32_gatherq_q(                                          \
-      (__v2di)_mm_undefined_si128(), (long long const *)(m),                   \
-      (__v2di)(__m128i)(i), (__v2di)_mm_set1_epi64x(-1), (s)))
+#define _mm_i64gather_epi64(m, i, s) \
+  ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \
+                                     (long long const *)(m), \
+                                     (__v2di)(__m128i)(i), \
+                                     (__v2di)_mm_set1_epi64x(-1), (s)))
 
 /// Gathers four 64-bit integer values from memory \a m using scaled indexes
 ///    from the 256-bit vector of [4 x i64] in \a i.
@@ -5145,10 +5246,11 @@ _mm_srlv_epi64(__m128i __X, __m128i __Y) {
 ///    A literal constant scale factor for the indexes in \a i. Must be
 ///    1, 2, 4, or 8.
 /// \returns A 256-bit vector of [4 x i64] containing the gathered values.
-#define _mm256_i64gather_epi64(m, i, s)                                        \
-  ((__m256i)__builtin_ia32_gatherq_q256(                                       \
-      (__v4di)_mm256_undefined_si256(), (long long const *)(m),                \
-      (__v4di)(__m256i)(i), (__v4di)_mm256_set1_epi64x(-1), (s)))
+#define _mm256_i64gather_epi64(m, i, s) \
+  ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \
+                                        (long long const *)(m), \
+                                        (__v4di)(__m256i)(i), \
+                                        (__v4di)_mm256_set1_epi64x(-1), (s)))
 
 #undef __DEFAULT_FN_ATTRS256
 #undef __DEFAULT_FN_ATTRS128
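Finally, a sketch for the widest form touched by this reflow, _mm256_i64gather_epi64, which takes 64-bit indexes and a scale of 8 (illustrative only; build with an AVX2-capable compiler, e.g. clang -O2 -mavx2):

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  long long table[4] = {1000, 2000, 3000, 4000};
  __m256i idx = _mm256_setr_epi64x(3, 2, 1, 0);
  __m256i r = _mm256_i64gather_epi64(table, idx, 8);  /* scale 8 = sizeof(long long) */
  long long out[4];
  _mm256_storeu_si256((__m256i *)out, r);
  printf("%lld %lld %lld %lld\n", out[0], out[1], out[2], out[3]);  /* prints: 4000 3000 2000 1000 */
  return 0;
}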


