r334244 - [X86] Add builtins for shuff32x4/shuff64x2/shufi32x4/shuff64x2 to enable target feature checking and immediate range checking.

Thu Jun 7 16:03:08 PDT 2018

Author: ctopper
Date: Thu Jun  7 16:03:08 2018
New Revision: 334244

URL: http://llvm.org/viewvc/llvm-project?rev=334244&view=rev
Log:
[X86] Add builtins for shuff32x4/shuff64x2/shufi32x4/shuff64x2 to enable target feature checking and immediate range checking.

Modified:
    cfe/trunk/include/clang/Basic/BuiltinsX86.def
    cfe/trunk/lib/CodeGen/CGBuiltin.cpp
    cfe/trunk/lib/Headers/avx512fintrin.h
    cfe/trunk/lib/Headers/avx512vlintrin.h
    cfe/trunk/lib/Sema/SemaChecking.cpp
    cfe/trunk/test/CodeGen/avx512f-builtins.c
    cfe/trunk/test/CodeGen/avx512vl-builtins.c

Modified: cfe/trunk/include/clang/Basic/BuiltinsX86.def
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/include/clang/Basic/BuiltinsX86.def?rev=334244&r1=334243&r2=334244&view=diff
==============================================================================

--- cfe/trunk/include/clang/Basic/BuiltinsX86.def (original)
+++ cfe/trunk/include/clang/Basic/BuiltinsX86.def Thu Jun  7 16:03:08 2018
@@ -1500,6 +1500,14 @@ TARGET_BUILTIN(__builtin_ia32_pternlogq1
 TARGET_BUILTIN(__builtin_ia32_pternlogq128_maskz, "V2LLiV2LLiV2LLiV2LLiIiUc", "nc", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_pternlogq256_mask, "V4LLiV4LLiV4LLiV4LLiIiUc", "nc", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_pternlogq256_maskz, "V4LLiV4LLiV4LLiV4LLiIiUc", "nc", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_shuf_f32x4, "V16fV16fV16fIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_shuf_f64x2, "V8dV8dV8dIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_shuf_i32x4, "V16iV16iV16iIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_shuf_i64x2, "V8LLiV8LLiV8LLiIi", "nc", "avx512f")
+TARGET_BUILTIN(__builtin_ia32_shuf_f32x4_256, "V8fV8fV8fIi", "nc", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_shuf_f64x2_256, "V4dV4dV4dIi", "nc", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_shuf_i32x4_256, "V8iV8iV8iIi", "nc", "avx512vl")
+TARGET_BUILTIN(__builtin_ia32_shuf_i64x2_256, "V4LLiV4LLiV4LLiIi", "nc", "avx512vl")
 TARGET_BUILTIN(__builtin_ia32_sqrtsd_round_mask, "V2dV2dV2dV2dUcIi", "nc", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_sqrtss_round_mask, "V4fV4fV4fV4fUcIi", "nc", "avx512f")
 TARGET_BUILTIN(__builtin_ia32_rsqrt14pd128_mask, "V2dV2dV2dUc", "nc", "avx512vl")

Modified: cfe/trunk/lib/CodeGen/CGBuiltin.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/CodeGen/CGBuiltin.cpp?rev=334244&r1=334243&r2=334244&view=diff
==============================================================================
--- cfe/trunk/lib/CodeGen/CGBuiltin.cpp (original)
+++ cfe/trunk/lib/CodeGen/CGBuiltin.cpp Thu Jun  7 16:03:08 2018
@@ -9291,6 +9291,35 @@ Value *CodeGenFunction::EmitX86BuiltinEx
                                        makeArrayRef(Indices, NumElts),
                                        "valign");
   }
+  case X86::BI__builtin_ia32_shuf_f32x4_256:
+  case X86::BI__builtin_ia32_shuf_f64x2_256:
+  case X86::BI__builtin_ia32_shuf_i32x4_256:
+  case X86::BI__builtin_ia32_shuf_i64x2_256:
+  case X86::BI__builtin_ia32_shuf_f32x4:
+  case X86::BI__builtin_ia32_shuf_f64x2:
+  case X86::BI__builtin_ia32_shuf_i32x4:
+  case X86::BI__builtin_ia32_shuf_i64x2: {
+    unsigned Imm = cast<llvm::ConstantInt>(Ops[2])->getZExtValue();
+    llvm::Type *Ty = Ops[0]->getType();
+    unsigned NumElts = Ty->getVectorNumElements();
+    unsigned NumLanes = Ty->getPrimitiveSizeInBits() == 512 ? 4 : 2;
+    unsigned NumLaneElts = NumElts / NumLanes;
+
+    uint32_t Indices[16];
+    for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+      unsigned Index = (Imm % NumLanes) * NumLaneElts;
+      Imm /= NumLanes; // Discard the bits we just used.
+      if (l >= (NumElts / 2))
+        Index += NumElts; // Switch to other source.
+      for (unsigned i = 0; i != NumLaneElts; ++i) {
+        Indices[l + i] = Index + i;
+      }
+    }
+
+    return Builder.CreateShuffleVector(Ops[0], Ops[1],
+                                       makeArrayRef(Indices, NumElts),
+                                       "shuf");
+  }
 
   case X86::BI__builtin_ia32_vperm2f128_pd256:
   case X86::BI__builtin_ia32_vperm2f128_ps256:

Modified: cfe/trunk/lib/Headers/avx512fintrin.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/avx512fintrin.h?rev=334244&r1=334243&r2=334244&view=diff
==============================================================================
--- cfe/trunk/lib/Headers/avx512fintrin.h (original)
+++ cfe/trunk/lib/Headers/avx512fintrin.h Thu Jun  7 16:03:08 2018
@@ -6829,24 +6829,8 @@ _mm512_maskz_srai_epi64(__mmask8 __U, __
 }
 
 #define _mm512_shuffle_f32x4(A, B, imm) \
-  (__m512)__builtin_shufflevector((__v16sf)(__m512)(A), \
-                                  (__v16sf)(__m512)(B), \
-                                  0 + ((((imm) >> 0) & 0x3) * 4), \
-                                  1 + ((((imm) >> 0) & 0x3) * 4), \
-                                  2 + ((((imm) >> 0) & 0x3) * 4), \
-                                  3 + ((((imm) >> 0) & 0x3) * 4), \
-                                  0 + ((((imm) >> 2) & 0x3) * 4), \
-                                  1 + ((((imm) >> 2) & 0x3) * 4), \
-                                  2 + ((((imm) >> 2) & 0x3) * 4), \
-                                  3 + ((((imm) >> 2) & 0x3) * 4), \
-                                  16 + ((((imm) >> 4) & 0x3) * 4), \
-                                  17 + ((((imm) >> 4) & 0x3) * 4), \
-                                  18 + ((((imm) >> 4) & 0x3) * 4), \
-                                  19 + ((((imm) >> 4) & 0x3) * 4), \
-                                  16 + ((((imm) >> 6) & 0x3) * 4), \
-                                  17 + ((((imm) >> 6) & 0x3) * 4), \
-                                  18 + ((((imm) >> 6) & 0x3) * 4), \
-                                  19 + ((((imm) >> 6) & 0x3) * 4))
+  (__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \
+                                    (__v16sf)(__m512)(B), (int)(imm))
 
 #define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \
   (__m512)__builtin_ia32_selectps_512((__mmask16)(U), \
@@ -6859,16 +6843,8 @@ _mm512_maskz_srai_epi64(__mmask8 __U, __
                                       (__v16sf)_mm512_setzero_ps())
 
 #define _mm512_shuffle_f64x2(A, B, imm) \
-  (__m512d)__builtin_shufflevector((__v8df)(__m512d)(A), \
-                                   (__v8df)(__m512d)(B), \
-                                   0 + ((((imm) >> 0) & 0x3) * 2), \
-                                   1 + ((((imm) >> 0) & 0x3) * 2), \
-                                   0 + ((((imm) >> 2) & 0x3) * 2), \
-                                   1 + ((((imm) >> 2) & 0x3) * 2), \
-                                   8 + ((((imm) >> 4) & 0x3) * 2), \
-                                   9 + ((((imm) >> 4) & 0x3) * 2), \
-                                   8 + ((((imm) >> 6) & 0x3) * 2), \
-                                   9 + ((((imm) >> 6) & 0x3) * 2))
+  (__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \
+                                     (__v8df)(__m512d)(B), (int)(imm))
 
 #define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \
   (__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \
@@ -6881,16 +6857,8 @@ _mm512_maskz_srai_epi64(__mmask8 __U, __
                                        (__v8df)_mm512_setzero_pd())
 
 #define _mm512_shuffle_i32x4(A, B, imm) \
-  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
-                                   (__v8di)(__m512i)(B), \
-                                   0 + ((((imm) >> 0) & 0x3) * 2), \
-                                   1 + ((((imm) >> 0) & 0x3) * 2), \
-                                   0 + ((((imm) >> 2) & 0x3) * 2), \
-                                   1 + ((((imm) >> 2) & 0x3) * 2), \
-                                   8 + ((((imm) >> 4) & 0x3) * 2), \
-                                   9 + ((((imm) >> 4) & 0x3) * 2), \
-                                   8 + ((((imm) >> 6) & 0x3) * 2), \
-                                   9 + ((((imm) >> 6) & 0x3) * 2))
+  (__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \
+                                     (__v16si)(__m512i)(B), (int)(imm))
 
 #define _mm512_mask_shuffle_i32x4(W, U, A, B, imm) \
   (__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \
@@ -6903,16 +6871,8 @@ _mm512_maskz_srai_epi64(__mmask8 __U, __
                                       (__v16si)_mm512_setzero_si512())
 
 #define _mm512_shuffle_i64x2(A, B, imm) \
-  (__m512i)__builtin_shufflevector((__v8di)(__m512i)(A), \
-                                   (__v8di)(__m512i)(B), \
-                                   0 + ((((imm) >> 0) & 0x3) * 2), \
-                                   1 + ((((imm) >> 0) & 0x3) * 2), \
-                                   0 + ((((imm) >> 2) & 0x3) * 2), \
-                                   1 + ((((imm) >> 2) & 0x3) * 2), \
-                                   8 + ((((imm) >> 4) & 0x3) * 2), \
-                                   9 + ((((imm) >> 4) & 0x3) * 2), \
-                                   8 + ((((imm) >> 6) & 0x3) * 2), \
-                                   9 + ((((imm) >> 6) & 0x3) * 2))
+  (__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \
+                                     (__v8di)(__m512i)(B), (int)(imm))
 
 #define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \
   (__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \

Modified: cfe/trunk/lib/Headers/avx512vlintrin.h
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Headers/avx512vlintrin.h?rev=334244&r1=334243&r2=334244&view=diff
==============================================================================
--- cfe/trunk/lib/Headers/avx512vlintrin.h (original)
+++ cfe/trunk/lib/Headers/avx512vlintrin.h Thu Jun  7 16:03:08 2018
@@ -6465,16 +6465,8 @@ _mm256_maskz_srai_epi64(__mmask8 __U, __
 
 
 #define _mm256_shuffle_f32x4(A, B, imm) \
-  (__m256)__builtin_shufflevector((__v8sf)(__m256)(A), \
-                                  (__v8sf)(__m256)(B), \
-                                   0 + ((((imm) >> 0) & 0x1) * 4), \
-                                   1 + ((((imm) >> 0) & 0x1) * 4), \
-                                   2 + ((((imm) >> 0) & 0x1) * 4), \
-                                   3 + ((((imm) >> 0) & 0x1) * 4), \
-                                   8 + ((((imm) >> 1) & 0x1) * 4), \
-                                   9 + ((((imm) >> 1) & 0x1) * 4), \
-                                   10 + ((((imm) >> 1) & 0x1) * 4), \
-                                   11 + ((((imm) >> 1) & 0x1) * 4))                                  
+  (__m256)__builtin_ia32_shuf_f32x4_256((__v8sf)(__m256)(A), \
+                                        (__v8sf)(__m256)(B), (int)(imm))
 
 #define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) \
   (__m256)__builtin_ia32_selectps_256((__mmask8)(U), \
@@ -6487,12 +6479,8 @@ _mm256_maskz_srai_epi64(__mmask8 __U, __
                                       (__v8sf)_mm256_setzero_ps())
 
 #define _mm256_shuffle_f64x2(A, B, imm) \
-  (__m256d)__builtin_shufflevector((__v4df)(__m256d)(A), \
-                                   (__v4df)(__m256d)(B), \
-                                   0 + ((((imm) >> 0) & 0x1) * 2), \
-                                   1 + ((((imm) >> 0) & 0x1) * 2), \
-                                   4 + ((((imm) >> 1) & 0x1) * 2), \
-                                   5 + ((((imm) >> 1) & 0x1) * 2))
+  (__m256d)__builtin_ia32_shuf_f64x2_256((__v4df)(__m256d)(A), \
+                                         (__v4df)(__m256d)(B), (int)(imm))
 
 #define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) \
   (__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \
@@ -6505,12 +6493,8 @@ _mm256_maskz_srai_epi64(__mmask8 __U, __
                                       (__v4df)_mm256_setzero_pd())
 
 #define _mm256_shuffle_i32x4(A, B, imm) \
-  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(A), \
-                                   (__v4di)(__m256i)(B), \
-                                   0 + ((((imm) >> 0) & 0x1) * 2), \
-                                   1 + ((((imm) >> 0) & 0x1) * 2), \
-                                   4 + ((((imm) >> 1) & 0x1) * 2), \
-                                   5 + ((((imm) >> 1) & 0x1) * 2))
+  (__m256i)__builtin_ia32_shuf_i32x4_256((__v8si)(__m256i)(A), \
+                                         (__v8si)(__m256i)(B), (int)(imm))
 
 #define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) \
   (__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \
@@ -6523,12 +6507,8 @@ _mm256_maskz_srai_epi64(__mmask8 __U, __
                                       (__v8si)_mm256_setzero_si256())
 
 #define _mm256_shuffle_i64x2(A, B, imm) \
-  (__m256i)__builtin_shufflevector((__v4di)(__m256i)(A), \
-                                   (__v4di)(__m256i)(B), \
-                                   0 + ((((imm) >> 0) & 0x1) * 2), \
-                                   1 + ((((imm) >> 0) & 0x1) * 2), \
-                                   4 + ((((imm) >> 1) & 0x1) * 2), \
-                                   5 + ((((imm) >> 1) & 0x1) * 2))
+  (__m256i)__builtin_ia32_shuf_i64x2_256((__v4di)(__m256i)(A), \
+                                         (__v4di)(__m256i)(B), (int)(imm))
 
 #define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) \
   (__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \

Modified: cfe/trunk/lib/Sema/SemaChecking.cpp
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/lib/Sema/SemaChecking.cpp?rev=334244&r1=334243&r2=334244&view=diff
==============================================================================
--- cfe/trunk/lib/Sema/SemaChecking.cpp (original)
+++ cfe/trunk/lib/Sema/SemaChecking.cpp Thu Jun  7 16:03:08 2018
@@ -2627,6 +2627,10 @@ bool Sema::CheckX86BuiltinFunctionCall(u
   case X86::BI__builtin_ia32_vec_set_v4hi:
   case X86::BI__builtin_ia32_vec_set_v4si:
   case X86::BI__builtin_ia32_vec_set_v4di:
+  case X86::BI__builtin_ia32_shuf_f32x4_256:
+  case X86::BI__builtin_ia32_shuf_f64x2_256:
+  case X86::BI__builtin_ia32_shuf_i32x4_256:
+  case X86::BI__builtin_ia32_shuf_i64x2_256:
     i = 2; l = 0; u = 3;
     break;
   case X86::BI__builtin_ia32_vpermil2pd:
@@ -2761,6 +2765,10 @@ bool Sema::CheckX86BuiltinFunctionCall(u
   case X86::BI__builtin_ia32_alignq256:
   case X86::BI__builtin_ia32_vcomisd:
   case X86::BI__builtin_ia32_vcomiss:
+  case X86::BI__builtin_ia32_shuf_f32x4:
+  case X86::BI__builtin_ia32_shuf_f64x2:
+  case X86::BI__builtin_ia32_shuf_i32x4:
+  case X86::BI__builtin_ia32_shuf_i64x2:
   case X86::BI__builtin_ia32_dbpsadbw128_mask:
   case X86::BI__builtin_ia32_dbpsadbw256_mask:
   case X86::BI__builtin_ia32_dbpsadbw512_mask:

Modified: cfe/trunk/test/CodeGen/avx512f-builtins.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/avx512f-builtins.c?rev=334244&r1=334243&r2=334244&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/avx512f-builtins.c (original)
+++ cfe/trunk/test/CodeGen/avx512f-builtins.c Thu Jun  7 16:03:08 2018
@@ -4860,20 +4860,20 @@ __m512d test_mm512_maskz_shuffle_f64x2(_
 
 __m512i test_mm512_shuffle_i32x4(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_shuffle_i32x4
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
   return _mm512_shuffle_i32x4(__A, __B, 4); 
 }
 
 __m512i test_mm512_mask_shuffle_i32x4(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_shuffle_i32x4
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_mask_shuffle_i32x4(__W, __U, __A, __B, 4); 
 }
 
 __m512i test_mm512_maskz_shuffle_i32x4(__mmask16 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_shuffle_i32x4
-  // CHECK: shufflevector <8 x i64> %{{.*}}, <8 x i64> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 8, i32 9>
+  // CHECK: shufflevector <16 x i32> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
   // CHECK: select <16 x i1> %{{.*}}, <16 x i32> %{{.*}}, <16 x i32> %{{.*}}
   return _mm512_maskz_shuffle_i32x4(__U, __A, __B, 4); 
 }

Modified: cfe/trunk/test/CodeGen/avx512vl-builtins.c
URL: http://llvm.org/viewvc/llvm-project/cfe/trunk/test/CodeGen/avx512vl-builtins.c?rev=334244&r1=334243&r2=334244&view=diff
==============================================================================
--- cfe/trunk/test/CodeGen/avx512vl-builtins.c (original)
+++ cfe/trunk/test/CodeGen/avx512vl-builtins.c Thu Jun  7 16:03:08 2018
@@ -6071,20 +6071,20 @@ __m256d test_mm256_maskz_shuffle_f64x2(_
 
 __m256i test_mm256_shuffle_i32x4(__m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_shuffle_i32x4
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   return _mm256_shuffle_i32x4(__A, __B, 3); 
 }
 
 __m256i test_mm256_mask_shuffle_i32x4(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_mask_shuffle_i32x4
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_mask_shuffle_i32x4(__W, __U, __A, __B, 3); 
 }
 
 __m256i test_mm256_maskz_shuffle_i32x4(__mmask8 __U, __m256i __A, __m256i __B) {
   // CHECK-LABEL: @test_mm256_maskz_shuffle_i32x4
-  // CHECK: shufflevector <4 x i64> %{{.*}}, <4 x i64> %{{.*}}, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
+  // CHECK: shufflevector <8 x i32> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 12, i32 13, i32 14, i32 15>
   // CHECK: select <8 x i1> %{{.*}}, <8 x i32> %{{.*}}, <8 x i32> %{{.*}}
   return _mm256_maskz_shuffle_i32x4(__U, __A, __B, 3); 
 }