[llvm-commits] [llvm-gcc-4.2] r76819 - in /llvm-gcc-4.2/trunk: build_gcc gcc/config.gcc gcc/config/arm/arm.c gcc/config/arm/arm.h gcc/config/arm/arm_neon.h gcc/config/arm/llvm-arm.cpp gcc/config/arm/neon-gen.ml gcc/config/darwin.h gcc/llvm-types.cpp

Bob Wilson bob.wilson at apple.com
Wed Jul 22 16:35:23 PDT 2009


Author: bwilson
Date: Wed Jul 22 18:35:22 2009
New Revision: 76819

URL: http://llvm.org/viewvc/llvm-project?rev=76819&view=rev
Log:
Merge llvm-gcc changes for ARM v7 support.
Yeah!  All done merging now....
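In brief: build_gcc now requires an armv7 slice in libSystem before using / as
the ARM sysroot; config.gcc points the ARM-family targets at the new
gcc/config/arm/llvm-arm.cpp via out_cxx_file; arm.c tags every
neon_builtin_data entry with a new neon_builtins enum value and exports
locate_neon_builtin_icode; arm.h defines that enum, adds the "neon" target
feature, and hooks up LLVM_TARGET_INTRINSIC_LOWER; and arm_neon.h (generated
by neon-gen.ml) switches from inline functions to preprocessor macros.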

Added:
    llvm-gcc-4.2/trunk/gcc/config/arm/llvm-arm.cpp
Modified:
    llvm-gcc-4.2/trunk/build_gcc
    llvm-gcc-4.2/trunk/gcc/config.gcc
    llvm-gcc-4.2/trunk/gcc/config/arm/arm.c
    llvm-gcc-4.2/trunk/gcc/config/arm/arm.h
    llvm-gcc-4.2/trunk/gcc/config/arm/arm_neon.h
    llvm-gcc-4.2/trunk/gcc/config/arm/neon-gen.ml
    llvm-gcc-4.2/trunk/gcc/config/darwin.h
    llvm-gcc-4.2/trunk/gcc/llvm-types.cpp

Modified: llvm-gcc-4.2/trunk/build_gcc
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/build_gcc?rev=76819&r1=76818&r2=76819&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/build_gcc (original)
+++ llvm-gcc-4.2/trunk/build_gcc Wed Jul 22 18:35:22 2009
@@ -141,7 +141,7 @@
 
 # ARM may require a sysroot option.  Check if libSystem has an ARM slice
 # as an indication of whether the files in / support ARM.
-if lipo -info /usr/lib/libSystem.dylib | grep arm; then
+if lipo -info /usr/lib/libSystem.dylib | grep armv7; then
   ARM_SYSROOT=/
 else
   if [ -d $ARM_EXTRA_SDK ]; then
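
The tightened pattern means a generic arm slice in libSystem no longer
qualifies / as the ARM sysroot; with the v7 merge the files in / must support
armv7, and the build otherwise falls back to the $ARM_EXTRA_SDK check above.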

Modified: llvm-gcc-4.2/trunk/gcc/config.gcc
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config.gcc?rev=76819&r1=76818&r2=76819&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config.gcc (original)
+++ llvm-gcc-4.2/trunk/gcc/config.gcc Wed Jul 22 18:35:22 2009
@@ -257,23 +257,35 @@
 	;;
 strongarm*-*-*)
 	cpu_type=arm
+# LLVM LOCAL begin
+	out_cxx_file=arm/llvm-arm.cpp
+# LLVM LOCAL end                               
 	;;
 arm*-*-*)
 	cpu_type=arm
         # APPLE LOCAL ARM v7 support, merge from Codesourcery.
 	extra_headers="mmintrin.h arm_neon.h"
+# LLVM LOCAL begin
+	out_cxx_file=arm/llvm-arm.cpp
+# LLVM LOCAL end                               
 	;;
 bfin*-*)
 	cpu_type=bfin
 	;;
 ep9312*-*-*)
 	cpu_type=arm
+# LLVM LOCAL begin
+	out_cxx_file=arm/llvm-arm.cpp
+# LLVM LOCAL end                               
 	;;
 frv*)	cpu_type=frv
 	;;
 xscale-*-*)
 	cpu_type=arm
 	extra_headers="mmintrin.h"
+# LLVM LOCAL begin
+	out_cxx_file=arm/llvm-arm.cpp
+# LLVM LOCAL end                               
 	;;
 # APPLE LOCAL begin mainline
 i[34567]86-*-*)
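
All four ARM-family cases above (strongarm, arm, ep9312, xscale) now set
out_cxx_file to arm/llvm-arm.cpp; that is presumably the hook by which the
new C++ lowering code gets built into the compiler for every ARM
configuration.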

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/arm.c
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/arm.c?rev=76819&r1=76818&r2=76819&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/arm.c (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/arm.c Wed Jul 22 18:35:22 2009
@@ -16320,45 +16320,63 @@
   const enum insn_code codes[T_MAX];
   const unsigned int num_vars;
   unsigned int base_fcode;
+  /* LLVM LOCAL begin */
+  /* Map each entry to the corresponding neon_builtins enum value.
+     GCC does not make it easy to identify NEON builtins, but LLVM
+     needs to translate them to intrinsics.  */
+  enum neon_builtins neon_code;
+  /* LLVM LOCAL end */
 } neon_builtin_datum;
 
 #define CF(N,X) CODE_FOR_neon_##N##X
 
+/* LLVM LOCAL begin Add initializers for neon_code field.  */
 #define VAR1(T, N, A) \
-  #N, NEON_##T, UP (A), { CF (N, A) }, 1, 0
+  #N, NEON_##T, UP (A), { CF (N, A) }, 1, 0, \
+  NEON_BUILTIN_##N
 #define VAR2(T, N, A, B) \
-  #N, NEON_##T, UP (A) | UP (B), { CF (N, A), CF (N, B) }, 2, 0
+  #N, NEON_##T, UP (A) | UP (B), { CF (N, A), CF (N, B) }, 2, 0, \
+  NEON_BUILTIN_##N
 #define VAR3(T, N, A, B, C) \
   #N, NEON_##T, UP (A) | UP (B) | UP (C), \
-  { CF (N, A), CF (N, B), CF (N, C) }, 3, 0
+  { CF (N, A), CF (N, B), CF (N, C) }, 3, 0, \
+  NEON_BUILTIN_##N
 #define VAR4(T, N, A, B, C, D) \
   #N, NEON_##T, UP (A) | UP (B) | UP (C) | UP (D), \
-  { CF (N, A), CF (N, B), CF (N, C), CF (N, D) }, 4, 0
+  { CF (N, A), CF (N, B), CF (N, C), CF (N, D) }, 4, 0, \
+  NEON_BUILTIN_##N
 #define VAR5(T, N, A, B, C, D, E) \
   #N, NEON_##T, UP (A) | UP (B) | UP (C) | UP (D) | UP (E), \
-  { CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E) }, 5, 0
+  { CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E) }, 5, 0, \
+  NEON_BUILTIN_##N
 #define VAR6(T, N, A, B, C, D, E, F) \
   #N, NEON_##T, UP (A) | UP (B) | UP (C) | UP (D) | UP (E) | UP (F), \
-  { CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E), CF (N, F) }, 6, 0
+  { CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E), CF (N, F) }, 6, 0, \
+  NEON_BUILTIN_##N
 #define VAR7(T, N, A, B, C, D, E, F, G) \
   #N, NEON_##T, UP (A) | UP (B) | UP (C) | UP (D) | UP (E) | UP (F) | UP (G), \
   { CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E), CF (N, F), \
-    CF (N, G) }, 7, 0
+    CF (N, G) }, 7, 0, \
+  NEON_BUILTIN_##N
 #define VAR8(T, N, A, B, C, D, E, F, G, H) \
   #N, NEON_##T, UP (A) | UP (B) | UP (C) | UP (D) | UP (E) | UP (F) | UP (G) \
                 | UP (H), \
   { CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E), CF (N, F), \
-    CF (N, G), CF (N, H) }, 8, 0
+    CF (N, G), CF (N, H) }, 8, 0, \
+  NEON_BUILTIN_##N
 #define VAR9(T, N, A, B, C, D, E, F, G, H, I) \
   #N, NEON_##T, UP (A) | UP (B) | UP (C) | UP (D) | UP (E) | UP (F) | UP (G) \
                 | UP (H) | UP (I), \
   { CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E), CF (N, F), \
-    CF (N, G), CF (N, H), CF (N, I) }, 9, 0
+    CF (N, G), CF (N, H), CF (N, I) }, 9, 0, \
+  NEON_BUILTIN_##N
 #define VAR10(T, N, A, B, C, D, E, F, G, H, I, J) \
   #N, NEON_##T, UP (A) | UP (B) | UP (C) | UP (D) | UP (E) | UP (F) | UP (G) \
                 | UP (H) | UP (I) | UP (J), \
   { CF (N, A), CF (N, B), CF (N, C), CF (N, D), CF (N, E), CF (N, F), \
-    CF (N, G), CF (N, H), CF (N, I), CF (N, J) }, 10, 0
+    CF (N, G), CF (N, H), CF (N, I), CF (N, J) }, 10, 0, \
+  NEON_BUILTIN_##N
+/* LLVM LOCAL end Add initializers for neon_code field.  */
 
 /* The mode entries in the following table correspond to the "key" type of the
    instruction variant, i.e. equivalent to that which would be specified after
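
The net effect of the VARn changes is that every table entry now carries a
NEON_BUILTIN_<name> tag formed from the same name token already used for the
insn codes. A minimal standalone sketch of the pattern (names simplified;
the real table has more fields):

  #include <stdio.h>

  enum neon_builtins { NEON_BUILTIN_vadd, NEON_BUILTIN_vmul, NEON_BUILTIN_MAX };

  typedef struct {
    const char *name;
    enum neon_builtins neon_code;  /* analogous to the new trailing field */
  } neon_builtin_datum;

  /* Like the real VARn macros, derive both the string and the enum tag
     from the one name token.  */
  #define VAR1(N) { #N, NEON_BUILTIN_##N }

  static const neon_builtin_datum neon_builtin_data[] = {
    VAR1 (vadd),
    VAR1 (vmul),
  };

  int main (void)
  {
    /* A consumer holding only a table entry can recover the builtin's
       identity from the tag, without decoding GCC insn codes.  */
    printf ("%s -> %d\n", neon_builtin_data[1].name,
            (int) neon_builtin_data[1].neon_code);
    return 0;
  }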
@@ -19162,8 +19180,13 @@
     return 1;
 }
 
-static enum insn_code
-locate_neon_builtin_icode (int fcode, neon_itype *itype)
+/* LLVM LOCAL begin
+   Added neon_code argument below and made the function
+   non-static.  This is needed when translating Neon builtins to LLVM.  */
+enum insn_code
+locate_neon_builtin_icode (int fcode, neon_itype *itype,
+                           enum neon_builtins *neon_code)
+/* LLVM LOCAL end */ 
 {
   neon_builtin_datum key, *found;
   int idx;
@@ -19178,6 +19201,11 @@
   if (itype)
     *itype = found->itype;
 
+  /* LLVM LOCAL begin */
+  if (neon_code)
+    *neon_code = found->neon_code;
+  /* LLVM LOCAL end */
+
   return found->codes[idx];
 }
 
@@ -19318,7 +19346,8 @@
 arm_expand_neon_builtin (rtx target, int fcode, tree arglist)
 {
   neon_itype itype;
-  enum insn_code icode = locate_neon_builtin_icode (fcode, &itype);
+  /* LLVM LOCAL Added 0 argument to following call.  */
+  enum insn_code icode = locate_neon_builtin_icode (fcode, &itype, 0);
   
   switch (itype)
     {
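
Since locate_neon_builtin_icode is now extern, the expected consumer is the
new llvm-arm.cpp (added by this commit but not quoted here); a hedged
fragment of what such a caller could look like:

  /* Sketch only -- the actual code lives in llvm-arm.cpp.  */
  neon_itype itype;
  enum neon_builtins neon_code;
  enum insn_code icode = locate_neon_builtin_icode (fcode, &itype, &neon_code);
  /* ... select the corresponding arm.* LLVM intrinsic from neon_code ... */

Existing RTL-expansion callers such as arm_expand_neon_builtin above simply
pass 0 for the new argument.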

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/arm.h
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/arm.h?rev=76819&r1=76818&r2=76819&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/arm.h (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/arm.h Wed Jul 22 18:35:22 2009
@@ -3226,6 +3226,165 @@
 
 /* LLVM LOCAL begin */
 #ifdef ENABLE_LLVM
+
+/* Define a static enumeration of the NEON builtins to be used when
+   converting to LLVM intrinsics.  These names are derived from the
+   neon_builtin_data table in arm.c and should be kept in sync with that.  */
+
+enum neon_builtins
+{
+  NEON_BUILTIN_vadd,
+  NEON_BUILTIN_vaddl,
+  NEON_BUILTIN_vaddw,
+  NEON_BUILTIN_vhadd,
+  NEON_BUILTIN_vqadd,
+  NEON_BUILTIN_vaddhn,
+  NEON_BUILTIN_vmul,
+  NEON_BUILTIN_vmla,
+  NEON_BUILTIN_vmlal,
+  NEON_BUILTIN_vmls,
+  NEON_BUILTIN_vmlsl,
+  NEON_BUILTIN_vqdmulh,
+  NEON_BUILTIN_vqdmlal,
+  NEON_BUILTIN_vqdmlsl,
+  NEON_BUILTIN_vmull,
+  NEON_BUILTIN_vmull_n,
+  NEON_BUILTIN_vmull_lane,
+  NEON_BUILTIN_vqdmull_n,
+  NEON_BUILTIN_vqdmull_lane,
+  NEON_BUILTIN_vqdmulh_n,
+  NEON_BUILTIN_vqdmulh_lane,
+  NEON_BUILTIN_vqdmull,
+  NEON_BUILTIN_vshl,
+  NEON_BUILTIN_vqshl,
+  NEON_BUILTIN_vshr_n,
+  NEON_BUILTIN_vshrn_n,
+  NEON_BUILTIN_vqshrn_n,
+  NEON_BUILTIN_vqshrun_n,
+  NEON_BUILTIN_vshl_n,
+  NEON_BUILTIN_vqshl_n,
+  NEON_BUILTIN_vqshlu_n,
+  NEON_BUILTIN_vshll_n,
+  NEON_BUILTIN_vsra_n,
+  NEON_BUILTIN_vsub,
+  NEON_BUILTIN_vsubl,
+  NEON_BUILTIN_vsubw,
+  NEON_BUILTIN_vqsub,
+  NEON_BUILTIN_vhsub,
+  NEON_BUILTIN_vsubhn,
+  NEON_BUILTIN_vceq,
+  NEON_BUILTIN_vcge,
+  NEON_BUILTIN_vcgt,
+  NEON_BUILTIN_vcage,
+  NEON_BUILTIN_vcagt,
+  NEON_BUILTIN_vtst,
+  NEON_BUILTIN_vabd,
+  NEON_BUILTIN_vabdl,
+  NEON_BUILTIN_vaba,
+  NEON_BUILTIN_vabal,
+  NEON_BUILTIN_vmax,
+  NEON_BUILTIN_vmin,
+  NEON_BUILTIN_vpadd,
+  NEON_BUILTIN_vpaddl,
+  NEON_BUILTIN_vpadal,
+  NEON_BUILTIN_vpmax,
+  NEON_BUILTIN_vpmin,
+  NEON_BUILTIN_vrecps,
+  NEON_BUILTIN_vrsqrts,
+  NEON_BUILTIN_vsri_n,
+  NEON_BUILTIN_vsli_n,
+  NEON_BUILTIN_vabs,
+  NEON_BUILTIN_vqabs,
+  NEON_BUILTIN_vneg,
+  NEON_BUILTIN_vqneg,
+  NEON_BUILTIN_vcls,
+  NEON_BUILTIN_vclz,
+  NEON_BUILTIN_vcnt,
+  NEON_BUILTIN_vrecpe,
+  NEON_BUILTIN_vrsqrte,
+  NEON_BUILTIN_vmvn,
+  NEON_BUILTIN_vget_lane,
+  NEON_BUILTIN_vset_lane,
+  NEON_BUILTIN_vcreate,
+  NEON_BUILTIN_vdup_n,
+  NEON_BUILTIN_vdup_lane,
+  NEON_BUILTIN_vcombine,
+  NEON_BUILTIN_vget_high,
+  NEON_BUILTIN_vget_low,
+  NEON_BUILTIN_vmovn,
+  NEON_BUILTIN_vqmovn,
+  NEON_BUILTIN_vqmovun,
+  NEON_BUILTIN_vmovl,
+  NEON_BUILTIN_vmul_lane,
+  NEON_BUILTIN_vmla_lane,
+  NEON_BUILTIN_vmlal_lane,
+  NEON_BUILTIN_vqdmlal_lane,
+  NEON_BUILTIN_vmls_lane,
+  NEON_BUILTIN_vmlsl_lane,
+  NEON_BUILTIN_vqdmlsl_lane,
+  NEON_BUILTIN_vmul_n,
+  NEON_BUILTIN_vmla_n,
+  NEON_BUILTIN_vmlal_n,
+  NEON_BUILTIN_vqdmlal_n,
+  NEON_BUILTIN_vmls_n,
+  NEON_BUILTIN_vmlsl_n,
+  NEON_BUILTIN_vqdmlsl_n,
+  NEON_BUILTIN_vext,
+  NEON_BUILTIN_vrev64,
+  NEON_BUILTIN_vrev32,
+  NEON_BUILTIN_vrev16,
+  NEON_BUILTIN_vcvt,
+  NEON_BUILTIN_vcvt_n,
+  NEON_BUILTIN_vbsl,
+  NEON_BUILTIN_vtbl1,
+  NEON_BUILTIN_vtbl2,
+  NEON_BUILTIN_vtbl3,
+  NEON_BUILTIN_vtbl4,
+  NEON_BUILTIN_vtbx1,
+  NEON_BUILTIN_vtbx2,
+  NEON_BUILTIN_vtbx3,
+  NEON_BUILTIN_vtbx4,
+  NEON_BUILTIN_vtrn,
+  NEON_BUILTIN_vzip,
+  NEON_BUILTIN_vuzp,
+  NEON_BUILTIN_vreinterpretv8qi,
+  NEON_BUILTIN_vreinterpretv4hi,
+  NEON_BUILTIN_vreinterpretv2si,
+  NEON_BUILTIN_vreinterpretv2sf,
+  NEON_BUILTIN_vreinterpretdi,
+  NEON_BUILTIN_vreinterpretv16qi,
+  NEON_BUILTIN_vreinterpretv8hi,
+  NEON_BUILTIN_vreinterpretv4si,
+  NEON_BUILTIN_vreinterpretv4sf,
+  NEON_BUILTIN_vreinterpretv2di,
+  NEON_BUILTIN_vld1,
+  NEON_BUILTIN_vld1_lane,
+  NEON_BUILTIN_vld1_dup,
+  NEON_BUILTIN_vst1,
+  NEON_BUILTIN_vst1_lane,
+  NEON_BUILTIN_vld2,
+  NEON_BUILTIN_vld2_lane,
+  NEON_BUILTIN_vld2_dup,
+  NEON_BUILTIN_vst2,
+  NEON_BUILTIN_vst2_lane,
+  NEON_BUILTIN_vld3,
+  NEON_BUILTIN_vld3_lane,
+  NEON_BUILTIN_vld3_dup,
+  NEON_BUILTIN_vst3,
+  NEON_BUILTIN_vst3_lane,
+  NEON_BUILTIN_vld4,
+  NEON_BUILTIN_vld4_lane,
+  NEON_BUILTIN_vld4_dup,
+  NEON_BUILTIN_vst4,
+  NEON_BUILTIN_vst4_lane,
+  NEON_BUILTIN_vand,
+  NEON_BUILTIN_vorr,
+  NEON_BUILTIN_veor,
+  NEON_BUILTIN_vbic,
+  NEON_BUILTIN_vorn,
+  NEON_BUILTIN_MAX
+};
+
 #define LLVM_TARGET_INTRINSIC_PREFIX "arm"
 
 /* LLVM_TARGET_NAME - This specifies the name of the target, which correlates to
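
Since the VARn macros in arm.c form NEON_BUILTIN_##N from the same name
tokens, the identifiers here must match the names used in neon_builtin_data
exactly; NEON_BUILTIN_MAX presumably serves as a count/sentinel for array
sizing or bounds checks on the LLVM side.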
@@ -3285,6 +3444,8 @@
       F.setCPU("arm7tdmi"); \
       break; \
     } \
+    if (TARGET_NEON) \
+      F.AddFeature("neon"); \
   }
 
 /* Encode arm / thumb modes and arm subversion number in the triplet. e.g.
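
So whenever NEON is enabled on the GCC side (TARGET_NEON), the "neon"
subtarget feature is also appended to the LLVM feature list, letting the
LLVM ARM backend select NEON instructions and intrinsics.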
@@ -3334,7 +3495,16 @@
   else if ((ESCAPED_CHAR) == '@') {       		\
     (RESULT) += ASM_COMMENT_START;                      \
   }
-#endif
+
+/* LLVM_TARGET_INTRINSIC_LOWER - To handle builtins, we want to expand the
+   invocation into normal LLVM code.  If the target can handle the builtin, this
+   macro should call the target TreeToLLVM::TargetIntrinsicLower method and
+   return true.  This macro is invoked from a method in the TreeToLLVM class. */
+#define LLVM_TARGET_INTRINSIC_LOWER(EXP, BUILTIN_CODE, DESTLOC, RESULT,       \
+                                    DESTTY, OPS)                              \
+        TargetIntrinsicLower(EXP, BUILTIN_CODE, DESTLOC, RESULT, DESTTY, OPS);
+
+#endif /* ENABLE_LLVM */
 /* LLVM LOCAL end */
 
 #endif /* ! GCC_ARM_H */
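
The previously bare #endif also gains an /* ENABLE_LLVM */ comment to match.
The new macro forwards builtin expansion to TreeToLLVM::TargetIntrinsicLower,
which this commit implements for ARM in llvm-arm.cpp; together with the
neon_builtins enum above and the "arm" value of LLVM_TARGET_INTRINSIC_PREFIX,
this is the path by which __builtin_neon_* calls get lowered to arm.* LLVM
intrinsics.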

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/arm_neon.h
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/arm_neon.h?rev=76819&r1=76818&r2=76819&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/arm_neon.h (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/arm_neon.h Wed Jul 22 18:35:22 2009
@@ -1,3 +1,4 @@
+/* LLVM LOCAL file Changed to use preprocessor macros.  */
 /* APPLE LOCAL file v7 support. Merge from Codesourcery */
 /* ARM NEON intrinsics include file. This file is generated automatically
    using neon-gen.ml.  Please do not edit manually.
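
Two properties of the macro form are worth noting before the (long)
mechanical conversion below. First, each macro body keeps a trailing
semicolon, so an invocation expands to an expression statement plus an empty
statement; that is harmless when the intrinsics are used as full statements,
e.g.:

  #include <arm_neon.h>

  int8x8_t add_example (int8x8_t a, int8x8_t b)
  {
    /* Expands to: return (int8x8_t)__builtin_neon_vaddv8qi (a, b, 1);; */
    return vadd_s8 (a, b);
  }

but an invocation embedded inside a larger expression would now fail to
compile (the extra ';' becomes a syntax error there). Second, the unsigned
and polynomial variants no longer cast their arguments to the corresponding
signed vector types before calling the builtin (compare vadd_u8 below),
presumably because llvm-gcc accepts the unsigned vector types directly.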
@@ -400,11778 +401,6798 @@
 } poly16x8x4_t;
 
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vadd_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vaddv8qi (__a, __b, 1);
-}
+#define vadd_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vaddv8qi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vadd_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vaddv4hi (__a, __b, 1);
-}
+#define vadd_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vaddv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vadd_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vaddv2si (__a, __b, 1);
-}
+#define vadd_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vaddv2si (__a, __b, 1);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vadd_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x1_t)__builtin_neon_vadddi (__a, __b, 1);
-}
+#define vadd_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vadddi (__a, __b, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vadd_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (float32x2_t)__builtin_neon_vaddv2sf (__a, __b, 5);
-}
+#define vadd_f32(__a, __b) \
+  (float32x2_t)__builtin_neon_vaddv2sf (__a, __b, 5);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vadd_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vadd_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vaddv8qi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vadd_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vadd_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vaddv4hi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vadd_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vadd_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vaddv2si (__a, __b, 0);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vadd_u64 (uint64x1_t __a, uint64x1_t __b)
-{
-  return (uint64x1_t)__builtin_neon_vadddi ((int64x1_t) __a, (int64x1_t) __b, 0);
-}
+#define vadd_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vadddi (__a, __b, 0);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vaddq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vaddv16qi (__a, __b, 1);
-}
+#define vaddq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vaddv16qi (__a, __b, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vaddq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vaddv8hi (__a, __b, 1);
-}
+#define vaddq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vaddv8hi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vaddq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vaddv4si (__a, __b, 1);
-}
+#define vaddq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vaddv4si (__a, __b, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vaddq_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vaddv2di (__a, __b, 1);
-}
+#define vaddq_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vaddv2di (__a, __b, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vaddq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (float32x4_t)__builtin_neon_vaddv4sf (__a, __b, 5);
-}
+#define vaddq_f32(__a, __b) \
+  (float32x4_t)__builtin_neon_vaddv4sf (__a, __b, 5);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vaddq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
+#define vaddq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vaddv16qi (__a, __b, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vaddq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vaddq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vaddv8hi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vaddq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vaddv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vaddq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vaddv4si (__a, __b, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vaddq_u64 (uint64x2_t __a, uint64x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vaddv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
-}
+#define vaddq_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vaddv2di (__a, __b, 0);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vaddl_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vaddlv8qi (__a, __b, 1);
-}
+#define vaddl_s8(__a, __b) \
+  (int16x8_t)__builtin_neon_vaddlv8qi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vaddl_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vaddlv4hi (__a, __b, 1);
-}
+#define vaddl_s16(__a, __b) \
+  (int32x4_t)__builtin_neon_vaddlv4hi (__a, __b, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vaddl_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vaddlv2si (__a, __b, 1);
-}
+#define vaddl_s32(__a, __b) \
+  (int64x2_t)__builtin_neon_vaddlv2si (__a, __b, 1);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vaddl_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vaddlv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vaddl_u8(__a, __b) \
+  (uint16x8_t)__builtin_neon_vaddlv8qi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vaddl_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vaddlv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vaddl_u16(__a, __b) \
+  (uint32x4_t)__builtin_neon_vaddlv4hi (__a, __b, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vaddl_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vaddlv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vaddl_u32(__a, __b) \
+  (uint64x2_t)__builtin_neon_vaddlv2si (__a, __b, 0);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vaddw_s8 (int16x8_t __a, int8x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vaddwv8qi (__a, __b, 1);
-}
+#define vaddw_s8(__a, __b) \
+  (int16x8_t)__builtin_neon_vaddwv8qi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vaddw_s16 (int32x4_t __a, int16x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vaddwv4hi (__a, __b, 1);
-}
+#define vaddw_s16(__a, __b) \
+  (int32x4_t)__builtin_neon_vaddwv4hi (__a, __b, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vaddw_s32 (int64x2_t __a, int32x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vaddwv2si (__a, __b, 1);
-}
+#define vaddw_s32(__a, __b) \
+  (int64x2_t)__builtin_neon_vaddwv2si (__a, __b, 1);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vaddw_u8 (uint16x8_t __a, uint8x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vaddwv8qi ((int16x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vaddw_u8(__a, __b) \
+  (uint16x8_t)__builtin_neon_vaddwv8qi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vaddw_u16 (uint32x4_t __a, uint16x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vaddwv4hi ((int32x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vaddw_u16(__a, __b) \
+  (uint32x4_t)__builtin_neon_vaddwv4hi (__a, __b, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vaddw_u32 (uint64x2_t __a, uint32x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vaddwv2si ((int64x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vaddw_u32(__a, __b) \
+  (uint64x2_t)__builtin_neon_vaddwv2si (__a, __b, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vhadd_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vhaddv8qi (__a, __b, 1);
-}
+#define vhadd_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vhaddv8qi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vhadd_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vhaddv4hi (__a, __b, 1);
-}
+#define vhadd_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vhaddv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vhadd_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vhaddv2si (__a, __b, 1);
-}
+#define vhadd_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vhaddv2si (__a, __b, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vhadd_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vhaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vhadd_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vhaddv8qi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vhadd_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vhaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vhadd_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vhaddv4hi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vhadd_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vhaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vhadd_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vhaddv2si (__a, __b, 0);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vhaddq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vhaddv16qi (__a, __b, 1);
-}
+#define vhaddq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vhaddv16qi (__a, __b, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vhaddq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vhaddv8hi (__a, __b, 1);
-}
+#define vhaddq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vhaddv8hi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vhaddq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vhaddv4si (__a, __b, 1);
-}
+#define vhaddq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vhaddv4si (__a, __b, 1);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vhaddq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vhaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
+#define vhaddq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vhaddv16qi (__a, __b, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vhaddq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vhaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vhaddq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vhaddv8hi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vhaddq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vhaddv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vhaddq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vhaddv4si (__a, __b, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrhadd_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vhaddv8qi (__a, __b, 3);
-}
+#define vrhadd_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vhaddv8qi (__a, __b, 3);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrhadd_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vhaddv4hi (__a, __b, 3);
-}
+#define vrhadd_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vhaddv4hi (__a, __b, 3);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vrhadd_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vhaddv2si (__a, __b, 3);
-}
+#define vrhadd_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vhaddv2si (__a, __b, 3);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrhadd_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vhaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 2);
-}
+#define vrhadd_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vhaddv8qi (__a, __b, 2);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrhadd_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vhaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 2);
-}
+#define vrhadd_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vhaddv4hi (__a, __b, 2);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrhadd_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vhaddv2si ((int32x2_t) __a, (int32x2_t) __b, 2);
-}
+#define vrhadd_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vhaddv2si (__a, __b, 2);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrhaddq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vhaddv16qi (__a, __b, 3);
-}
+#define vrhaddq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vhaddv16qi (__a, __b, 3);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrhaddq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vhaddv8hi (__a, __b, 3);
-}
+#define vrhaddq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vhaddv8hi (__a, __b, 3);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vrhaddq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vhaddv4si (__a, __b, 3);
-}
+#define vrhaddq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vhaddv4si (__a, __b, 3);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrhaddq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vhaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 2);
-}
+#define vrhaddq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vhaddv16qi (__a, __b, 2);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrhaddq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vhaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 2);
-}
+#define vrhaddq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vhaddv8hi (__a, __b, 2);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrhaddq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vhaddv4si ((int32x4_t) __a, (int32x4_t) __b, 2);
-}
+#define vrhaddq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vhaddv4si (__a, __b, 2);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqadd_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vqaddv8qi (__a, __b, 1);
-}
+#define vqadd_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vqaddv8qi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqadd_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vqaddv4hi (__a, __b, 1);
-}
+#define vqadd_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vqaddv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqadd_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vqaddv2si (__a, __b, 1);
-}
+#define vqadd_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vqaddv2si (__a, __b, 1);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vqadd_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x1_t)__builtin_neon_vqadddi (__a, __b, 1);
-}
+#define vqadd_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vqadddi (__a, __b, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqadd_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vqaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vqadd_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vqaddv8qi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vqadd_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vqaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vqadd_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vqaddv4hi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vqadd_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vqaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vqadd_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vqaddv2si (__a, __b, 0);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vqadd_u64 (uint64x1_t __a, uint64x1_t __b)
-{
-  return (uint64x1_t)__builtin_neon_vqadddi ((int64x1_t) __a, (int64x1_t) __b, 0);
-}
+#define vqadd_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vqadddi (__a, __b, 0);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqaddq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vqaddv16qi (__a, __b, 1);
-}
+#define vqaddq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vqaddv16qi (__a, __b, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqaddq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vqaddv8hi (__a, __b, 1);
-}
+#define vqaddq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vqaddv8hi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqaddq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vqaddv4si (__a, __b, 1);
-}
+#define vqaddq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vqaddv4si (__a, __b, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqaddq_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vqaddv2di (__a, __b, 1);
-}
+#define vqaddq_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vqaddv2di (__a, __b, 1);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqaddq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vqaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
+#define vqaddq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vqaddv16qi (__a, __b, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vqaddq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vqaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vqaddq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vqaddv8hi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vqaddq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vqaddv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vqaddq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vqaddv4si (__a, __b, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vqaddq_u64 (uint64x2_t __a, uint64x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vqaddv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
-}
+#define vqaddq_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vqaddv2di (__a, __b, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vaddhn_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vaddhnv8hi (__a, __b, 1);
-}
+#define vaddhn_s16(__a, __b) \
+  (int8x8_t)__builtin_neon_vaddhnv8hi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vaddhn_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vaddhnv4si (__a, __b, 1);
-}
+#define vaddhn_s32(__a, __b) \
+  (int16x4_t)__builtin_neon_vaddhnv4si (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vaddhn_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vaddhnv2di (__a, __b, 1);
-}
+#define vaddhn_s64(__a, __b) \
+  (int32x2_t)__builtin_neon_vaddhnv2di (__a, __b, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vaddhn_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vaddhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vaddhn_u16(__a, __b) \
+  (uint8x8_t)__builtin_neon_vaddhnv8hi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vaddhn_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vaddhnv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vaddhn_u32(__a, __b) \
+  (uint16x4_t)__builtin_neon_vaddhnv4si (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vaddhn_u64 (uint64x2_t __a, uint64x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vaddhnv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
-}
+#define vaddhn_u64(__a, __b) \
+  (uint32x2_t)__builtin_neon_vaddhnv2di (__a, __b, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vraddhn_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vaddhnv8hi (__a, __b, 3);
-}
+#define vraddhn_s16(__a, __b) \
+  (int8x8_t)__builtin_neon_vaddhnv8hi (__a, __b, 3);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vraddhn_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vaddhnv4si (__a, __b, 3);
-}
+#define vraddhn_s32(__a, __b) \
+  (int16x4_t)__builtin_neon_vaddhnv4si (__a, __b, 3);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vraddhn_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vaddhnv2di (__a, __b, 3);
-}
+#define vraddhn_s64(__a, __b) \
+  (int32x2_t)__builtin_neon_vaddhnv2di (__a, __b, 3);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vraddhn_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vaddhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 2);
-}
+#define vraddhn_u16(__a, __b) \
+  (uint8x8_t)__builtin_neon_vaddhnv8hi (__a, __b, 2);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vraddhn_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vaddhnv4si ((int32x4_t) __a, (int32x4_t) __b, 2);
-}
+#define vraddhn_u32(__a, __b) \
+  (uint16x4_t)__builtin_neon_vaddhnv4si (__a, __b, 2);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vraddhn_u64 (uint64x2_t __a, uint64x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vaddhnv2di ((int64x2_t) __a, (int64x2_t) __b, 2);
-}
+#define vraddhn_u64(__a, __b) \
+  (uint32x2_t)__builtin_neon_vaddhnv2di (__a, __b, 2);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmul_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vmulv8qi (__a, __b, 1);
-}
+#define vmul_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vmulv8qi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vmulv4hi (__a, __b, 1);
-}
+#define vmul_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vmulv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vmulv2si (__a, __b, 1);
-}
+#define vmul_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vmulv2si (__a, __b, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (float32x2_t)__builtin_neon_vmulv2sf (__a, __b, 5);
-}
+#define vmul_f32(__a, __b) \
+  (float32x2_t)__builtin_neon_vmulv2sf (__a, __b, 5);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmul_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vmulv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vmul_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vmulv8qi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vmulv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vmul_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vmulv4hi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vmulv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vmul_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vmulv2si (__a, __b, 0);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vmul_p8 (poly8x8_t __a, poly8x8_t __b)
-{
-  return (poly8x8_t)__builtin_neon_vmulv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
-}
+#define vmul_p8(__a, __b) \
+  (poly8x8_t)__builtin_neon_vmulv8qi (__a, __b, 4);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vmulq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vmulv16qi (__a, __b, 1);
-}
+#define vmulq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vmulv16qi (__a, __b, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmulq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vmulv8hi (__a, __b, 1);
-}
+#define vmulq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vmulv8hi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmulq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vmulv4si (__a, __b, 1);
-}
+#define vmulq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vmulv4si (__a, __b, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (float32x4_t)__builtin_neon_vmulv4sf (__a, __b, 5);
-}
+#define vmulq_f32(__a, __b) \
+  (float32x4_t)__builtin_neon_vmulv4sf (__a, __b, 5);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vmulq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vmulv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
+#define vmulq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vmulv16qi (__a, __b, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmulq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vmulv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vmulq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vmulv8hi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmulq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vmulv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vmulq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vmulv4si (__a, __b, 0);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vmulq_p8 (poly8x16_t __a, poly8x16_t __b)
-{
-  return (poly8x16_t)__builtin_neon_vmulv16qi ((int8x16_t) __a, (int8x16_t) __b, 4);
-}
+#define vmulq_p8(__a, __b) \
+  (poly8x16_t)__builtin_neon_vmulv16qi (__a, __b, 4);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqdmulh_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vqdmulhv4hi (__a, __b, 1);
-}
+#define vqdmulh_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vqdmulhv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqdmulh_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vqdmulhv2si (__a, __b, 1);
-}
+#define vqdmulh_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vqdmulhv2si (__a, __b, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqdmulhq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vqdmulhv8hi (__a, __b, 1);
-}
+#define vqdmulhq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vqdmulhv8hi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmulhq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vqdmulhv4si (__a, __b, 1);
-}
+#define vqdmulhq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vqdmulhv4si (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqrdmulh_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vqdmulhv4hi (__a, __b, 3);
-}
+#define vqrdmulh_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vqdmulhv4hi (__a, __b, 3);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqrdmulh_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vqdmulhv2si (__a, __b, 3);
-}
+#define vqrdmulh_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vqdmulhv2si (__a, __b, 3);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqrdmulhq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vqdmulhv8hi (__a, __b, 3);
-}
+#define vqrdmulhq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vqdmulhv8hi (__a, __b, 3);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqrdmulhq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vqdmulhv4si (__a, __b, 3);
-}
+#define vqrdmulhq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vqdmulhv4si (__a, __b, 3);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmull_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vmullv8qi (__a, __b, 1);
-}
+#define vmull_s8(__a, __b) \
+  (int16x8_t)__builtin_neon_vmullv8qi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vmullv4hi (__a, __b, 1);
-}
+#define vmull_s16(__a, __b) \
+  (int32x4_t)__builtin_neon_vmullv4hi (__a, __b, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vmullv2si (__a, __b, 1);
-}
+#define vmull_s32(__a, __b) \
+  (int64x2_t)__builtin_neon_vmullv2si (__a, __b, 1);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmull_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vmullv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vmull_u8(__a, __b) \
+  (uint16x8_t)__builtin_neon_vmullv8qi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vmullv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vmull_u16(__a, __b) \
+  (uint32x4_t)__builtin_neon_vmullv4hi (__a, __b, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vmullv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vmull_u32(__a, __b) \
+  (uint64x2_t)__builtin_neon_vmullv2si (__a, __b, 0);
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmull_p8 (poly8x8_t __a, poly8x8_t __b)
-{
-  return (poly16x8_t)__builtin_neon_vmullv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
-}
+#define vmull_p8(__a, __b) \
+  (poly16x8_t)__builtin_neon_vmullv8qi (__a, __b, 4);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmull_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vqdmullv4hi (__a, __b, 1);
-}
+#define vqdmull_s16(__a, __b) \
+  (int32x4_t)__builtin_neon_vqdmullv4hi (__a, __b, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmull_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vqdmullv2si (__a, __b, 1);
-}
+#define vqdmull_s32(__a, __b) \
+  (int64x2_t)__builtin_neon_vqdmullv2si (__a, __b, 1);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
-{
-  return (int8x8_t)__builtin_neon_vmlav8qi (__a, __b, __c, 1);
-}
+#define vmla_s8(__a, __b, __c) \
+  (int8x8_t)__builtin_neon_vmlav8qi (__a, __b, __c, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
-{
-  return (int16x4_t)__builtin_neon_vmlav4hi (__a, __b, __c, 1);
-}
+#define vmla_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vmlav4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
-{
-  return (int32x2_t)__builtin_neon_vmlav2si (__a, __b, __c, 1);
-}
+#define vmla_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vmlav2si (__a, __b, __c, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmla_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
-{
-  return (float32x2_t)__builtin_neon_vmlav2sf (__a, __b, __c, 5);
-}
+#define vmla_f32(__a, __b, __c) \
+  (float32x2_t)__builtin_neon_vmlav2sf (__a, __b, __c, 5);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
-{
-  return (uint8x8_t)__builtin_neon_vmlav8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
-}
+#define vmla_u8(__a, __b, __c) \
+  (uint8x8_t)__builtin_neon_vmlav8qi (__a, __b, __c, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
-{
-  return (uint16x4_t)__builtin_neon_vmlav4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
-}
+#define vmla_u16(__a, __b, __c) \
+  (uint16x4_t)__builtin_neon_vmlav4hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
-{
-  return (uint32x2_t)__builtin_neon_vmlav2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
-}
+#define vmla_u32(__a, __b, __c) \
+  (uint32x2_t)__builtin_neon_vmlav2si (__a, __b, __c, 0);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
-{
-  return (int8x16_t)__builtin_neon_vmlav16qi (__a, __b, __c, 1);
-}
+#define vmlaq_s8(__a, __b, __c) \
+  (int8x16_t)__builtin_neon_vmlav16qi (__a, __b, __c, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
-{
-  return (int16x8_t)__builtin_neon_vmlav8hi (__a, __b, __c, 1);
-}
+#define vmlaq_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vmlav8hi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
-{
-  return (int32x4_t)__builtin_neon_vmlav4si (__a, __b, __c, 1);
-}
+#define vmlaq_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vmlav4si (__a, __b, __c, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
-{
-  return (float32x4_t)__builtin_neon_vmlav4sf (__a, __b, __c, 5);
-}
+#define vmlaq_f32(__a, __b, __c) \
+  (float32x4_t)__builtin_neon_vmlav4sf (__a, __b, __c, 5);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
-{
-  return (uint8x16_t)__builtin_neon_vmlav16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c, 0);
-}
+#define vmlaq_u8(__a, __b, __c) \
+  (uint8x16_t)__builtin_neon_vmlav16qi (__a, __b, __c, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
-{
-  return (uint16x8_t)__builtin_neon_vmlav8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c, 0);
-}
+#define vmlaq_u16(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vmlav8hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
-{
-  return (uint32x4_t)__builtin_neon_vmlav4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c, 0);
-}
+#define vmlaq_u32(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vmlav4si (__a, __b, __c, 0);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmlal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
-{
-  return (int16x8_t)__builtin_neon_vmlalv8qi (__a, __b, __c, 1);
-}
+#define vmlal_s8(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vmlalv8qi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
-{
-  return (int32x4_t)__builtin_neon_vmlalv4hi (__a, __b, __c, 1);
-}
+#define vmlal_s16(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vmlalv4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
-{
-  return (int64x2_t)__builtin_neon_vmlalv2si (__a, __b, __c, 1);
-}
+#define vmlal_s32(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vmlalv2si (__a, __b, __c, 1);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmlal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
-{
-  return (uint16x8_t)__builtin_neon_vmlalv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
-}
+#define vmlal_u8(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vmlalv8qi (__a, __b, __c, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
-{
-  return (uint32x4_t)__builtin_neon_vmlalv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
-}
+#define vmlal_u16(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vmlalv4hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmlal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
-{
-  return (uint64x2_t)__builtin_neon_vmlalv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
-}
+#define vmlal_u32(__a, __b, __c) \
+  (uint64x2_t)__builtin_neon_vmlalv2si (__a, __b, __c, 0);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
-{
-  return (int32x4_t)__builtin_neon_vqdmlalv4hi (__a, __b, __c, 1);
-}
+#define vqdmlal_s16(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vqdmlalv4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
-{
-  return (int64x2_t)__builtin_neon_vqdmlalv2si (__a, __b, __c, 1);
-}
+#define vqdmlal_s32(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vqdmlalv2si (__a, __b, __c, 1);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmls_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
-{
-  return (int8x8_t)__builtin_neon_vmlsv8qi (__a, __b, __c, 1);
-}
+#define vmls_s8(__a, __b, __c) \
+  (int8x8_t)__builtin_neon_vmlsv8qi (__a, __b, __c, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmls_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
-{
-  return (int16x4_t)__builtin_neon_vmlsv4hi (__a, __b, __c, 1);
-}
+#define vmls_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vmlsv4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmls_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
-{
-  return (int32x2_t)__builtin_neon_vmlsv2si (__a, __b, __c, 1);
-}
+#define vmls_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vmlsv2si (__a, __b, __c, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmls_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
-{
-  return (float32x2_t)__builtin_neon_vmlsv2sf (__a, __b, __c, 5);
-}
+#define vmls_f32(__a, __b, __c) \
+  (float32x2_t)__builtin_neon_vmlsv2sf (__a, __b, __c, 5);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmls_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
-{
-  return (uint8x8_t)__builtin_neon_vmlsv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
-}
+#define vmls_u8(__a, __b, __c) \
+  (uint8x8_t)__builtin_neon_vmlsv8qi (__a, __b, __c, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmls_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
-{
-  return (uint16x4_t)__builtin_neon_vmlsv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
-}
+#define vmls_u16(__a, __b, __c) \
+  (uint16x4_t)__builtin_neon_vmlsv4hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmls_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
-{
-  return (uint32x2_t)__builtin_neon_vmlsv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
-}
+#define vmls_u32(__a, __b, __c) \
+  (uint32x2_t)__builtin_neon_vmlsv2si (__a, __b, __c, 0);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vmlsq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
-{
-  return (int8x16_t)__builtin_neon_vmlsv16qi (__a, __b, __c, 1);
-}
+#define vmlsq_s8(__a, __b, __c) \
+  (int8x16_t)__builtin_neon_vmlsv16qi (__a, __b, __c, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmlsq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
-{
-  return (int16x8_t)__builtin_neon_vmlsv8hi (__a, __b, __c, 1);
-}
+#define vmlsq_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vmlsv8hi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlsq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
-{
-  return (int32x4_t)__builtin_neon_vmlsv4si (__a, __b, __c, 1);
-}
+#define vmlsq_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vmlsv4si (__a, __b, __c, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmlsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
-{
-  return (float32x4_t)__builtin_neon_vmlsv4sf (__a, __b, __c, 5);
-}
+#define vmlsq_f32(__a, __b, __c) \
+  (float32x4_t)__builtin_neon_vmlsv4sf (__a, __b, __c, 5)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vmlsq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
-{
-  return (uint8x16_t)__builtin_neon_vmlsv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c, 0);
-}
+#define vmlsq_u8(__a, __b, __c) \
+  (uint8x16_t)__builtin_neon_vmlsv16qi (__a, __b, __c, 0)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmlsq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
-{
-  return (uint16x8_t)__builtin_neon_vmlsv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c, 0);
-}
+#define vmlsq_u16(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vmlsv8hi (__a, __b, __c, 0)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlsq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
-{
-  return (uint32x4_t)__builtin_neon_vmlsv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c, 0);
-}
+#define vmlsq_u32(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vmlsv4si (__a, __b, __c, 0)
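
For reference, the vmls family is an elementwise multiply-subtract, d[i] = a[i] - b[i] * c[i], with the q-suffixed forms operating on 128-bit registers.  A minimal usage sketch (illustrative names, not part of the patch; assumes NEON is enabled, e.g. -mcpu=cortex-a8 -mfpu=neon):

    #include <arm_neon.h>

    /* acc[i] - x[i] * y[i] for four floats at once.  */
    float32x4_t
    residual4 (float32x4_t acc, float32x4_t x, float32x4_t y)
    {
      return vmlsq_f32 (acc, x, y);
    }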
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmlsl_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
-{
-  return (int16x8_t)__builtin_neon_vmlslv8qi (__a, __b, __c, 1);
-}
+#define vmlsl_s8(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vmlslv8qi (__a, __b, __c, 1)
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
-{
-  return (int32x4_t)__builtin_neon_vmlslv4hi (__a, __b, __c, 1);
-}
+#define vmlsl_s16(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vmlslv4hi (__a, __b, __c, 1)
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
-{
-  return (int64x2_t)__builtin_neon_vmlslv2si (__a, __b, __c, 1);
-}
+#define vmlsl_s32(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vmlslv2si (__a, __b, __c, 1)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmlsl_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
-{
-  return (uint16x8_t)__builtin_neon_vmlslv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
-}
+#define vmlsl_u8(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vmlslv8qi (__a, __b, __c, 0)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlsl_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
-{
-  return (uint32x4_t)__builtin_neon_vmlslv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
-}
+#define vmlsl_u16(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vmlslv4hi (__a, __b, __c, 0)
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmlsl_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
-{
-  return (uint64x2_t)__builtin_neon_vmlslv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
-}
+#define vmlsl_u32(__a, __b, __c) \
+  (uint64x2_t)__builtin_neon_vmlslv2si (__a, __b, __c, 0)
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
-{
-  return (int32x4_t)__builtin_neon_vqdmlslv4hi (__a, __b, __c, 1);
-}
+#define vqdmlsl_s16(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vqdmlslv4hi (__a, __b, __c, 1)
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
-{
-  return (int64x2_t)__builtin_neon_vqdmlslv2si (__a, __b, __c, 1);
-}
+#define vqdmlsl_s32(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vqdmlslv2si (__a, __b, __c, 1)
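
The long (l-suffixed) forms keep a wider accumulator: vmlsl subtracts the widened product of two narrow vectors from it, and vqdmlsl additionally doubles the product and saturates.  A sketch under the same assumptions:

    #include <arm_neon.h>

    /* acc[i] - (int32_t) x[i] * y[i]; the 16x16 product cannot overflow.  */
    int32x4_t
    mls_wide (int32x4_t acc, int16x4_t x, int16x4_t y)
    {
      return vmlsl_s16 (acc, x, y);
    }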
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vsub_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vsubv8qi (__a, __b, 1);
-}
+#define vsub_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vsubv8qi (__a, __b, 1)
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vsub_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vsubv4hi (__a, __b, 1);
-}
+#define vsub_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vsubv4hi (__a, __b, 1)
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vsub_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vsubv2si (__a, __b, 1);
-}
+#define vsub_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vsubv2si (__a, __b, 1)
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vsub_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x1_t)__builtin_neon_vsubdi (__a, __b, 1);
-}
+#define vsub_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vsubdi (__a, __b, 1)
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vsub_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (float32x2_t)__builtin_neon_vsubv2sf (__a, __b, 5);
-}
+#define vsub_f32(__a, __b) \
+  (float32x2_t)__builtin_neon_vsubv2sf (__a, __b, 5)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vsub_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vsubv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vsub_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vsubv8qi (__a, __b, 0)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vsub_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vsubv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vsub_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vsubv4hi (__a, __b, 0)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vsub_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vsubv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vsub_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vsubv2si (__a, __b, 0)
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vsub_u64 (uint64x1_t __a, uint64x1_t __b)
-{
-  return (uint64x1_t)__builtin_neon_vsubdi ((int64x1_t) __a, (int64x1_t) __b, 0);
-}
+#define vsub_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vsubdi (__a, __b, 0)
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vsubq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vsubv16qi (__a, __b, 1);
-}
+#define vsubq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vsubv16qi (__a, __b, 1)
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vsubq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vsubv8hi (__a, __b, 1);
-}
+#define vsubq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vsubv8hi (__a, __b, 1)
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vsubq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vsubv4si (__a, __b, 1);
-}
+#define vsubq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vsubv4si (__a, __b, 1)
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vsubq_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vsubv2di (__a, __b, 1);
-}
+#define vsubq_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vsubv2di (__a, __b, 1)
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vsubq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (float32x4_t)__builtin_neon_vsubv4sf (__a, __b, 5);
-}
+#define vsubq_f32(__a, __b) \
+  (float32x4_t)__builtin_neon_vsubv4sf (__a, __b, 5)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vsubq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vsubv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
+#define vsubq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vsubv16qi (__a, __b, 0)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vsubq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vsubv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vsubq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vsubv8hi (__a, __b, 0)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsubq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vsubv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vsubq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vsubv4si (__a, __b, 0)
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vsubq_u64 (uint64x2_t __a, uint64x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vsubv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
-}
+#define vsubq_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vsubv2di (__a, __b, 0)
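
vsub and vsubq are plain elementwise subtraction on 64-bit d registers and 128-bit q registers respectively; integer results wrap modulo the element width.  Minimal sketch (illustrative names):

    #include <arm_neon.h>

    /* c[i] = a[i] - b[i] for sixteen bytes, wrapping modulo 256.  */
    uint8x16_t
    sub16 (uint8x16_t a, uint8x16_t b)
    {
      return vsubq_u8 (a, b);
    }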
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vsubl_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vsublv8qi (__a, __b, 1);
-}
+#define vsubl_s8(__a, __b) \
+  (int16x8_t)__builtin_neon_vsublv8qi (__a, __b, 1)
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vsubl_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vsublv4hi (__a, __b, 1);
-}
+#define vsubl_s16(__a, __b) \
+  (int32x4_t)__builtin_neon_vsublv4hi (__a, __b, 1)
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vsubl_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vsublv2si (__a, __b, 1);
-}
+#define vsubl_s32(__a, __b) \
+  (int64x2_t)__builtin_neon_vsublv2si (__a, __b, 1)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vsubl_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vsublv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vsubl_u8(__a, __b) \
+  (uint16x8_t)__builtin_neon_vsublv8qi (__a, __b, 0)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsubl_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vsublv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vsubl_u16(__a, __b) \
+  (uint32x4_t)__builtin_neon_vsublv4hi (__a, __b, 0)
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vsubl_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vsublv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vsubl_u32(__a, __b) \
+  (uint64x2_t)__builtin_neon_vsublv2si (__a, __b, 0)
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vsubw_s8 (int16x8_t __a, int8x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vsubwv8qi (__a, __b, 1);
-}
+#define vsubw_s8(__a, __b) \
+  (int16x8_t)__builtin_neon_vsubwv8qi (__a, __b, 1)
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vsubw_s16 (int32x4_t __a, int16x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vsubwv4hi (__a, __b, 1);
-}
+#define vsubw_s16(__a, __b) \
+  (int32x4_t)__builtin_neon_vsubwv4hi (__a, __b, 1)
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vsubw_s32 (int64x2_t __a, int32x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vsubwv2si (__a, __b, 1);
-}
+#define vsubw_s32(__a, __b) \
+  (int64x2_t)__builtin_neon_vsubwv2si (__a, __b, 1)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vsubw_u8 (uint16x8_t __a, uint8x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vsubwv8qi ((int16x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vsubw_u8(__a, __b) \
+  (uint16x8_t)__builtin_neon_vsubwv8qi (__a, __b, 0)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsubw_u16 (uint32x4_t __a, uint16x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vsubwv4hi ((int32x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vsubw_u16(__a, __b) \
+  (uint32x4_t)__builtin_neon_vsubwv4hi (__a, __b, 0)
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vsubw_u32 (uint64x2_t __a, uint32x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vsubwv2si ((int64x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vsubw_u32(__a, __b) \
+  (uint64x2_t)__builtin_neon_vsubwv2si (__a, __b, 0)
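
The widening forms avoid that wraparound: vsubl widens both operands before subtracting, while vsubw widens only the second operand.  Sketch:

    #include <arm_neon.h>

    /* Exact 16-bit differences of two signed byte vectors.  */
    int16x8_t
    diff_wide (int8x8_t a, int8x8_t b)
    {
      return vsubl_s8 (a, b);
    }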
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vhsub_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vhsubv8qi (__a, __b, 1);
-}
+#define vhsub_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vhsubv8qi (__a, __b, 1)
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vhsub_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vhsubv4hi (__a, __b, 1);
-}
+#define vhsub_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vhsubv4hi (__a, __b, 1)
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vhsub_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vhsubv2si (__a, __b, 1);
-}
+#define vhsub_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vhsubv2si (__a, __b, 1)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vhsub_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vhsubv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vhsub_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vhsubv8qi (__a, __b, 0)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vhsub_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vhsubv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vhsub_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vhsubv4hi (__a, __b, 0)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vhsub_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vhsubv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vhsub_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vhsubv2si (__a, __b, 0)
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vhsubq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vhsubv16qi (__a, __b, 1);
-}
+#define vhsubq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vhsubv16qi (__a, __b, 1)
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vhsubq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vhsubv8hi (__a, __b, 1);
-}
+#define vhsubq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vhsubv8hi (__a, __b, 1)
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vhsubq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vhsubv4si (__a, __b, 1);
-}
+#define vhsubq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vhsubv4si (__a, __b, 1)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vhsubq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vhsubv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
+#define vhsubq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vhsubv16qi (__a, __b, 0)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vhsubq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vhsubv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vhsubq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vhsubv8hi (__a, __b, 0)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vhsubq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vhsubv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vhsubq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vhsubv4si (__a, __b, 0)
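
vhsub halves the difference, computing (a[i] - b[i]) >> 1 without intermediate overflow.  Sketch:

    #include <arm_neon.h>

    /* (a[i] - b[i]) >> 1 across eight 16-bit lanes.  */
    int16x8_t
    half_diff (int16x8_t a, int16x8_t b)
    {
      return vhsubq_s16 (a, b);
    }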
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqsub_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vqsubv8qi (__a, __b, 1);
-}
+#define vqsub_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vqsubv8qi (__a, __b, 1)
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqsub_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vqsubv4hi (__a, __b, 1);
-}
+#define vqsub_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vqsubv4hi (__a, __b, 1)
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqsub_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vqsubv2si (__a, __b, 1);
-}
+#define vqsub_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vqsubv2si (__a, __b, 1)
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vqsub_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x1_t)__builtin_neon_vqsubdi (__a, __b, 1);
-}
+#define vqsub_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vqsubdi (__a, __b, 1)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqsub_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vqsubv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vqsub_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vqsubv8qi (__a, __b, 0)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vqsub_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vqsubv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vqsub_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vqsubv4hi (__a, __b, 0)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vqsub_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vqsubv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vqsub_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vqsubv2si (__a, __b, 0)
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vqsub_u64 (uint64x1_t __a, uint64x1_t __b)
-{
-  return (uint64x1_t)__builtin_neon_vqsubdi ((int64x1_t) __a, (int64x1_t) __b, 0);
-}
+#define vqsub_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vqsubdi (__a, __b, 0)
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqsubq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vqsubv16qi (__a, __b, 1);
-}
+#define vqsubq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vqsubv16qi (__a, __b, 1)
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqsubq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vqsubv8hi (__a, __b, 1);
-}
+#define vqsubq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vqsubv8hi (__a, __b, 1)
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqsubq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vqsubv4si (__a, __b, 1);
-}
+#define vqsubq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vqsubv4si (__a, __b, 1)
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqsubq_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vqsubv2di (__a, __b, 1);
-}
+#define vqsubq_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vqsubv2di (__a, __b, 1)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqsubq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vqsubv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
+#define vqsubq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vqsubv16qi (__a, __b, 0)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vqsubq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vqsubv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vqsubq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vqsubv8hi (__a, __b, 0)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vqsubq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vqsubv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vqsubq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vqsubv4si (__a, __b, 0)
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vqsubq_u64 (uint64x2_t __a, uint64x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vqsubv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
-}
+#define vqsubq_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vqsubv2di (__a, __b, 0)
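
vqsub saturates instead of wrapping: unsigned results clamp at zero, signed results at the type's extremes.  Sketch:

    #include <arm_neon.h>

    /* max (a[i] - b[i], 0) for unsigned bytes.  */
    uint8x8_t
    clamped_sub (uint8x8_t a, uint8x8_t b)
    {
      return vqsub_u8 (a, b);
    }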
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vsubhn_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vsubhnv8hi (__a, __b, 1);
-}
+#define vsubhn_s16(__a, __b) \
+  (int8x8_t)__builtin_neon_vsubhnv8hi (__a, __b, 1)
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vsubhn_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vsubhnv4si (__a, __b, 1);
-}
+#define vsubhn_s32(__a, __b) \
+  (int16x4_t)__builtin_neon_vsubhnv4si (__a, __b, 1)
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vsubhn_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vsubhnv2di (__a, __b, 1);
-}
+#define vsubhn_s64(__a, __b) \
+  (int32x2_t)__builtin_neon_vsubhnv2di (__a, __b, 1)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vsubhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vsubhn_u16(__a, __b) \
+  (uint8x8_t)__builtin_neon_vsubhnv8hi (__a, __b, 0)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vsubhnv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vsubhn_u32(__a, __b) \
+  (uint16x4_t)__builtin_neon_vsubhnv4si (__a, __b, 0)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vsubhnv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
-}
+#define vsubhn_u64(__a, __b) \
+  (uint32x2_t)__builtin_neon_vsubhnv2di (__a, __b, 0)
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrsubhn_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vsubhnv8hi (__a, __b, 3);
-}
+#define vrsubhn_s16(__a, __b) \
+  (int8x8_t)__builtin_neon_vsubhnv8hi (__a, __b, 3)
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrsubhn_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vsubhnv4si (__a, __b, 3);
-}
+#define vrsubhn_s32(__a, __b) \
+  (int16x4_t)__builtin_neon_vsubhnv4si (__a, __b, 3)
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vrsubhn_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vsubhnv2di (__a, __b, 3);
-}
+#define vrsubhn_s64(__a, __b) \
+  (int32x2_t)__builtin_neon_vsubhnv2di (__a, __b, 3)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vsubhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 2);
-}
+#define vrsubhn_u16(__a, __b) \
+  (uint8x8_t)__builtin_neon_vsubhnv8hi (__a, __b, 2)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vsubhnv4si ((int32x4_t) __a, (int32x4_t) __b, 2);
-}
+#define vrsubhn_u32(__a, __b) \
+  (uint16x4_t)__builtin_neon_vsubhnv4si (__a, __b, 2)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vsubhnv2di ((int64x2_t) __a, (int64x2_t) __b, 2);
-}
+#define vrsubhn_u64(__a, __b) \
+  (uint32x2_t)__builtin_neon_vsubhnv2di (__a, __b, 2)
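
vsubhn subtracts two wide vectors and keeps only the high half of each difference, narrowing the element type; vrsubhn (note the shared builtin, selected by the last argument) rounds before truncating.  Sketch:

    #include <arm_neon.h>

    /* High 16 bits of each 32-bit difference.  */
    int16x4_t
    narrow_diff (int32x4_t a, int32x4_t b)
    {
      return vsubhn_s32 (a, b);
    }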
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceq_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vceqv8qi (__a, __b, 1);
-}
+#define vceq_s8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vceqv8qi (__a, __b, 1)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vceq_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vceqv4hi (__a, __b, 1);
-}
+#define vceq_s16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vceqv4hi (__a, __b, 1)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceq_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vceqv2si (__a, __b, 1);
-}
+#define vceq_s32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vceqv2si (__a, __b, 1)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceq_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vceqv2sf (__a, __b, 5);
-}
+#define vceq_f32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vceqv2sf (__a, __b, 5)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceq_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vceq_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vceqv8qi (__a, __b, 0)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vceq_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vceqv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vceq_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vceqv4hi (__a, __b, 0)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vceq_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vceqv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vceq_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vceqv2si (__a, __b, 0)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vceq_p8 (poly8x8_t __a, poly8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
-}
+#define vceq_p8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vceqv8qi (__a, __b, 4)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vceqv16qi (__a, __b, 1);
-}
+#define vceqq_s8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vceqv16qi (__a, __b, 1)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vceqq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vceqv8hi (__a, __b, 1);
-}
+#define vceqq_s16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vceqv8hi (__a, __b, 1)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vceqv4si (__a, __b, 1);
-}
+#define vceqq_s32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vceqv4si (__a, __b, 1)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vceqv4sf (__a, __b, 5);
-}
+#define vceqq_f32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vceqv4sf (__a, __b, 5)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
+#define vceqq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vceqv16qi (__a, __b, 0)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vceqv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vceqq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vceqv8hi (__a, __b, 0)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vceqv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vceqq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vceqv4si (__a, __b, 0)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b, 4);
-}
+#define vceqq_p8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vceqv16qi (__a, __b, 4)
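
The comparison intrinsics all return unsigned masks of the same element width: all-ones where the predicate holds, zero elsewhere, ready for bitwise selects.  Sketch:

    #include <arm_neon.h>

    /* 0xFF where a[i] == b[i], 0x00 elsewhere.  */
    uint8x16_t
    eq_mask (int8x16_t a, int8x16_t b)
    {
      return vceqq_s8 (a, b);
    }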
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcge_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vcgev8qi (__a, __b, 1);
-}
+#define vcge_s8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vcgev8qi (__a, __b, 1)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcge_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vcgev4hi (__a, __b, 1);
-}
+#define vcge_s16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vcgev4hi (__a, __b, 1)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcge_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcgev2si (__a, __b, 1);
-}
+#define vcge_s32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcgev2si (__a, __b, 1)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcge_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcgev2sf (__a, __b, 5);
-}
+#define vcge_f32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcgev2sf (__a, __b, 5)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcge_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vcgev8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vcge_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vcgev8qi (__a, __b, 0)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcge_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vcgev4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vcge_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vcgev4hi (__a, __b, 0)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcge_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcgev2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vcge_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcgev2si (__a, __b, 0)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgeq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vcgev16qi (__a, __b, 1);
-}
+#define vcgeq_s8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vcgev16qi (__a, __b, 1)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgeq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vcgev8hi (__a, __b, 1);
-}
+#define vcgeq_s16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vcgev8hi (__a, __b, 1)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgeq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcgev4si (__a, __b, 1);
-}
+#define vcgeq_s32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcgev4si (__a, __b, 1)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgeq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcgev4sf (__a, __b, 5);
-}
+#define vcgeq_f32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcgev4sf (__a, __b, 5)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vcgev16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
+#define vcgeq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vcgev16qi (__a, __b, 0)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vcgev8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vcgeq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vcgev8hi (__a, __b, 0)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcgev4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vcgeq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcgev4si (__a, __b, 0)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcle_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vcgev8qi (__b, __a, 1);
-}
+#define vcle_s8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vcgev8qi (__b, __a, 1)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcle_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vcgev4hi (__b, __a, 1);
-}
+#define vcle_s16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vcgev4hi (__b, __a, 1)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcle_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcgev2si (__b, __a, 1);
-}
+#define vcle_s32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcgev2si (__b, __a, 1)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcle_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcgev2sf (__b, __a, 5);
-}
+#define vcle_f32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcgev2sf (__b, __a, 5)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcle_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vcgev8qi ((int8x8_t) __b, (int8x8_t) __a, 0);
-}
+#define vcle_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vcgev8qi (__b, __a, 0)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcle_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vcgev4hi ((int16x4_t) __b, (int16x4_t) __a, 0);
-}
+#define vcle_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vcgev4hi (__b, __a, 0)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcle_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcgev2si ((int32x2_t) __b, (int32x2_t) __a, 0);
-}
+#define vcle_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcgev2si (__b, __a, 0)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcleq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vcgev16qi (__b, __a, 1);
-}
+#define vcleq_s8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vcgev16qi (__b, __a, 1)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcleq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vcgev8hi (__b, __a, 1);
-}
+#define vcleq_s16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vcgev8hi (__b, __a, 1)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcleq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcgev4si (__b, __a, 1);
-}
+#define vcleq_s32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcgev4si (__b, __a, 1)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcleq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcgev4sf (__b, __a, 5);
-}
+#define vcleq_f32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcgev4sf (__b, __a, 5)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vcgev16qi ((int8x16_t) __b, (int8x16_t) __a, 0);
-}
+#define vcleq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vcgev16qi (__b, __a, 0)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vcgev8hi ((int16x8_t) __b, (int16x8_t) __a, 0);
-}
+#define vcleq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vcgev8hi (__b, __a, 0)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcgev4si ((int32x4_t) __b, (int32x4_t) __a, 0);
-}
+#define vcleq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcgev4si (__b, __a, 0)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcgt_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vcgtv8qi (__a, __b, 1);
-}
+#define vcgt_s8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vcgtv8qi (__a, __b, 1)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcgt_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vcgtv4hi (__a, __b, 1);
-}
+#define vcgt_s16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vcgtv4hi (__a, __b, 1)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcgt_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcgtv2si (__a, __b, 1);
-}
+#define vcgt_s32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcgtv2si (__a, __b, 1)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcgt_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcgtv2sf (__a, __b, 5);
-}
+#define vcgt_f32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcgtv2sf (__a, __b, 5)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcgt_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vcgtv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vcgt_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vcgtv8qi (__a, __b, 0)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcgt_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vcgtv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vcgt_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vcgtv4hi (__a, __b, 0)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcgt_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcgtv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vcgt_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcgtv2si (__a, __b, 0)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgtq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vcgtv16qi (__a, __b, 1);
-}
+#define vcgtq_s8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vcgtv16qi (__a, __b, 1)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgtq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vcgtv8hi (__a, __b, 1);
-}
+#define vcgtq_s16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vcgtv8hi (__a, __b, 1)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgtq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcgtv4si (__a, __b, 1);
-}
+#define vcgtq_s32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcgtv4si (__a, __b, 1)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgtq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcgtv4sf (__a, __b, 5);
-}
+#define vcgtq_f32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcgtv4sf (__a, __b, 5)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcgtq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vcgtv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
+#define vcgtq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vcgtv16qi (__a, __b, 0)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcgtq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vcgtv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vcgtq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vcgtv8hi (__a, __b, 0)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcgtq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcgtv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vcgtq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcgtv4si (__a, __b, 0)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vclt_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vcgtv8qi (__b, __a, 1);
-}
+#define vclt_s8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vcgtv8qi (__b, __a, 1)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vclt_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vcgtv4hi (__b, __a, 1);
-}
+#define vclt_s16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vcgtv4hi (__b, __a, 1)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vclt_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcgtv2si (__b, __a, 1);
-}
+#define vclt_s32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcgtv2si (__b, __a, 1)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vclt_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcgtv2sf (__b, __a, 5);
-}
+#define vclt_f32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcgtv2sf (__b, __a, 5)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vclt_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vcgtv8qi ((int8x8_t) __b, (int8x8_t) __a, 0);
-}
+#define vclt_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vcgtv8qi (__b, __a, 0)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vclt_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vcgtv4hi ((int16x4_t) __b, (int16x4_t) __a, 0);
-}
+#define vclt_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vcgtv4hi (__b, __a, 0)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vclt_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcgtv2si ((int32x2_t) __b, (int32x2_t) __a, 0);
-}
+#define vclt_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcgtv2si (__b, __a, 0)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcltq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vcgtv16qi (__b, __a, 1);
-}
+#define vcltq_s8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vcgtv16qi (__b, __a, 1)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcltq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vcgtv8hi (__b, __a, 1);
-}
+#define vcltq_s16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vcgtv8hi (__b, __a, 1)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcltq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcgtv4si (__b, __a, 1);
-}
+#define vcltq_s32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcgtv4si (__b, __a, 1)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcltq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcgtv4sf (__b, __a, 5);
-}
+#define vcltq_f32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcgtv4sf (__b, __a, 5)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vcgtv16qi ((int8x16_t) __b, (int8x16_t) __a, 0);
-}
+#define vcltq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vcgtv16qi (__b, __a, 0)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vcgtv8hi ((int16x8_t) __b, (int16x8_t) __a, 0);
-}
+#define vcltq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vcgtv8hi (__b, __a, 0)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcgtv4si ((int32x4_t) __b, (int32x4_t) __a, 0);
-}
+#define vcltq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcgtv4si (__b, __a, 0)
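
As the definitions above show, vcle/vclt have no builtins of their own; they reuse vcge/vcgt with the operands swapped.  Sketch:

    #include <arm_neon.h>

    /* 0xFFFFFFFF where x[i] < limit[i].  */
    uint32x4_t
    below (float32x4_t x, float32x4_t limit)
    {
      return vcltq_f32 (x, limit);
    }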
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcage_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcagev2sf (__a, __b, 5);
-}
+#define vcage_f32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcagev2sf (__a, __b, 5)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcageq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcagev4sf (__a, __b, 5);
-}
+#define vcageq_f32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcagev4sf (__a, __b, 5)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcale_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcagev2sf (__b, __a, 5);
-}
+#define vcale_f32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcagev2sf (__b, __a, 5)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcaleq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcagev4sf (__b, __a, 5);
-}
+#define vcaleq_f32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcagev4sf (__b, __a, 5)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcagt_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcagtv2sf (__a, __b, 5);
-}
+#define vcagt_f32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcagtv2sf (__a, __b, 5)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcagtq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcagtv4sf (__a, __b, 5);
-}
+#define vcagtq_f32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcagtv4sf (__a, __b, 5)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcalt_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vcagtv2sf (__b, __a, 5);
-}
+#define vcalt_f32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcagtv2sf (__b, __a, 5)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcaltq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcagtv4sf (__b, __a, 5);
-}
+#define vcaltq_f32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcagtv4sf (__b, __a, 5)
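
vcage and vcagt compare magnitudes (|a| >= |b| and |a| > |b|) in one step, with vcale/vcalt again being the swapped-operand forms.  Sketch:

    #include <arm_neon.h>

    /* Mask of lanes where |x| > |y|, without a separate vabs.  */
    uint32x2_t
    mag_gt (float32x2_t x, float32x2_t y)
    {
      return vcagt_f32 (x, y);
    }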
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtst_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vtstv8qi (__a, __b, 1);
-}
+#define vtst_s8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vtstv8qi (__a, __b, 1)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vtst_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vtstv4hi (__a, __b, 1);
-}
+#define vtst_s16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vtstv4hi (__a, __b, 1)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vtst_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vtstv2si (__a, __b, 1);
-}
+#define vtst_s32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vtstv2si (__a, __b, 1)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtst_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vtst_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vtstv8qi (__a, __b, 0)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vtst_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vtstv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vtst_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vtstv4hi (__a, __b, 0)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vtst_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vtstv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vtst_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vtstv2si (__a, __b, 0)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtst_p8 (poly8x8_t __a, poly8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
-}
+#define vtst_p8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vtstv8qi (__a, __b, 4)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vtstq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vtstv16qi (__a, __b, 1);
-}
+#define vtstq_s8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vtstv16qi (__a, __b, 1)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vtstq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vtstv8hi (__a, __b, 1);
-}
+#define vtstq_s16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vtstv8hi (__a, __b, 1)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vtstq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vtstv4si (__a, __b, 1);
-}
+#define vtstq_s32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vtstv4si (__a, __b, 1)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vtstq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
+#define vtstq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vtstv16qi (__a, __b, 0)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vtstq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vtstv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vtstq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vtstv8hi (__a, __b, 0)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vtstq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vtstv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vtstq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vtstv4si (__a, __b, 0)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vtstq_p8 (poly8x16_t __a, poly8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b, 4);
-}
+#define vtstq_p8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vtstv16qi (__a, __b, 4)
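
vtst is a bitwise test: a result lane is all-ones when (a & b) is nonzero in that lane.  Sketch:

    #include <arm_neon.h>

    /* Which bytes have any bit of `flags' set.  */
    uint8x8_t
    has_flags (uint8x8_t v, uint8x8_t flags)
    {
      return vtst_u8 (v, flags);
    }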
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vabd_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vabdv8qi (__a, __b, 1);
-}
+#define vabd_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vabdv8qi (__a, __b, 1)
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vabd_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vabdv4hi (__a, __b, 1);
-}
+#define vabd_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vabdv4hi (__a, __b, 1)
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vabd_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vabdv2si (__a, __b, 1);
-}
+#define vabd_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vabdv2si (__a, __b, 1)
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vabd_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (float32x2_t)__builtin_neon_vabdv2sf (__a, __b, 5);
-}
+#define vabd_f32(__a, __b) \
+  (float32x2_t)__builtin_neon_vabdv2sf (__a, __b, 5)
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vabd_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vabdv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vabd_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vabdv8qi (__a, __b, 0)
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vabd_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vabdv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vabd_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vabdv4hi (__a, __b, 0)
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vabd_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vabdv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vabd_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vabdv2si (__a, __b, 0)
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vabdq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vabdv16qi (__a, __b, 1);
-}
+#define vabdq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vabdv16qi (__a, __b, 1)
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vabdq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vabdv8hi (__a, __b, 1);
-}
+#define vabdq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vabdv8hi (__a, __b, 1)
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vabdq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vabdv4si (__a, __b, 1);
-}
+#define vabdq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vabdv4si (__a, __b, 1)
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vabdq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (float32x4_t)__builtin_neon_vabdv4sf (__a, __b, 5);
-}
+#define vabdq_f32(__a, __b) \
+  (float32x4_t)__builtin_neon_vabdv4sf (__a, __b, 5)
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vabdq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vabdv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
+#define vabdq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vabdv16qi (__a, __b, 0)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vabdq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vabdv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vabdq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vabdv8hi (__a, __b, 0)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vabdq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vabdv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vabdq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vabdv4si (__a, __b, 0)
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vabdl_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vabdlv8qi (__a, __b, 1);
-}
+#define vabdl_s8(__a, __b) \
+  (int16x8_t)__builtin_neon_vabdlv8qi (__a, __b, 1)
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vabdl_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vabdlv4hi (__a, __b, 1);
-}
+#define vabdl_s16(__a, __b) \
+  (int32x4_t)__builtin_neon_vabdlv4hi (__a, __b, 1)
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vabdl_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vabdlv2si (__a, __b, 1);
-}
+#define vabdl_s32(__a, __b) \
+  (int64x2_t)__builtin_neon_vabdlv2si (__a, __b, 1)
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vabdl_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vabdlv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vabdl_u8(__a, __b) \
+  (uint16x8_t)__builtin_neon_vabdlv8qi (__a, __b, 0)
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vabdl_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vabdlv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vabdl_u16(__a, __b) \
+  (uint32x4_t)__builtin_neon_vabdlv4hi (__a, __b, 0)
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vabdl_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vabdlv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vabdl_u32(__a, __b) \
+  (uint64x2_t)__builtin_neon_vabdlv2si (__a, __b, 0)
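
vabd is the elementwise absolute difference |a[i] - b[i]|, and vabdl widens the result so nothing is lost.  Sketch:

    #include <arm_neon.h>

    /* |a[i] - b[i]| widened to 16 bits per lane.  */
    uint16x8_t
    abs_diff_wide (uint8x8_t a, uint8x8_t b)
    {
      return vabdl_u8 (a, b);
    }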
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vaba_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
-{
-  return (int8x8_t)__builtin_neon_vabav8qi (__a, __b, __c, 1);
-}
+#define vaba_s8(__a, __b, __c) \
+  (int8x8_t)__builtin_neon_vabav8qi (__a, __b, __c, 1)
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vaba_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
-{
-  return (int16x4_t)__builtin_neon_vabav4hi (__a, __b, __c, 1);
-}
+#define vaba_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vabav4hi (__a, __b, __c, 1)
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vaba_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
-{
-  return (int32x2_t)__builtin_neon_vabav2si (__a, __b, __c, 1);
-}
+#define vaba_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vabav2si (__a, __b, __c, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vaba_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
-{
-  return (uint8x8_t)__builtin_neon_vabav8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
-}
+#define vaba_u8(__a, __b, __c) \
+  (uint8x8_t)__builtin_neon_vabav8qi (__a, __b, __c, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vaba_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
-{
-  return (uint16x4_t)__builtin_neon_vabav4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
-}
+#define vaba_u16(__a, __b, __c) \
+  (uint16x4_t)__builtin_neon_vabav4hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vaba_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
-{
-  return (uint32x2_t)__builtin_neon_vabav2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
-}
+#define vaba_u32(__a, __b, __c) \
+  (uint32x2_t)__builtin_neon_vabav2si (__a, __b, __c, 0);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vabaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
-{
-  return (int8x16_t)__builtin_neon_vabav16qi (__a, __b, __c, 1);
-}
+#define vabaq_s8(__a, __b, __c) \
+  (int8x16_t)__builtin_neon_vabav16qi (__a, __b, __c, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vabaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
-{
-  return (int16x8_t)__builtin_neon_vabav8hi (__a, __b, __c, 1);
-}
+#define vabaq_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vabav8hi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vabaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
-{
-  return (int32x4_t)__builtin_neon_vabav4si (__a, __b, __c, 1);
-}
+#define vabaq_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vabav4si (__a, __b, __c, 1);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vabaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
-{
-  return (uint8x16_t)__builtin_neon_vabav16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c, 0);
-}
+#define vabaq_u8(__a, __b, __c) \
+  (uint8x16_t)__builtin_neon_vabav16qi (__a, __b, __c, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vabaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
-{
-  return (uint16x8_t)__builtin_neon_vabav8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c, 0);
-}
+#define vabaq_u16(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vabav8hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vabaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
-{
-  return (uint32x4_t)__builtin_neon_vabav4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c, 0);
-}
+#define vabaq_u32(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vabav4si (__a, __b, __c, 0);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vabal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
-{
-  return (int16x8_t)__builtin_neon_vabalv8qi (__a, __b, __c, 1);
-}
+#define vabal_s8(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vabalv8qi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vabal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
-{
-  return (int32x4_t)__builtin_neon_vabalv4hi (__a, __b, __c, 1);
-}
+#define vabal_s16(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vabalv4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vabal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
-{
-  return (int64x2_t)__builtin_neon_vabalv2si (__a, __b, __c, 1);
-}
+#define vabal_s32(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vabalv2si (__a, __b, __c, 1);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vabal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
-{
-  return (uint16x8_t)__builtin_neon_vabalv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
-}
+#define vabal_u8(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vabalv8qi (__a, __b, __c, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vabal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
-{
-  return (uint32x4_t)__builtin_neon_vabalv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
-}
+#define vabal_u16(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vabalv4hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vabal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
-{
-  return (uint64x2_t)__builtin_neon_vabalv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
-}
+#define vabal_u32(__a, __b, __c) \
+  (uint64x2_t)__builtin_neon_vabalv2si (__a, __b, __c, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmax_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vmaxv8qi (__a, __b, 1);
-}
+#define vmax_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vmaxv8qi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmax_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vmaxv4hi (__a, __b, 1);
-}
+#define vmax_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vmaxv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmax_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vmaxv2si (__a, __b, 1);
-}
+#define vmax_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vmaxv2si (__a, __b, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmax_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (float32x2_t)__builtin_neon_vmaxv2sf (__a, __b, 5);
-}
+#define vmax_f32(__a, __b) \
+  (float32x2_t)__builtin_neon_vmaxv2sf (__a, __b, 5);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmax_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vmaxv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vmax_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vmaxv8qi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmax_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vmaxv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vmax_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vmaxv4hi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmax_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vmaxv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vmax_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vmaxv2si (__a, __b, 0);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vmaxq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vmaxv16qi (__a, __b, 1);
-}
+#define vmaxq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vmaxv16qi (__a, __b, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmaxq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vmaxv8hi (__a, __b, 1);
-}
+#define vmaxq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vmaxv8hi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmaxq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vmaxv4si (__a, __b, 1);
-}
+#define vmaxq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vmaxv4si (__a, __b, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmaxq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (float32x4_t)__builtin_neon_vmaxv4sf (__a, __b, 5);
-}
+#define vmaxq_f32(__a, __b) \
+  (float32x4_t)__builtin_neon_vmaxv4sf (__a, __b, 5);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vmaxq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vmaxv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
+#define vmaxq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vmaxv16qi (__a, __b, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmaxq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vmaxv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vmaxq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vmaxv8hi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmaxq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vmaxv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vmaxq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vmaxv4si (__a, __b, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmin_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vminv8qi (__a, __b, 1);
-}
+#define vmin_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vminv8qi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmin_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vminv4hi (__a, __b, 1);
-}
+#define vmin_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vminv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmin_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vminv2si (__a, __b, 1);
-}
+#define vmin_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vminv2si (__a, __b, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmin_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (float32x2_t)__builtin_neon_vminv2sf (__a, __b, 5);
-}
+#define vmin_f32(__a, __b) \
+  (float32x2_t)__builtin_neon_vminv2sf (__a, __b, 5);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmin_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vminv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vmin_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vminv8qi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmin_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vminv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vmin_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vminv4hi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmin_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vminv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vmin_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vminv2si (__a, __b, 0);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vminq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vminv16qi (__a, __b, 1);
-}
+#define vminq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vminv16qi (__a, __b, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vminq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vminv8hi (__a, __b, 1);
-}
+#define vminq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vminv8hi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vminq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vminv4si (__a, __b, 1);
-}
+#define vminq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vminv4si (__a, __b, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vminq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (float32x4_t)__builtin_neon_vminv4sf (__a, __b, 5);
-}
+#define vminq_f32(__a, __b) \
+  (float32x4_t)__builtin_neon_vminv4sf (__a, __b, 5);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vminq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vminv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
+#define vminq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vminv16qi (__a, __b, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vminq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vminv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
+#define vminq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vminv8hi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vminq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vminv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
+#define vminq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vminv4si (__a, __b, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vpadd_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vpaddv8qi (__a, __b, 1);
-}
+#define vpadd_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vpaddv8qi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vpadd_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vpaddv4hi (__a, __b, 1);
-}
+#define vpadd_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vpaddv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vpadd_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vpaddv2si (__a, __b, 1);
-}
+#define vpadd_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vpaddv2si (__a, __b, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpadd_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (float32x2_t)__builtin_neon_vpaddv2sf (__a, __b, 5);
-}
+#define vpadd_f32(__a, __b) \
+  (float32x2_t)__builtin_neon_vpaddv2sf (__a, __b, 5);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vpadd_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vpaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vpadd_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vpaddv8qi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vpadd_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vpaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vpadd_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vpaddv4hi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vpaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vpadd_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vpaddv2si (__a, __b, 0);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vpaddl_s8 (int8x8_t __a)
-{
-  return (int16x4_t)__builtin_neon_vpaddlv8qi (__a, 1);
-}
+#define vpaddl_s8(__a) \
+  (int16x4_t)__builtin_neon_vpaddlv8qi (__a, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vpaddl_s16 (int16x4_t __a)
-{
-  return (int32x2_t)__builtin_neon_vpaddlv4hi (__a, 1);
-}
+#define vpaddl_s16(__a) \
+  (int32x2_t)__builtin_neon_vpaddlv4hi (__a, 1);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vpaddl_s32 (int32x2_t __a)
-{
-  return (int64x1_t)__builtin_neon_vpaddlv2si (__a, 1);
-}
+#define vpaddl_s32(__a) \
+  (int64x1_t)__builtin_neon_vpaddlv2si (__a, 1);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vpaddl_u8 (uint8x8_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vpaddlv8qi ((int8x8_t) __a, 0);
-}
+#define vpaddl_u8(__a) \
+  (uint16x4_t)__builtin_neon_vpaddlv8qi (__a, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vpaddl_u16 (uint16x4_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vpaddlv4hi ((int16x4_t) __a, 0);
-}
+#define vpaddl_u16(__a) \
+  (uint32x2_t)__builtin_neon_vpaddlv4hi (__a, 0);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vpaddl_u32 (uint32x2_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vpaddlv2si ((int32x2_t) __a, 0);
-}
+#define vpaddl_u32(__a) \
+  (uint64x1_t)__builtin_neon_vpaddlv2si (__a, 0);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vpaddlq_s8 (int8x16_t __a)
-{
-  return (int16x8_t)__builtin_neon_vpaddlv16qi (__a, 1);
-}
+#define vpaddlq_s8(__a) \
+  (int16x8_t)__builtin_neon_vpaddlv16qi (__a, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vpaddlq_s16 (int16x8_t __a)
-{
-  return (int32x4_t)__builtin_neon_vpaddlv8hi (__a, 1);
-}
+#define vpaddlq_s16(__a) \
+  (int32x4_t)__builtin_neon_vpaddlv8hi (__a, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vpaddlq_s32 (int32x4_t __a)
-{
-  return (int64x2_t)__builtin_neon_vpaddlv4si (__a, 1);
-}
+#define vpaddlq_s32(__a) \
+  (int64x2_t)__builtin_neon_vpaddlv4si (__a, 1);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vpaddlq_u8 (uint8x16_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vpaddlv16qi ((int8x16_t) __a, 0);
-}
+#define vpaddlq_u8(__a) \
+  (uint16x8_t)__builtin_neon_vpaddlv16qi (__a, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vpaddlq_u16 (uint16x8_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vpaddlv8hi ((int16x8_t) __a, 0);
-}
+#define vpaddlq_u16(__a) \
+  (uint32x4_t)__builtin_neon_vpaddlv8hi (__a, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vpaddlq_u32 (uint32x4_t __a)
-{
-  return (uint64x2_t)__builtin_neon_vpaddlv4si ((int32x4_t) __a, 0);
-}
+#define vpaddlq_u32(__a) \
+  (uint64x2_t)__builtin_neon_vpaddlv4si (__a, 0);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vpadal_s8 (int16x4_t __a, int8x8_t __b)
-{
-  return (int16x4_t)__builtin_neon_vpadalv8qi (__a, __b, 1);
-}
+#define vpadal_s8(__a, __b) \
+  (int16x4_t)__builtin_neon_vpadalv8qi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vpadal_s16 (int32x2_t __a, int16x4_t __b)
-{
-  return (int32x2_t)__builtin_neon_vpadalv4hi (__a, __b, 1);
-}
+#define vpadal_s16(__a, __b) \
+  (int32x2_t)__builtin_neon_vpadalv4hi (__a, __b, 1);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vpadal_s32 (int64x1_t __a, int32x2_t __b)
-{
-  return (int64x1_t)__builtin_neon_vpadalv2si (__a, __b, 1);
-}
+#define vpadal_s32(__a, __b) \
+  (int64x1_t)__builtin_neon_vpadalv2si (__a, __b, 1);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vpadal_u8 (uint16x4_t __a, uint8x8_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vpadalv8qi ((int16x4_t) __a, (int8x8_t) __b, 0);
-}
+#define vpadal_u8(__a, __b) \
+  (uint16x4_t)__builtin_neon_vpadalv8qi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vpadal_u16 (uint32x2_t __a, uint16x4_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vpadalv4hi ((int32x2_t) __a, (int16x4_t) __b, 0);
-}
+#define vpadal_u16(__a, __b) \
+  (uint32x2_t)__builtin_neon_vpadalv4hi (__a, __b, 0);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vpadal_u32 (uint64x1_t __a, uint32x2_t __b)
-{
-  return (uint64x1_t)__builtin_neon_vpadalv2si ((int64x1_t) __a, (int32x2_t) __b, 0);
-}
+#define vpadal_u32(__a, __b) \
+  (uint64x1_t)__builtin_neon_vpadalv2si (__a, __b, 0);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vpadalq_s8 (int16x8_t __a, int8x16_t __b)
-{
-  return (int16x8_t)__builtin_neon_vpadalv16qi (__a, __b, 1);
-}
+#define vpadalq_s8(__a, __b) \
+  (int16x8_t)__builtin_neon_vpadalv16qi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vpadalq_s16 (int32x4_t __a, int16x8_t __b)
-{
-  return (int32x4_t)__builtin_neon_vpadalv8hi (__a, __b, 1);
-}
+#define vpadalq_s16(__a, __b) \
+  (int32x4_t)__builtin_neon_vpadalv8hi (__a, __b, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vpadalq_s32 (int64x2_t __a, int32x4_t __b)
-{
-  return (int64x2_t)__builtin_neon_vpadalv4si (__a, __b, 1);
-}
+#define vpadalq_s32(__a, __b) \
+  (int64x2_t)__builtin_neon_vpadalv4si (__a, __b, 1);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vpadalq_u8 (uint16x8_t __a, uint8x16_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vpadalv16qi ((int16x8_t) __a, (int8x16_t) __b, 0);
-}
+#define vpadalq_u8(__a, __b) \
+  (uint16x8_t)__builtin_neon_vpadalv16qi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vpadalq_u16 (uint32x4_t __a, uint16x8_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vpadalv8hi ((int32x4_t) __a, (int16x8_t) __b, 0);
-}
+#define vpadalq_u16(__a, __b) \
+  (uint32x4_t)__builtin_neon_vpadalv8hi (__a, __b, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vpadalq_u32 (uint64x2_t __a, uint32x4_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vpadalv4si ((int64x2_t) __a, (int32x4_t) __b, 0);
-}
+#define vpadalq_u32(__a, __b) \
+  (uint64x2_t)__builtin_neon_vpadalv4si (__a, __b, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vpmax_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vpmaxv8qi (__a, __b, 1);
-}
+#define vpmax_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vpmaxv8qi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vpmax_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vpmaxv4hi (__a, __b, 1);
-}
+#define vpmax_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vpmaxv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vpmax_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vpmaxv2si (__a, __b, 1);
-}
+#define vpmax_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vpmaxv2si (__a, __b, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpmax_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (float32x2_t)__builtin_neon_vpmaxv2sf (__a, __b, 5);
-}
+#define vpmax_f32(__a, __b) \
+  (float32x2_t)__builtin_neon_vpmaxv2sf (__a, __b, 5);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vpmax_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vpmaxv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vpmax_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vpmaxv8qi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vpmax_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vpmaxv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vpmax_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vpmaxv4hi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vpmax_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vpmaxv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vpmax_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vpmaxv2si (__a, __b, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vpmin_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vpminv8qi (__a, __b, 1);
-}
+#define vpmin_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vpminv8qi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vpmin_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vpminv4hi (__a, __b, 1);
-}
+#define vpmin_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vpminv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vpmin_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vpminv2si (__a, __b, 1);
-}
+#define vpmin_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vpminv2si (__a, __b, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vpmin_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (float32x2_t)__builtin_neon_vpminv2sf (__a, __b, 5);
-}
+#define vpmin_f32(__a, __b) \
+  (float32x2_t)__builtin_neon_vpminv2sf (__a, __b, 5);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vpmin_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vpminv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
+#define vpmin_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vpminv8qi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vpmin_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vpminv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
+#define vpmin_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vpminv4hi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vpmin_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vpminv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
+#define vpmin_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vpminv2si (__a, __b, 0);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrecps_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (float32x2_t)__builtin_neon_vrecpsv2sf (__a, __b, 5);
-}
+#define vrecps_f32(__a, __b) \
+  (float32x2_t)__builtin_neon_vrecpsv2sf (__a, __b, 5);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrecpsq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (float32x4_t)__builtin_neon_vrecpsv4sf (__a, __b, 5);
-}
+#define vrecpsq_f32(__a, __b) \
+  (float32x4_t)__builtin_neon_vrecpsv4sf (__a, __b, 5);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrsqrts_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (float32x2_t)__builtin_neon_vrsqrtsv2sf (__a, __b, 5);
-}
+#define vrsqrts_f32(__a, __b) \
+  (float32x2_t)__builtin_neon_vrsqrtsv2sf (__a, __b, 5);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrsqrtsq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  return (float32x4_t)__builtin_neon_vrsqrtsv4sf (__a, __b, 5);
-}
+#define vrsqrtsq_f32(__a, __b) \
+  (float32x4_t)__builtin_neon_vrsqrtsv4sf (__a, __b, 5);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vshl_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vshlv8qi (__a, __b, 1);
-}
+#define vshl_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vshlv8qi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vshl_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vshlv4hi (__a, __b, 1);
-}
+#define vshl_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vshlv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vshl_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vshlv2si (__a, __b, 1);
-}
+#define vshl_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vshlv2si (__a, __b, 1);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vshl_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x1_t)__builtin_neon_vshldi (__a, __b, 1);
-}
+#define vshl_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vshldi (__a, __b, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vshl_u8 (uint8x8_t __a, int8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vshlv8qi ((int8x8_t) __a, __b, 0);
-}
+#define vshl_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vshlv8qi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vshl_u16 (uint16x4_t __a, int16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vshlv4hi ((int16x4_t) __a, __b, 0);
-}
+#define vshl_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vshlv4hi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vshl_u32 (uint32x2_t __a, int32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vshlv2si ((int32x2_t) __a, __b, 0);
-}
+#define vshl_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vshlv2si (__a, __b, 0);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vshl_u64 (uint64x1_t __a, int64x1_t __b)
-{
-  return (uint64x1_t)__builtin_neon_vshldi ((int64x1_t) __a, __b, 0);
-}
+#define vshl_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vshldi (__a, __b, 0);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vshlq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vshlv16qi (__a, __b, 1);
-}
+#define vshlq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vshlv16qi (__a, __b, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vshlq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vshlv8hi (__a, __b, 1);
-}
+#define vshlq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vshlv8hi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vshlq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vshlv4si (__a, __b, 1);
-}
+#define vshlq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vshlv4si (__a, __b, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vshlq_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vshlv2di (__a, __b, 1);
-}
+#define vshlq_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vshlv2di (__a, __b, 1);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vshlq_u8 (uint8x16_t __a, int8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vshlv16qi ((int8x16_t) __a, __b, 0);
-}
+#define vshlq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vshlv16qi (__a, __b, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vshlq_u16 (uint16x8_t __a, int16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vshlv8hi ((int16x8_t) __a, __b, 0);
-}
+#define vshlq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vshlv8hi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vshlq_u32 (uint32x4_t __a, int32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vshlv4si ((int32x4_t) __a, __b, 0);
-}
+#define vshlq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vshlv4si (__a, __b, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vshlq_u64 (uint64x2_t __a, int64x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vshlv2di ((int64x2_t) __a, __b, 0);
-}
+#define vshlq_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vshlv2di (__a, __b, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrshl_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vshlv8qi (__a, __b, 3);
-}
+#define vrshl_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vshlv8qi (__a, __b, 3);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrshl_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vshlv4hi (__a, __b, 3);
-}
+#define vrshl_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vshlv4hi (__a, __b, 3);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vrshl_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vshlv2si (__a, __b, 3);
-}
+#define vrshl_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vshlv2si (__a, __b, 3);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vrshl_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x1_t)__builtin_neon_vshldi (__a, __b, 3);
-}
+#define vrshl_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vshldi (__a, __b, 3);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrshl_u8 (uint8x8_t __a, int8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vshlv8qi ((int8x8_t) __a, __b, 2);
-}
+#define vrshl_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vshlv8qi (__a, __b, 2);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrshl_u16 (uint16x4_t __a, int16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vshlv4hi ((int16x4_t) __a, __b, 2);
-}
+#define vrshl_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vshlv4hi (__a, __b, 2);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrshl_u32 (uint32x2_t __a, int32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vshlv2si ((int32x2_t) __a, __b, 2);
-}
+#define vrshl_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vshlv2si (__a, __b, 2);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vrshl_u64 (uint64x1_t __a, int64x1_t __b)
-{
-  return (uint64x1_t)__builtin_neon_vshldi ((int64x1_t) __a, __b, 2);
-}
+#define vrshl_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vshldi (__a, __b, 2);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrshlq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vshlv16qi (__a, __b, 3);
-}
+#define vrshlq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vshlv16qi (__a, __b, 3);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrshlq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vshlv8hi (__a, __b, 3);
-}
+#define vrshlq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vshlv8hi (__a, __b, 3);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vrshlq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vshlv4si (__a, __b, 3);
-}
+#define vrshlq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vshlv4si (__a, __b, 3);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vrshlq_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vshlv2di (__a, __b, 3);
-}
+#define vrshlq_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vshlv2di (__a, __b, 3);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrshlq_u8 (uint8x16_t __a, int8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vshlv16qi ((int8x16_t) __a, __b, 2);
-}
+#define vrshlq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vshlv16qi (__a, __b, 2);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrshlq_u16 (uint16x8_t __a, int16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vshlv8hi ((int16x8_t) __a, __b, 2);
-}
+#define vrshlq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vshlv8hi (__a, __b, 2);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrshlq_u32 (uint32x4_t __a, int32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vshlv4si ((int32x4_t) __a, __b, 2);
-}
+#define vrshlq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vshlv4si (__a, __b, 2);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vrshlq_u64 (uint64x2_t __a, int64x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vshlv2di ((int64x2_t) __a, __b, 2);
-}
+#define vrshlq_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vshlv2di (__a, __b, 2);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqshl_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vqshlv8qi (__a, __b, 1);
-}
+#define vqshl_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vqshlv8qi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqshl_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vqshlv4hi (__a, __b, 1);
-}
+#define vqshl_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vqshlv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqshl_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vqshlv2si (__a, __b, 1);
-}
+#define vqshl_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vqshlv2si (__a, __b, 1);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vqshl_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x1_t)__builtin_neon_vqshldi (__a, __b, 1);
-}
+#define vqshl_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vqshldi (__a, __b, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqshl_u8 (uint8x8_t __a, int8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vqshlv8qi ((int8x8_t) __a, __b, 0);
-}
+#define vqshl_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vqshlv8qi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vqshl_u16 (uint16x4_t __a, int16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vqshlv4hi ((int16x4_t) __a, __b, 0);
-}
+#define vqshl_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vqshlv4hi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vqshl_u32 (uint32x2_t __a, int32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vqshlv2si ((int32x2_t) __a, __b, 0);
-}
+#define vqshl_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vqshlv2si (__a, __b, 0);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vqshl_u64 (uint64x1_t __a, int64x1_t __b)
-{
-  return (uint64x1_t)__builtin_neon_vqshldi ((int64x1_t) __a, __b, 0);
-}
+#define vqshl_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vqshldi (__a, __b, 0);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqshlq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vqshlv16qi (__a, __b, 1);
-}
+#define vqshlq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vqshlv16qi (__a, __b, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqshlq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vqshlv8hi (__a, __b, 1);
-}
+#define vqshlq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vqshlv8hi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqshlq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vqshlv4si (__a, __b, 1);
-}
+#define vqshlq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vqshlv4si (__a, __b, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqshlq_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vqshlv2di (__a, __b, 1);
-}
+#define vqshlq_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vqshlv2di (__a, __b, 1);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqshlq_u8 (uint8x16_t __a, int8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vqshlv16qi ((int8x16_t) __a, __b, 0);
-}
+#define vqshlq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vqshlv16qi (__a, __b, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vqshlq_u16 (uint16x8_t __a, int16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vqshlv8hi ((int16x8_t) __a, __b, 0);
-}
+#define vqshlq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vqshlv8hi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vqshlq_u32 (uint32x4_t __a, int32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vqshlv4si ((int32x4_t) __a, __b, 0);
-}
+#define vqshlq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vqshlv4si (__a, __b, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vqshlq_u64 (uint64x2_t __a, int64x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vqshlv2di ((int64x2_t) __a, __b, 0);
-}
+#define vqshlq_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vqshlv2di (__a, __b, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqrshl_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vqshlv8qi (__a, __b, 3);
-}
+#define vqrshl_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vqshlv8qi (__a, __b, 3);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqrshl_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vqshlv4hi (__a, __b, 3);
-}
+#define vqrshl_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vqshlv4hi (__a, __b, 3);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqrshl_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vqshlv2si (__a, __b, 3);
-}
+#define vqrshl_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vqshlv2si (__a, __b, 3);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vqrshl_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x1_t)__builtin_neon_vqshldi (__a, __b, 3);
-}
+#define vqrshl_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vqshldi (__a, __b, 3);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqrshl_u8 (uint8x8_t __a, int8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vqshlv8qi ((int8x8_t) __a, __b, 2);
-}
+#define vqrshl_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vqshlv8qi (__a, __b, 2);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vqrshl_u16 (uint16x4_t __a, int16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vqshlv4hi ((int16x4_t) __a, __b, 2);
-}
+#define vqrshl_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vqshlv4hi (__a, __b, 2);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vqrshl_u32 (uint32x2_t __a, int32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vqshlv2si ((int32x2_t) __a, __b, 2);
-}
+#define vqrshl_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vqshlv2si (__a, __b, 2);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vqrshl_u64 (uint64x1_t __a, int64x1_t __b)
-{
-  return (uint64x1_t)__builtin_neon_vqshldi ((int64x1_t) __a, __b, 2);
-}
+#define vqrshl_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vqshldi (__a, __b, 2);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqrshlq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vqshlv16qi (__a, __b, 3);
-}
+#define vqrshlq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vqshlv16qi (__a, __b, 3);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqrshlq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vqshlv8hi (__a, __b, 3);
-}
+#define vqrshlq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vqshlv8hi (__a, __b, 3);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqrshlq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vqshlv4si (__a, __b, 3);
-}
+#define vqrshlq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vqshlv4si (__a, __b, 3);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqrshlq_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vqshlv2di (__a, __b, 3);
-}
+#define vqrshlq_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vqshlv2di (__a, __b, 3);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqrshlq_u8 (uint8x16_t __a, int8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vqshlv16qi ((int8x16_t) __a, __b, 2);
-}
+#define vqrshlq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vqshlv16qi (__a, __b, 2);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vqrshlq_u16 (uint16x8_t __a, int16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vqshlv8hi ((int16x8_t) __a, __b, 2);
-}
+#define vqrshlq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vqshlv8hi (__a, __b, 2);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vqrshlq_u32 (uint32x4_t __a, int32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vqshlv4si ((int32x4_t) __a, __b, 2);
-}
+#define vqrshlq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vqshlv4si (__a, __b, 2);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vqrshlq_u64 (uint64x2_t __a, int64x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vqshlv2di ((int64x2_t) __a, __b, 2);
-}
+#define vqrshlq_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vqshlv2di (__a, __b, 2);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vshr_n_s8 (int8x8_t __a, const int __b)
-{
-  return (int8x8_t)__builtin_neon_vshr_nv8qi (__a, __b, 1);
-}
+#define vshr_n_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vshr_nv8qi (__a, __b, 1);
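For the _n variants the macro form looks load-bearing rather than
cosmetic: the shift count must reach the builtin as a compile-time
constant (it becomes an immediate in the instruction), and a macro
substitutes the caller's literal directly instead of routing it through
an inline-function parameter.  Presumably this is what lets the LLVM
side see and encode the immediate when it lowers the builtin to an
intrinsic.  A hedged sketch, assuming the vshr_n_s8 macro above (the
function name is illustrative only):

  /* Hypothetical example: the literal 3 is pasted into the builtin
     call, so the required constant shift count is still visible at
     intrinsic-selection time.  */
  int8x8_t shr3 (int8x8_t v)
  {
    return vshr_n_s8 (v, 3); /* -> (int8x8_t)__builtin_neon_vshr_nv8qi (v, 3, 1); */
  }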
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vshr_n_s16 (int16x4_t __a, const int __b)
-{
-  return (int16x4_t)__builtin_neon_vshr_nv4hi (__a, __b, 1);
-}
+#define vshr_n_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vshr_nv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vshr_n_s32 (int32x2_t __a, const int __b)
-{
-  return (int32x2_t)__builtin_neon_vshr_nv2si (__a, __b, 1);
-}
+#define vshr_n_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vshr_nv2si (__a, __b, 1);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vshr_n_s64 (int64x1_t __a, const int __b)
-{
-  return (int64x1_t)__builtin_neon_vshr_ndi (__a, __b, 1);
-}
+#define vshr_n_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vshr_ndi (__a, __b, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vshr_n_u8 (uint8x8_t __a, const int __b)
-{
-  return (uint8x8_t)__builtin_neon_vshr_nv8qi ((int8x8_t) __a, __b, 0);
-}
+#define vshr_n_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vshr_nv8qi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vshr_n_u16 (uint16x4_t __a, const int __b)
-{
-  return (uint16x4_t)__builtin_neon_vshr_nv4hi ((int16x4_t) __a, __b, 0);
-}
+#define vshr_n_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vshr_nv4hi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vshr_n_u32 (uint32x2_t __a, const int __b)
-{
-  return (uint32x2_t)__builtin_neon_vshr_nv2si ((int32x2_t) __a, __b, 0);
-}
+#define vshr_n_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vshr_nv2si (__a, __b, 0);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vshr_n_u64 (uint64x1_t __a, const int __b)
-{
-  return (uint64x1_t)__builtin_neon_vshr_ndi ((int64x1_t) __a, __b, 0);
-}
+#define vshr_n_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vshr_ndi (__a, __b, 0);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vshrq_n_s8 (int8x16_t __a, const int __b)
-{
-  return (int8x16_t)__builtin_neon_vshr_nv16qi (__a, __b, 1);
-}
+#define vshrq_n_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vshr_nv16qi (__a, __b, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vshrq_n_s16 (int16x8_t __a, const int __b)
-{
-  return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
-}
+#define vshrq_n_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vshrq_n_s32 (int32x4_t __a, const int __b)
-{
-  return (int32x4_t)__builtin_neon_vshr_nv4si (__a, __b, 1);
-}
+#define vshrq_n_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vshr_nv4si (__a, __b, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vshrq_n_s64 (int64x2_t __a, const int __b)
-{
-  return (int64x2_t)__builtin_neon_vshr_nv2di (__a, __b, 1);
-}
+#define vshrq_n_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vshr_nv2di (__a, __b, 1);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vshrq_n_u8 (uint8x16_t __a, const int __b)
-{
-  return (uint8x16_t)__builtin_neon_vshr_nv16qi ((int8x16_t) __a, __b, 0);
-}
+#define vshrq_n_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vshr_nv16qi (__a, __b, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vshrq_n_u16 (uint16x8_t __a, const int __b)
-{
-  return (uint16x8_t)__builtin_neon_vshr_nv8hi ((int16x8_t) __a, __b, 0);
-}
+#define vshrq_n_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vshrq_n_u32 (uint32x4_t __a, const int __b)
-{
-  return (uint32x4_t)__builtin_neon_vshr_nv4si ((int32x4_t) __a, __b, 0);
-}
+#define vshrq_n_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vshr_nv4si (__a, __b, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vshrq_n_u64 (uint64x2_t __a, const int __b)
-{
-  return (uint64x2_t)__builtin_neon_vshr_nv2di ((int64x2_t) __a, __b, 0);
-}
+#define vshrq_n_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vshr_nv2di (__a, __b, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrshr_n_s8 (int8x8_t __a, const int __b)
-{
-  return (int8x8_t)__builtin_neon_vshr_nv8qi (__a, __b, 3);
-}
+#define vrshr_n_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vshr_nv8qi (__a, __b, 3);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrshr_n_s16 (int16x4_t __a, const int __b)
-{
-  return (int16x4_t)__builtin_neon_vshr_nv4hi (__a, __b, 3);
-}
+#define vrshr_n_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vshr_nv4hi (__a, __b, 3);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vrshr_n_s32 (int32x2_t __a, const int __b)
-{
-  return (int32x2_t)__builtin_neon_vshr_nv2si (__a, __b, 3);
-}
+#define vrshr_n_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vshr_nv2si (__a, __b, 3);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vrshr_n_s64 (int64x1_t __a, const int __b)
-{
-  return (int64x1_t)__builtin_neon_vshr_ndi (__a, __b, 3);
-}
+#define vrshr_n_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vshr_ndi (__a, __b, 3);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrshr_n_u8 (uint8x8_t __a, const int __b)
-{
-  return (uint8x8_t)__builtin_neon_vshr_nv8qi ((int8x8_t) __a, __b, 2);
-}
+#define vrshr_n_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vshr_nv8qi (__a, __b, 2);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrshr_n_u16 (uint16x4_t __a, const int __b)
-{
-  return (uint16x4_t)__builtin_neon_vshr_nv4hi ((int16x4_t) __a, __b, 2);
-}
+#define vrshr_n_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vshr_nv4hi (__a, __b, 2);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrshr_n_u32 (uint32x2_t __a, const int __b)
-{
-  return (uint32x2_t)__builtin_neon_vshr_nv2si ((int32x2_t) __a, __b, 2);
-}
+#define vrshr_n_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vshr_nv2si (__a, __b, 2);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vrshr_n_u64 (uint64x1_t __a, const int __b)
-{
-  return (uint64x1_t)__builtin_neon_vshr_ndi ((int64x1_t) __a, __b, 2);
-}
+#define vrshr_n_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vshr_ndi (__a, __b, 2);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrshrq_n_s8 (int8x16_t __a, const int __b)
-{
-  return (int8x16_t)__builtin_neon_vshr_nv16qi (__a, __b, 3);
-}
+#define vrshrq_n_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vshr_nv16qi (__a, __b, 3);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrshrq_n_s16 (int16x8_t __a, const int __b)
-{
-  return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 3);
-}
+#define vrshrq_n_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 3);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vrshrq_n_s32 (int32x4_t __a, const int __b)
-{
-  return (int32x4_t)__builtin_neon_vshr_nv4si (__a, __b, 3);
-}
+#define vrshrq_n_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vshr_nv4si (__a, __b, 3);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vrshrq_n_s64 (int64x2_t __a, const int __b)
-{
-  return (int64x2_t)__builtin_neon_vshr_nv2di (__a, __b, 3);
-}
+#define vrshrq_n_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vshr_nv2di (__a, __b, 3);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrshrq_n_u8 (uint8x16_t __a, const int __b)
-{
-  return (uint8x16_t)__builtin_neon_vshr_nv16qi ((int8x16_t) __a, __b, 2);
-}
+#define vrshrq_n_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vshr_nv16qi (__a, __b, 2);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrshrq_n_u16 (uint16x8_t __a, const int __b)
-{
-  return (uint16x8_t)__builtin_neon_vshr_nv8hi ((int16x8_t) __a, __b, 2);
-}
+#define vrshrq_n_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 2);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrshrq_n_u32 (uint32x4_t __a, const int __b)
-{
-  return (uint32x4_t)__builtin_neon_vshr_nv4si ((int32x4_t) __a, __b, 2);
-}
+#define vrshrq_n_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vshr_nv4si (__a, __b, 2);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vrshrq_n_u64 (uint64x2_t __a, const int __b)
-{
-  return (uint64x2_t)__builtin_neon_vshr_nv2di ((int64x2_t) __a, __b, 2);
-}
+#define vrshrq_n_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vshr_nv2di (__a, __b, 2);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vshrn_n_s16 (int16x8_t __a, const int __b)
-{
-  return (int8x8_t)__builtin_neon_vshrn_nv8hi (__a, __b, 1);
-}
+#define vshrn_n_s16(__a, __b) \
+  (int8x8_t)__builtin_neon_vshrn_nv8hi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vshrn_n_s32 (int32x4_t __a, const int __b)
-{
-  return (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 1);
-}
+#define vshrn_n_s32(__a, __b) \
+  (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vshrn_n_s64 (int64x2_t __a, const int __b)
-{
-  return (int32x2_t)__builtin_neon_vshrn_nv2di (__a, __b, 1);
-}
+#define vshrn_n_s64(__a, __b) \
+  (int32x2_t)__builtin_neon_vshrn_nv2di (__a, __b, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vshrn_n_u16 (uint16x8_t __a, const int __b)
-{
-  return (uint8x8_t)__builtin_neon_vshrn_nv8hi ((int16x8_t) __a, __b, 0);
-}
+#define vshrn_n_u16(__a, __b) \
+  (uint8x8_t)__builtin_neon_vshrn_nv8hi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vshrn_n_u32 (uint32x4_t __a, const int __b)
-{
-  return (uint16x4_t)__builtin_neon_vshrn_nv4si ((int32x4_t) __a, __b, 0);
-}
+#define vshrn_n_u32(__a, __b) \
+  (uint16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vshrn_n_u64 (uint64x2_t __a, const int __b)
-{
-  return (uint32x2_t)__builtin_neon_vshrn_nv2di ((int64x2_t) __a, __b, 0);
-}
+#define vshrn_n_u64(__a, __b) \
+  (uint32x2_t)__builtin_neon_vshrn_nv2di (__a, __b, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrshrn_n_s16 (int16x8_t __a, const int __b)
-{
-  return (int8x8_t)__builtin_neon_vshrn_nv8hi (__a, __b, 3);
-}
+#define vrshrn_n_s16(__a, __b) \
+  (int8x8_t)__builtin_neon_vshrn_nv8hi (__a, __b, 3);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrshrn_n_s32 (int32x4_t __a, const int __b)
-{
-  return (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 3);
-}
+#define vrshrn_n_s32(__a, __b) \
+  (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 3);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vrshrn_n_s64 (int64x2_t __a, const int __b)
-{
-  return (int32x2_t)__builtin_neon_vshrn_nv2di (__a, __b, 3);
-}
+#define vrshrn_n_s64(__a, __b) \
+  (int32x2_t)__builtin_neon_vshrn_nv2di (__a, __b, 3);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrshrn_n_u16 (uint16x8_t __a, const int __b)
-{
-  return (uint8x8_t)__builtin_neon_vshrn_nv8hi ((int16x8_t) __a, __b, 2);
-}
+#define vrshrn_n_u16(__a, __b) \
+  (uint8x8_t)__builtin_neon_vshrn_nv8hi (__a, __b, 2);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrshrn_n_u32 (uint32x4_t __a, const int __b)
-{
-  return (uint16x4_t)__builtin_neon_vshrn_nv4si ((int32x4_t) __a, __b, 2);
-}
+#define vrshrn_n_u32(__a, __b) \
+  (uint16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 2);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrshrn_n_u64 (uint64x2_t __a, const int __b)
-{
-  return (uint32x2_t)__builtin_neon_vshrn_nv2di ((int64x2_t) __a, __b, 2);
-}
+#define vrshrn_n_u64(__a, __b) \
+  (uint32x2_t)__builtin_neon_vshrn_nv2di (__a, __b, 2);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqshrn_n_s16 (int16x8_t __a, const int __b)
-{
-  return (int8x8_t)__builtin_neon_vqshrn_nv8hi (__a, __b, 1);
-}
+#define vqshrn_n_s16(__a, __b) \
+  (int8x8_t)__builtin_neon_vqshrn_nv8hi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqshrn_n_s32 (int32x4_t __a, const int __b)
-{
-  return (int16x4_t)__builtin_neon_vqshrn_nv4si (__a, __b, 1);
-}
+#define vqshrn_n_s32(__a, __b) \
+  (int16x4_t)__builtin_neon_vqshrn_nv4si (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqshrn_n_s64 (int64x2_t __a, const int __b)
-{
-  return (int32x2_t)__builtin_neon_vqshrn_nv2di (__a, __b, 1);
-}
+#define vqshrn_n_s64(__a, __b) \
+  (int32x2_t)__builtin_neon_vqshrn_nv2di (__a, __b, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqshrn_n_u16 (uint16x8_t __a, const int __b)
-{
-  return (uint8x8_t)__builtin_neon_vqshrn_nv8hi ((int16x8_t) __a, __b, 0);
-}
+#define vqshrn_n_u16(__a, __b) \
+  (uint8x8_t)__builtin_neon_vqshrn_nv8hi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vqshrn_n_u32 (uint32x4_t __a, const int __b)
-{
-  return (uint16x4_t)__builtin_neon_vqshrn_nv4si ((int32x4_t) __a, __b, 0);
-}
+#define vqshrn_n_u32(__a, __b) \
+  (uint16x4_t)__builtin_neon_vqshrn_nv4si (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vqshrn_n_u64 (uint64x2_t __a, const int __b)
-{
-  return (uint32x2_t)__builtin_neon_vqshrn_nv2di ((int64x2_t) __a, __b, 0);
-}
+#define vqshrn_n_u64(__a, __b) \
+  (uint32x2_t)__builtin_neon_vqshrn_nv2di (__a, __b, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqrshrn_n_s16 (int16x8_t __a, const int __b)
-{
-  return (int8x8_t)__builtin_neon_vqshrn_nv8hi (__a, __b, 3);
-}
+#define vqrshrn_n_s16(__a, __b) \
+  (int8x8_t)__builtin_neon_vqshrn_nv8hi (__a, __b, 3);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqrshrn_n_s32 (int32x4_t __a, const int __b)
-{
-  return (int16x4_t)__builtin_neon_vqshrn_nv4si (__a, __b, 3);
-}
+#define vqrshrn_n_s32(__a, __b) \
+  (int16x4_t)__builtin_neon_vqshrn_nv4si (__a, __b, 3);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqrshrn_n_s64 (int64x2_t __a, const int __b)
-{
-  return (int32x2_t)__builtin_neon_vqshrn_nv2di (__a, __b, 3);
-}
+#define vqrshrn_n_s64(__a, __b) \
+  (int32x2_t)__builtin_neon_vqshrn_nv2di (__a, __b, 3);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqrshrn_n_u16 (uint16x8_t __a, const int __b)
-{
-  return (uint8x8_t)__builtin_neon_vqshrn_nv8hi ((int16x8_t) __a, __b, 2);
-}
+#define vqrshrn_n_u16(__a, __b) \
+  (uint8x8_t)__builtin_neon_vqshrn_nv8hi (__a, __b, 2);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vqrshrn_n_u32 (uint32x4_t __a, const int __b)
-{
-  return (uint16x4_t)__builtin_neon_vqshrn_nv4si ((int32x4_t) __a, __b, 2);
-}
+#define vqrshrn_n_u32(__a, __b) \
+  (uint16x4_t)__builtin_neon_vqshrn_nv4si (__a, __b, 2);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vqrshrn_n_u64 (uint64x2_t __a, const int __b)
-{
-  return (uint32x2_t)__builtin_neon_vqshrn_nv2di ((int64x2_t) __a, __b, 2);
-}
+#define vqrshrn_n_u64(__a, __b) \
+  (uint32x2_t)__builtin_neon_vqshrn_nv2di (__a, __b, 2);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqshrun_n_s16 (int16x8_t __a, const int __b)
-{
-  return (uint8x8_t)__builtin_neon_vqshrun_nv8hi (__a, __b, 1);
-}
+#define vqshrun_n_s16(__a, __b) \
+  (uint8x8_t)__builtin_neon_vqshrun_nv8hi (__a, __b, 1);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vqshrun_n_s32 (int32x4_t __a, const int __b)
-{
-  return (uint16x4_t)__builtin_neon_vqshrun_nv4si (__a, __b, 1);
-}
+#define vqshrun_n_s32(__a, __b) \
+  (uint16x4_t)__builtin_neon_vqshrun_nv4si (__a, __b, 1);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vqshrun_n_s64 (int64x2_t __a, const int __b)
-{
-  return (uint32x2_t)__builtin_neon_vqshrun_nv2di (__a, __b, 1);
-}
+#define vqshrun_n_s64(__a, __b) \
+  (uint32x2_t)__builtin_neon_vqshrun_nv2di (__a, __b, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqrshrun_n_s16 (int16x8_t __a, const int __b)
-{
-  return (uint8x8_t)__builtin_neon_vqshrun_nv8hi (__a, __b, 3);
-}
+#define vqrshrun_n_s16(__a, __b) \
+  (uint8x8_t)__builtin_neon_vqshrun_nv8hi (__a, __b, 3);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vqrshrun_n_s32 (int32x4_t __a, const int __b)
-{
-  return (uint16x4_t)__builtin_neon_vqshrun_nv4si (__a, __b, 3);
-}
+#define vqrshrun_n_s32(__a, __b) \
+  (uint16x4_t)__builtin_neon_vqshrun_nv4si (__a, __b, 3);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vqrshrun_n_s64 (int64x2_t __a, const int __b)
-{
-  return (uint32x2_t)__builtin_neon_vqshrun_nv2di (__a, __b, 3);
-}
+#define vqrshrun_n_s64(__a, __b) \
+  (uint32x2_t)__builtin_neon_vqshrun_nv2di (__a, __b, 3);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vshl_n_s8 (int8x8_t __a, const int __b)
-{
-  return (int8x8_t)__builtin_neon_vshl_nv8qi (__a, __b, 1);
-}
+#define vshl_n_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vshl_nv8qi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vshl_n_s16 (int16x4_t __a, const int __b)
-{
-  return (int16x4_t)__builtin_neon_vshl_nv4hi (__a, __b, 1);
-}
+#define vshl_n_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vshl_nv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vshl_n_s32 (int32x2_t __a, const int __b)
-{
-  return (int32x2_t)__builtin_neon_vshl_nv2si (__a, __b, 1);
-}
+#define vshl_n_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vshl_nv2si (__a, __b, 1);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vshl_n_s64 (int64x1_t __a, const int __b)
-{
-  return (int64x1_t)__builtin_neon_vshl_ndi (__a, __b, 1);
-}
+#define vshl_n_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vshl_ndi (__a, __b, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vshl_n_u8 (uint8x8_t __a, const int __b)
-{
-  return (uint8x8_t)__builtin_neon_vshl_nv8qi ((int8x8_t) __a, __b, 0);
-}
+#define vshl_n_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vshl_nv8qi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vshl_n_u16 (uint16x4_t __a, const int __b)
-{
-  return (uint16x4_t)__builtin_neon_vshl_nv4hi ((int16x4_t) __a, __b, 0);
-}
+#define vshl_n_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vshl_nv4hi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vshl_n_u32 (uint32x2_t __a, const int __b)
-{
-  return (uint32x2_t)__builtin_neon_vshl_nv2si ((int32x2_t) __a, __b, 0);
-}
+#define vshl_n_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vshl_nv2si (__a, __b, 0);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vshl_n_u64 (uint64x1_t __a, const int __b)
-{
-  return (uint64x1_t)__builtin_neon_vshl_ndi ((int64x1_t) __a, __b, 0);
-}
+#define vshl_n_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vshl_ndi (__a, __b, 0);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vshlq_n_s8 (int8x16_t __a, const int __b)
-{
-  return (int8x16_t)__builtin_neon_vshl_nv16qi (__a, __b, 1);
-}
+#define vshlq_n_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vshl_nv16qi (__a, __b, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vshlq_n_s16 (int16x8_t __a, const int __b)
-{
-  return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
-}
+#define vshlq_n_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vshlq_n_s32 (int32x4_t __a, const int __b)
-{
-  return (int32x4_t)__builtin_neon_vshl_nv4si (__a, __b, 1);
-}
+#define vshlq_n_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vshl_nv4si (__a, __b, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vshlq_n_s64 (int64x2_t __a, const int __b)
-{
-  return (int64x2_t)__builtin_neon_vshl_nv2di (__a, __b, 1);
-}
+#define vshlq_n_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vshl_nv2di (__a, __b, 1);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vshlq_n_u8 (uint8x16_t __a, const int __b)
-{
-  return (uint8x16_t)__builtin_neon_vshl_nv16qi ((int8x16_t) __a, __b, 0);
-}
+#define vshlq_n_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vshl_nv16qi (__a, __b, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vshlq_n_u16 (uint16x8_t __a, const int __b)
-{
-  return (uint16x8_t)__builtin_neon_vshl_nv8hi ((int16x8_t) __a, __b, 0);
-}
+#define vshlq_n_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vshlq_n_u32 (uint32x4_t __a, const int __b)
-{
-  return (uint32x4_t)__builtin_neon_vshl_nv4si ((int32x4_t) __a, __b, 0);
-}
+#define vshlq_n_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vshl_nv4si (__a, __b, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vshlq_n_u64 (uint64x2_t __a, const int __b)
-{
-  return (uint64x2_t)__builtin_neon_vshl_nv2di ((int64x2_t) __a, __b, 0);
-}
+#define vshlq_n_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vshl_nv2di (__a, __b, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqshl_n_s8 (int8x8_t __a, const int __b)
-{
-  return (int8x8_t)__builtin_neon_vqshl_nv8qi (__a, __b, 1);
-}
+#define vqshl_n_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vqshl_nv8qi (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqshl_n_s16 (int16x4_t __a, const int __b)
-{
-  return (int16x4_t)__builtin_neon_vqshl_nv4hi (__a, __b, 1);
-}
+#define vqshl_n_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vqshl_nv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqshl_n_s32 (int32x2_t __a, const int __b)
-{
-  return (int32x2_t)__builtin_neon_vqshl_nv2si (__a, __b, 1);
-}
+#define vqshl_n_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vqshl_nv2si (__a, __b, 1);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vqshl_n_s64 (int64x1_t __a, const int __b)
-{
-  return (int64x1_t)__builtin_neon_vqshl_ndi (__a, __b, 1);
-}
+#define vqshl_n_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vqshl_ndi (__a, __b, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqshl_n_u8 (uint8x8_t __a, const int __b)
-{
-  return (uint8x8_t)__builtin_neon_vqshl_nv8qi ((int8x8_t) __a, __b, 0);
-}
+#define vqshl_n_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vqshl_nv8qi (__a, __b, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vqshl_n_u16 (uint16x4_t __a, const int __b)
-{
-  return (uint16x4_t)__builtin_neon_vqshl_nv4hi ((int16x4_t) __a, __b, 0);
-}
+#define vqshl_n_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vqshl_nv4hi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vqshl_n_u32 (uint32x2_t __a, const int __b)
-{
-  return (uint32x2_t)__builtin_neon_vqshl_nv2si ((int32x2_t) __a, __b, 0);
-}
+#define vqshl_n_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vqshl_nv2si (__a, __b, 0);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vqshl_n_u64 (uint64x1_t __a, const int __b)
-{
-  return (uint64x1_t)__builtin_neon_vqshl_ndi ((int64x1_t) __a, __b, 0);
-}
+#define vqshl_n_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vqshl_ndi (__a, __b, 0);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqshlq_n_s8 (int8x16_t __a, const int __b)
-{
-  return (int8x16_t)__builtin_neon_vqshl_nv16qi (__a, __b, 1);
-}
+#define vqshlq_n_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vqshl_nv16qi (__a, __b, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqshlq_n_s16 (int16x8_t __a, const int __b)
-{
-  return (int16x8_t)__builtin_neon_vqshl_nv8hi (__a, __b, 1);
-}
+#define vqshlq_n_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vqshl_nv8hi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqshlq_n_s32 (int32x4_t __a, const int __b)
-{
-  return (int32x4_t)__builtin_neon_vqshl_nv4si (__a, __b, 1);
-}
+#define vqshlq_n_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vqshl_nv4si (__a, __b, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqshlq_n_s64 (int64x2_t __a, const int __b)
-{
-  return (int64x2_t)__builtin_neon_vqshl_nv2di (__a, __b, 1);
-}
+#define vqshlq_n_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vqshl_nv2di (__a, __b, 1);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqshlq_n_u8 (uint8x16_t __a, const int __b)
-{
-  return (uint8x16_t)__builtin_neon_vqshl_nv16qi ((int8x16_t) __a, __b, 0);
-}
+#define vqshlq_n_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vqshl_nv16qi (__a, __b, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vqshlq_n_u16 (uint16x8_t __a, const int __b)
-{
-  return (uint16x8_t)__builtin_neon_vqshl_nv8hi ((int16x8_t) __a, __b, 0);
-}
+#define vqshlq_n_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vqshl_nv8hi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vqshlq_n_u32 (uint32x4_t __a, const int __b)
-{
-  return (uint32x4_t)__builtin_neon_vqshl_nv4si ((int32x4_t) __a, __b, 0);
-}
+#define vqshlq_n_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vqshl_nv4si (__a, __b, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vqshlq_n_u64 (uint64x2_t __a, const int __b)
-{
-  return (uint64x2_t)__builtin_neon_vqshl_nv2di ((int64x2_t) __a, __b, 0);
-}
+#define vqshlq_n_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vqshl_nv2di (__a, __b, 0);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqshlu_n_s8 (int8x8_t __a, const int __b)
-{
-  return (uint8x8_t)__builtin_neon_vqshlu_nv8qi (__a, __b, 1);
-}
+#define vqshlu_n_s8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vqshlu_nv8qi (__a, __b, 1);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vqshlu_n_s16 (int16x4_t __a, const int __b)
-{
-  return (uint16x4_t)__builtin_neon_vqshlu_nv4hi (__a, __b, 1);
-}
+#define vqshlu_n_s16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vqshlu_nv4hi (__a, __b, 1);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vqshlu_n_s32 (int32x2_t __a, const int __b)
-{
-  return (uint32x2_t)__builtin_neon_vqshlu_nv2si (__a, __b, 1);
-}
+#define vqshlu_n_s32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vqshlu_nv2si (__a, __b, 1);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vqshlu_n_s64 (int64x1_t __a, const int __b)
-{
-  return (uint64x1_t)__builtin_neon_vqshlu_ndi (__a, __b, 1);
-}
+#define vqshlu_n_s64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vqshlu_ndi (__a, __b, 1);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vqshluq_n_s8 (int8x16_t __a, const int __b)
-{
-  return (uint8x16_t)__builtin_neon_vqshlu_nv16qi (__a, __b, 1);
-}
+#define vqshluq_n_s8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vqshlu_nv16qi (__a, __b, 1);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vqshluq_n_s16 (int16x8_t __a, const int __b)
-{
-  return (uint16x8_t)__builtin_neon_vqshlu_nv8hi (__a, __b, 1);
-}
+#define vqshluq_n_s16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vqshlu_nv8hi (__a, __b, 1);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vqshluq_n_s32 (int32x4_t __a, const int __b)
-{
-  return (uint32x4_t)__builtin_neon_vqshlu_nv4si (__a, __b, 1);
-}
+#define vqshluq_n_s32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vqshlu_nv4si (__a, __b, 1);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vqshluq_n_s64 (int64x2_t __a, const int __b)
-{
-  return (uint64x2_t)__builtin_neon_vqshlu_nv2di (__a, __b, 1);
-}
+#define vqshluq_n_s64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vqshlu_nv2di (__a, __b, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vshll_n_s8 (int8x8_t __a, const int __b)
-{
-  return (int16x8_t)__builtin_neon_vshll_nv8qi (__a, __b, 1);
-}
+#define vshll_n_s8(__a, __b) \
+  (int16x8_t)__builtin_neon_vshll_nv8qi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vshll_n_s16 (int16x4_t __a, const int __b)
-{
-  return (int32x4_t)__builtin_neon_vshll_nv4hi (__a, __b, 1);
-}
+#define vshll_n_s16(__a, __b) \
+  (int32x4_t)__builtin_neon_vshll_nv4hi (__a, __b, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vshll_n_s32 (int32x2_t __a, const int __b)
-{
-  return (int64x2_t)__builtin_neon_vshll_nv2si (__a, __b, 1);
-}
+#define vshll_n_s32(__a, __b) \
+  (int64x2_t)__builtin_neon_vshll_nv2si (__a, __b, 1);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vshll_n_u8 (uint8x8_t __a, const int __b)
-{
-  return (uint16x8_t)__builtin_neon_vshll_nv8qi ((int8x8_t) __a, __b, 0);
-}
+#define vshll_n_u8(__a, __b) \
+  (uint16x8_t)__builtin_neon_vshll_nv8qi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vshll_n_u16 (uint16x4_t __a, const int __b)
-{
-  return (uint32x4_t)__builtin_neon_vshll_nv4hi ((int16x4_t) __a, __b, 0);
-}
+#define vshll_n_u16(__a, __b) \
+  (uint32x4_t)__builtin_neon_vshll_nv4hi (__a, __b, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vshll_n_u32 (uint32x2_t __a, const int __b)
-{
-  return (uint64x2_t)__builtin_neon_vshll_nv2si ((int32x2_t) __a, __b, 0);
-}
+#define vshll_n_u32(__a, __b) \
+  (uint64x2_t)__builtin_neon_vshll_nv2si (__a, __b, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
-{
-  return (int8x8_t)__builtin_neon_vsra_nv8qi (__a, __b, __c, 1);
-}
+#define vsra_n_s8(__a, __b, __c) \
+  (int8x8_t)__builtin_neon_vsra_nv8qi (__a, __b, __c, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-{
-  return (int16x4_t)__builtin_neon_vsra_nv4hi (__a, __b, __c, 1);
-}
+#define vsra_n_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vsra_nv4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-{
-  return (int32x2_t)__builtin_neon_vsra_nv2si (__a, __b, __c, 1);
-}
+#define vsra_n_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vsra_nv2si (__a, __b, __c, 1);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
-{
-  return (int64x1_t)__builtin_neon_vsra_ndi (__a, __b, __c, 1);
-}
+#define vsra_n_s64(__a, __b, __c) \
+  (int64x1_t)__builtin_neon_vsra_ndi (__a, __b, __c, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
-{
-  return (uint8x8_t)__builtin_neon_vsra_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c, 0);
-}
+#define vsra_n_u8(__a, __b, __c) \
+  (uint8x8_t)__builtin_neon_vsra_nv8qi (__a, __b, __c, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
-{
-  return (uint16x4_t)__builtin_neon_vsra_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 0);
-}
+#define vsra_n_u16(__a, __b, __c) \
+  (uint16x4_t)__builtin_neon_vsra_nv4hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
-{
-  return (uint32x2_t)__builtin_neon_vsra_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c, 0);
-}
+#define vsra_n_u32(__a, __b, __c) \
+  (uint32x2_t)__builtin_neon_vsra_nv2si (__a, __b, __c, 0);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
-{
-  return (uint64x1_t)__builtin_neon_vsra_ndi ((int64x1_t) __a, (int64x1_t) __b, __c, 0);
-}
+#define vsra_n_u64(__a, __b, __c) \
+  (uint64x1_t)__builtin_neon_vsra_ndi (__a, __b, __c, 0);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
-{
-  return (int8x16_t)__builtin_neon_vsra_nv16qi (__a, __b, __c, 1);
-}
+#define vsraq_n_s8(__a, __b, __c) \
+  (int8x16_t)__builtin_neon_vsra_nv16qi (__a, __b, __c, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-{
-  return (int16x8_t)__builtin_neon_vsra_nv8hi (__a, __b, __c, 1);
-}
+#define vsraq_n_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vsra_nv8hi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-{
-  return (int32x4_t)__builtin_neon_vsra_nv4si (__a, __b, __c, 1);
-}
+#define vsraq_n_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vsra_nv4si (__a, __b, __c, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
-{
-  return (int64x2_t)__builtin_neon_vsra_nv2di (__a, __b, __c, 1);
-}
+#define vsraq_n_s64(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vsra_nv2di (__a, __b, __c, 1);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
-{
-  return (uint8x16_t)__builtin_neon_vsra_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c, 0);
-}
+#define vsraq_n_u8(__a, __b, __c) \
+  (uint8x16_t)__builtin_neon_vsra_nv16qi (__a, __b, __c, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
-{
-  return (uint16x8_t)__builtin_neon_vsra_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c, 0);
-}
+#define vsraq_n_u16(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vsra_nv8hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
-{
-  return (uint32x4_t)__builtin_neon_vsra_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c, 0);
-}
+#define vsraq_n_u32(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vsra_nv4si (__a, __b, __c, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
-{
-  return (uint64x2_t)__builtin_neon_vsra_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c, 0);
-}
+#define vsraq_n_u64(__a, __b, __c) \
+  (uint64x2_t)__builtin_neon_vsra_nv2di (__a, __b, __c, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
-{
-  return (int8x8_t)__builtin_neon_vsra_nv8qi (__a, __b, __c, 3);
-}
+#define vrsra_n_s8(__a, __b, __c) \
+  (int8x8_t)__builtin_neon_vsra_nv8qi (__a, __b, __c, 3);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-{
-  return (int16x4_t)__builtin_neon_vsra_nv4hi (__a, __b, __c, 3);
-}
+#define vrsra_n_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vsra_nv4hi (__a, __b, __c, 3);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-{
-  return (int32x2_t)__builtin_neon_vsra_nv2si (__a, __b, __c, 3);
-}
+#define vrsra_n_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vsra_nv2si (__a, __b, __c, 3);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
-{
-  return (int64x1_t)__builtin_neon_vsra_ndi (__a, __b, __c, 3);
-}
+#define vrsra_n_s64(__a, __b, __c) \
+  (int64x1_t)__builtin_neon_vsra_ndi (__a, __b, __c, 3);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
-{
-  return (uint8x8_t)__builtin_neon_vsra_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c, 2);
-}
+#define vrsra_n_u8(__a, __b, __c) \
+  (uint8x8_t)__builtin_neon_vsra_nv8qi (__a, __b, __c, 2);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
-{
-  return (uint16x4_t)__builtin_neon_vsra_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 2);
-}
+#define vrsra_n_u16(__a, __b, __c) \
+  (uint16x4_t)__builtin_neon_vsra_nv4hi (__a, __b, __c, 2);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
-{
-  return (uint32x2_t)__builtin_neon_vsra_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c, 2);
-}
+#define vrsra_n_u32(__a, __b, __c) \
+  (uint32x2_t)__builtin_neon_vsra_nv2si (__a, __b, __c, 2);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
-{
-  return (uint64x1_t)__builtin_neon_vsra_ndi ((int64x1_t) __a, (int64x1_t) __b, __c, 2);
-}
+#define vrsra_n_u64(__a, __b, __c) \
+  (uint64x1_t)__builtin_neon_vsra_ndi (__a, __b, __c, 2);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
-{
-  return (int8x16_t)__builtin_neon_vsra_nv16qi (__a, __b, __c, 3);
-}
+#define vrsraq_n_s8(__a, __b, __c) \
+  (int8x16_t)__builtin_neon_vsra_nv16qi (__a, __b, __c, 3);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-{
-  return (int16x8_t)__builtin_neon_vsra_nv8hi (__a, __b, __c, 3);
-}
+#define vrsraq_n_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vsra_nv8hi (__a, __b, __c, 3);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-{
-  return (int32x4_t)__builtin_neon_vsra_nv4si (__a, __b, __c, 3);
-}
+#define vrsraq_n_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vsra_nv4si (__a, __b, __c, 3);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
-{
-  return (int64x2_t)__builtin_neon_vsra_nv2di (__a, __b, __c, 3);
-}
+#define vrsraq_n_s64(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vsra_nv2di (__a, __b, __c, 3);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
-{
-  return (uint8x16_t)__builtin_neon_vsra_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c, 2);
-}
+#define vrsraq_n_u8(__a, __b, __c) \
+  (uint8x16_t)__builtin_neon_vsra_nv16qi (__a, __b, __c, 2);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
-{
-  return (uint16x8_t)__builtin_neon_vsra_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c, 2);
-}
+#define vrsraq_n_u16(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vsra_nv8hi (__a, __b, __c, 2);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
-{
-  return (uint32x4_t)__builtin_neon_vsra_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c, 2);
-}
+#define vrsraq_n_u32(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vsra_nv4si (__a, __b, __c, 2);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
-{
-  return (uint64x2_t)__builtin_neon_vsra_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c, 2);
-}
+#define vrsraq_n_u64(__a, __b, __c) \
+  (uint64x2_t)__builtin_neon_vsra_nv2di (__a, __b, __c, 2);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
-{
-  return (int8x8_t)__builtin_neon_vsri_nv8qi (__a, __b, __c);
-}
+#define vsri_n_s8(__a, __b, __c) \
+  (int8x8_t)__builtin_neon_vsri_nv8qi (__a, __b, __c);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-{
-  return (int16x4_t)__builtin_neon_vsri_nv4hi (__a, __b, __c);
-}
+#define vsri_n_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vsri_nv4hi (__a, __b, __c);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-{
-  return (int32x2_t)__builtin_neon_vsri_nv2si (__a, __b, __c);
-}
+#define vsri_n_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vsri_nv2si (__a, __b, __c);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
-{
-  return (int64x1_t)__builtin_neon_vsri_ndi (__a, __b, __c);
-}
+#define vsri_n_s64(__a, __b, __c) \
+  (int64x1_t)__builtin_neon_vsri_ndi (__a, __b, __c);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
-{
-  return (uint8x8_t)__builtin_neon_vsri_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
-}
+#define vsri_n_u8(__a, __b, __c) \
+  (uint8x8_t)__builtin_neon_vsri_nv8qi (__a, __b, __c);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
-{
-  return (uint16x4_t)__builtin_neon_vsri_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
-}
+#define vsri_n_u16(__a, __b, __c) \
+  (uint16x4_t)__builtin_neon_vsri_nv4hi (__a, __b, __c);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
-{
-  return (uint32x2_t)__builtin_neon_vsri_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
-}
+#define vsri_n_u32(__a, __b, __c) \
+  (uint32x2_t)__builtin_neon_vsri_nv2si (__a, __b, __c);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
-{
-  return (uint64x1_t)__builtin_neon_vsri_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
-}
+#define vsri_n_u64(__a, __b, __c) \
+  (uint64x1_t)__builtin_neon_vsri_ndi (__a, __b, __c);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vsri_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
-{
-  return (poly8x8_t)__builtin_neon_vsri_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
-}
+#define vsri_n_p8(__a, __b, __c) \
+  (poly8x8_t)__builtin_neon_vsri_nv8qi (__a, __b, __c);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vsri_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
-{
-  return (poly16x4_t)__builtin_neon_vsri_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
-}
+#define vsri_n_p16(__a, __b, __c) \
+  (poly16x4_t)__builtin_neon_vsri_nv4hi (__a, __b, __c);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
-{
-  return (int8x16_t)__builtin_neon_vsri_nv16qi (__a, __b, __c);
-}
+#define vsriq_n_s8(__a, __b, __c) \
+  (int8x16_t)__builtin_neon_vsri_nv16qi (__a, __b, __c);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-{
-  return (int16x8_t)__builtin_neon_vsri_nv8hi (__a, __b, __c);
-}
+#define vsriq_n_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vsri_nv8hi (__a, __b, __c);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-{
-  return (int32x4_t)__builtin_neon_vsri_nv4si (__a, __b, __c);
-}
+#define vsriq_n_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vsri_nv4si (__a, __b, __c);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
-{
-  return (int64x2_t)__builtin_neon_vsri_nv2di (__a, __b, __c);
-}
+#define vsriq_n_s64(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vsri_nv2di (__a, __b, __c);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
-{
-  return (uint8x16_t)__builtin_neon_vsri_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
-}
+#define vsriq_n_u8(__a, __b, __c) \
+  (uint8x16_t)__builtin_neon_vsri_nv16qi (__a, __b, __c);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
-{
-  return (uint16x8_t)__builtin_neon_vsri_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
-}
+#define vsriq_n_u16(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vsri_nv8hi (__a, __b, __c);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
-{
-  return (uint32x4_t)__builtin_neon_vsri_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
-}
+#define vsriq_n_u32(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vsri_nv4si (__a, __b, __c);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
-{
-  return (uint64x2_t)__builtin_neon_vsri_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
-}
+#define vsriq_n_u64(__a, __b, __c) \
+  (uint64x2_t)__builtin_neon_vsri_nv2di (__a, __b, __c);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vsriq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
-{
-  return (poly8x16_t)__builtin_neon_vsri_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
-}
+#define vsriq_n_p8(__a, __b, __c) \
+  (poly8x16_t)__builtin_neon_vsri_nv16qi (__a, __b, __c);
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vsriq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
-{
-  return (poly16x8_t)__builtin_neon_vsri_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
-}
+#define vsriq_n_p16(__a, __b, __c) \
+  (poly16x8_t)__builtin_neon_vsri_nv8hi (__a, __b, __c);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
-{
-  return (int8x8_t)__builtin_neon_vsli_nv8qi (__a, __b, __c);
-}
+#define vsli_n_s8(__a, __b, __c) \
+  (int8x8_t)__builtin_neon_vsli_nv8qi (__a, __b, __c);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-{
-  return (int16x4_t)__builtin_neon_vsli_nv4hi (__a, __b, __c);
-}
+#define vsli_n_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vsli_nv4hi (__a, __b, __c);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-{
-  return (int32x2_t)__builtin_neon_vsli_nv2si (__a, __b, __c);
-}
+#define vsli_n_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vsli_nv2si (__a, __b, __c);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
-{
-  return (int64x1_t)__builtin_neon_vsli_ndi (__a, __b, __c);
-}
+#define vsli_n_s64(__a, __b, __c) \
+  (int64x1_t)__builtin_neon_vsli_ndi (__a, __b, __c);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
-{
-  return (uint8x8_t)__builtin_neon_vsli_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
-}
+#define vsli_n_u8(__a, __b, __c) \
+  (uint8x8_t)__builtin_neon_vsli_nv8qi (__a, __b, __c);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
-{
-  return (uint16x4_t)__builtin_neon_vsli_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
-}
+#define vsli_n_u16(__a, __b, __c) \
+  (uint16x4_t)__builtin_neon_vsli_nv4hi (__a, __b, __c);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
-{
-  return (uint32x2_t)__builtin_neon_vsli_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
-}
+#define vsli_n_u32(__a, __b, __c) \
+  (uint32x2_t)__builtin_neon_vsli_nv2si (__a, __b, __c);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
-{
-  return (uint64x1_t)__builtin_neon_vsli_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
-}
+#define vsli_n_u64(__a, __b, __c) \
+  (uint64x1_t)__builtin_neon_vsli_ndi (__a, __b, __c);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vsli_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
-{
-  return (poly8x8_t)__builtin_neon_vsli_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
-}
+#define vsli_n_p8(__a, __b, __c) \
+  (poly8x8_t)__builtin_neon_vsli_nv8qi (__a, __b, __c);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vsli_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
-{
-  return (poly16x4_t)__builtin_neon_vsli_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
-}
+#define vsli_n_p16(__a, __b, __c) \
+  (poly16x4_t)__builtin_neon_vsli_nv4hi (__a, __b, __c);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
-{
-  return (int8x16_t)__builtin_neon_vsli_nv16qi (__a, __b, __c);
-}
+#define vsliq_n_s8(__a, __b, __c) \
+  (int8x16_t)__builtin_neon_vsli_nv16qi (__a, __b, __c);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-{
-  return (int16x8_t)__builtin_neon_vsli_nv8hi (__a, __b, __c);
-}
+#define vsliq_n_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vsli_nv8hi (__a, __b, __c);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-{
-  return (int32x4_t)__builtin_neon_vsli_nv4si (__a, __b, __c);
-}
+#define vsliq_n_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vsli_nv4si (__a, __b, __c);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
-{
-  return (int64x2_t)__builtin_neon_vsli_nv2di (__a, __b, __c);
-}
+#define vsliq_n_s64(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vsli_nv2di (__a, __b, __c);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
-{
-  return (uint8x16_t)__builtin_neon_vsli_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
-}
+#define vsliq_n_u8(__a, __b, __c) \
+  (uint8x16_t)__builtin_neon_vsli_nv16qi (__a, __b, __c);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
-{
-  return (uint16x8_t)__builtin_neon_vsli_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
-}
+#define vsliq_n_u16(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vsli_nv8hi (__a, __b, __c);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
-{
-  return (uint32x4_t)__builtin_neon_vsli_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
-}
+#define vsliq_n_u32(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vsli_nv4si (__a, __b, __c);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
-{
-  return (uint64x2_t)__builtin_neon_vsli_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
-}
+#define vsliq_n_u64(__a, __b, __c) \
+  (uint64x2_t)__builtin_neon_vsli_nv2di (__a, __b, __c);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vsliq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
-{
-  return (poly8x16_t)__builtin_neon_vsli_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
-}
+#define vsliq_n_p8(__a, __b, __c) \
+  (poly8x16_t)__builtin_neon_vsli_nv16qi (__a, __b, __c);
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vsliq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
-{
-  return (poly16x8_t)__builtin_neon_vsli_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
-}
+#define vsliq_n_p16(__a, __b, __c) \
+  (poly16x8_t)__builtin_neon_vsli_nv8hi (__a, __b, __c);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vabs_s8 (int8x8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vabsv8qi (__a, 1);
-}
+#define vabs_s8(__a) \
+  (int8x8_t)__builtin_neon_vabsv8qi (__a, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vabs_s16 (int16x4_t __a)
-{
-  return (int16x4_t)__builtin_neon_vabsv4hi (__a, 1);
-}
+#define vabs_s16(__a) \
+  (int16x4_t)__builtin_neon_vabsv4hi (__a, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vabs_s32 (int32x2_t __a)
-{
-  return (int32x2_t)__builtin_neon_vabsv2si (__a, 1);
-}
+#define vabs_s32(__a) \
+  (int32x2_t)__builtin_neon_vabsv2si (__a, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vabs_f32 (float32x2_t __a)
-{
-  return (float32x2_t)__builtin_neon_vabsv2sf (__a, 5);
-}
+#define vabs_f32(__a) \
+  (float32x2_t)__builtin_neon_vabsv2sf (__a, 5);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vabsq_s8 (int8x16_t __a)
-{
-  return (int8x16_t)__builtin_neon_vabsv16qi (__a, 1);
-}
+#define vabsq_s8(__a) \
+  (int8x16_t)__builtin_neon_vabsv16qi (__a, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vabsq_s16 (int16x8_t __a)
-{
-  return (int16x8_t)__builtin_neon_vabsv8hi (__a, 1);
-}
+#define vabsq_s16(__a) \
+  (int16x8_t)__builtin_neon_vabsv8hi (__a, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vabsq_s32 (int32x4_t __a)
-{
-  return (int32x4_t)__builtin_neon_vabsv4si (__a, 1);
-}
+#define vabsq_s32(__a) \
+  (int32x4_t)__builtin_neon_vabsv4si (__a, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vabsq_f32 (float32x4_t __a)
-{
-  return (float32x4_t)__builtin_neon_vabsv4sf (__a, 5);
-}
+#define vabsq_f32(__a) \
+  (float32x4_t)__builtin_neon_vabsv4sf (__a, 5);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqabs_s8 (int8x8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vqabsv8qi (__a, 1);
-}
+#define vqabs_s8(__a) \
+  (int8x8_t)__builtin_neon_vqabsv8qi (__a, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqabs_s16 (int16x4_t __a)
-{
-  return (int16x4_t)__builtin_neon_vqabsv4hi (__a, 1);
-}
+#define vqabs_s16(__a) \
+  (int16x4_t)__builtin_neon_vqabsv4hi (__a, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqabs_s32 (int32x2_t __a)
-{
-  return (int32x2_t)__builtin_neon_vqabsv2si (__a, 1);
-}
+#define vqabs_s32(__a) \
+  (int32x2_t)__builtin_neon_vqabsv2si (__a, 1);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqabsq_s8 (int8x16_t __a)
-{
-  return (int8x16_t)__builtin_neon_vqabsv16qi (__a, 1);
-}
+#define vqabsq_s8(__a) \
+  (int8x16_t)__builtin_neon_vqabsv16qi (__a, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqabsq_s16 (int16x8_t __a)
-{
-  return (int16x8_t)__builtin_neon_vqabsv8hi (__a, 1);
-}
+#define vqabsq_s16(__a) \
+  (int16x8_t)__builtin_neon_vqabsv8hi (__a, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqabsq_s32 (int32x4_t __a)
-{
-  return (int32x4_t)__builtin_neon_vqabsv4si (__a, 1);
-}
+#define vqabsq_s32(__a) \
+  (int32x4_t)__builtin_neon_vqabsv4si (__a, 1);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vneg_s8 (int8x8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vnegv8qi (__a, 1);
-}
+#define vneg_s8(__a) \
+  (int8x8_t)__builtin_neon_vnegv8qi (__a, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vneg_s16 (int16x4_t __a)
-{
-  return (int16x4_t)__builtin_neon_vnegv4hi (__a, 1);
-}
+#define vneg_s16(__a) \
+  (int16x4_t)__builtin_neon_vnegv4hi (__a, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vneg_s32 (int32x2_t __a)
-{
-  return (int32x2_t)__builtin_neon_vnegv2si (__a, 1);
-}
+#define vneg_s32(__a) \
+  (int32x2_t)__builtin_neon_vnegv2si (__a, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vneg_f32 (float32x2_t __a)
-{
-  return (float32x2_t)__builtin_neon_vnegv2sf (__a, 5);
-}
+#define vneg_f32(__a) \
+  (float32x2_t)__builtin_neon_vnegv2sf (__a, 5);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vnegq_s8 (int8x16_t __a)
-{
-  return (int8x16_t)__builtin_neon_vnegv16qi (__a, 1);
-}
+#define vnegq_s8(__a) \
+  (int8x16_t)__builtin_neon_vnegv16qi (__a, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vnegq_s16 (int16x8_t __a)
-{
-  return (int16x8_t)__builtin_neon_vnegv8hi (__a, 1);
-}
+#define vnegq_s16(__a) \
+  (int16x8_t)__builtin_neon_vnegv8hi (__a, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vnegq_s32 (int32x4_t __a)
-{
-  return (int32x4_t)__builtin_neon_vnegv4si (__a, 1);
-}
+#define vnegq_s32(__a) \
+  (int32x4_t)__builtin_neon_vnegv4si (__a, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vnegq_f32 (float32x4_t __a)
-{
-  return (float32x4_t)__builtin_neon_vnegv4sf (__a, 5);
-}
+#define vnegq_f32(__a) \
+  (float32x4_t)__builtin_neon_vnegv4sf (__a, 5);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqneg_s8 (int8x8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vqnegv8qi (__a, 1);
-}
+#define vqneg_s8(__a) \
+  (int8x8_t)__builtin_neon_vqnegv8qi (__a, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqneg_s16 (int16x4_t __a)
-{
-  return (int16x4_t)__builtin_neon_vqnegv4hi (__a, 1);
-}
+#define vqneg_s16(__a) \
+  (int16x4_t)__builtin_neon_vqnegv4hi (__a, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqneg_s32 (int32x2_t __a)
-{
-  return (int32x2_t)__builtin_neon_vqnegv2si (__a, 1);
-}
+#define vqneg_s32(__a) \
+  (int32x2_t)__builtin_neon_vqnegv2si (__a, 1);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vqnegq_s8 (int8x16_t __a)
-{
-  return (int8x16_t)__builtin_neon_vqnegv16qi (__a, 1);
-}
+#define vqnegq_s8(__a) \
+  (int8x16_t)__builtin_neon_vqnegv16qi (__a, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqnegq_s16 (int16x8_t __a)
-{
-  return (int16x8_t)__builtin_neon_vqnegv8hi (__a, 1);
-}
+#define vqnegq_s16(__a) \
+  (int16x8_t)__builtin_neon_vqnegv8hi (__a, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqnegq_s32 (int32x4_t __a)
-{
-  return (int32x4_t)__builtin_neon_vqnegv4si (__a, 1);
-}
+#define vqnegq_s32(__a) \
+  (int32x4_t)__builtin_neon_vqnegv4si (__a, 1);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmvn_s8 (int8x8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vmvnv8qi (__a, 1);
-}
+#define vmvn_s8(__a) \
+  (int8x8_t)__builtin_neon_vmvnv8qi (__a, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmvn_s16 (int16x4_t __a)
-{
-  return (int16x4_t)__builtin_neon_vmvnv4hi (__a, 1);
-}
+#define vmvn_s16(__a) \
+  (int16x4_t)__builtin_neon_vmvnv4hi (__a, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmvn_s32 (int32x2_t __a)
-{
-  return (int32x2_t)__builtin_neon_vmvnv2si (__a, 1);
-}
+#define vmvn_s32(__a) \
+  (int32x2_t)__builtin_neon_vmvnv2si (__a, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmvn_u8 (uint8x8_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vmvnv8qi ((int8x8_t) __a, 0);
-}
+#define vmvn_u8(__a) \
+  (uint8x8_t)__builtin_neon_vmvnv8qi (__a, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmvn_u16 (uint16x4_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vmvnv4hi ((int16x4_t) __a, 0);
-}
+#define vmvn_u16(__a) \
+  (uint16x4_t)__builtin_neon_vmvnv4hi (__a, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmvn_u32 (uint32x2_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vmvnv2si ((int32x2_t) __a, 0);
-}
+#define vmvn_u32(__a) \
+  (uint32x2_t)__builtin_neon_vmvnv2si (__a, 0);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vmvn_p8 (poly8x8_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vmvnv8qi ((int8x8_t) __a, 4);
-}
+#define vmvn_p8(__a) \
+  (poly8x8_t)__builtin_neon_vmvnv8qi (__a, 4);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vmvnq_s8 (int8x16_t __a)
-{
-  return (int8x16_t)__builtin_neon_vmvnv16qi (__a, 1);
-}
+#define vmvnq_s8(__a) \
+  (int8x16_t)__builtin_neon_vmvnv16qi (__a, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmvnq_s16 (int16x8_t __a)
-{
-  return (int16x8_t)__builtin_neon_vmvnv8hi (__a, 1);
-}
+#define vmvnq_s16(__a) \
+  (int16x8_t)__builtin_neon_vmvnv8hi (__a, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmvnq_s32 (int32x4_t __a)
-{
-  return (int32x4_t)__builtin_neon_vmvnv4si (__a, 1);
-}
+#define vmvnq_s32(__a) \
+  (int32x4_t)__builtin_neon_vmvnv4si (__a, 1);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vmvnq_u8 (uint8x16_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vmvnv16qi ((int8x16_t) __a, 0);
-}
+#define vmvnq_u8(__a) \
+  (uint8x16_t)__builtin_neon_vmvnv16qi (__a, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmvnq_u16 (uint16x8_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vmvnv8hi ((int16x8_t) __a, 0);
-}
+#define vmvnq_u16(__a) \
+  (uint16x8_t)__builtin_neon_vmvnv8hi (__a, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmvnq_u32 (uint32x4_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vmvnv4si ((int32x4_t) __a, 0);
-}
+#define vmvnq_u32(__a) \
+  (uint32x4_t)__builtin_neon_vmvnv4si (__a, 0);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vmvnq_p8 (poly8x16_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vmvnv16qi ((int8x16_t) __a, 4);
-}
+#define vmvnq_p8(__a) \
+  (poly8x16_t)__builtin_neon_vmvnv16qi (__a, 4);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vcls_s8 (int8x8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vclsv8qi (__a, 1);
-}
+#define vcls_s8(__a) \
+  (int8x8_t)__builtin_neon_vclsv8qi (__a, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vcls_s16 (int16x4_t __a)
-{
-  return (int16x4_t)__builtin_neon_vclsv4hi (__a, 1);
-}
+#define vcls_s16(__a) \
+  (int16x4_t)__builtin_neon_vclsv4hi (__a, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcls_s32 (int32x2_t __a)
-{
-  return (int32x2_t)__builtin_neon_vclsv2si (__a, 1);
-}
+#define vcls_s32(__a) \
+  (int32x2_t)__builtin_neon_vclsv2si (__a, 1);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vclsq_s8 (int8x16_t __a)
-{
-  return (int8x16_t)__builtin_neon_vclsv16qi (__a, 1);
-}
+#define vclsq_s8(__a) \
+  (int8x16_t)__builtin_neon_vclsv16qi (__a, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vclsq_s16 (int16x8_t __a)
-{
-  return (int16x8_t)__builtin_neon_vclsv8hi (__a, 1);
-}
+#define vclsq_s16(__a) \
+  (int16x8_t)__builtin_neon_vclsv8hi (__a, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vclsq_s32 (int32x4_t __a)
-{
-  return (int32x4_t)__builtin_neon_vclsv4si (__a, 1);
-}
+#define vclsq_s32(__a) \
+  (int32x4_t)__builtin_neon_vclsv4si (__a, 1);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vclz_s8 (int8x8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vclzv8qi (__a, 1);
-}
+#define vclz_s8(__a) \
+  (int8x8_t)__builtin_neon_vclzv8qi (__a, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vclz_s16 (int16x4_t __a)
-{
-  return (int16x4_t)__builtin_neon_vclzv4hi (__a, 1);
-}
+#define vclz_s16(__a) \
+  (int16x4_t)__builtin_neon_vclzv4hi (__a, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vclz_s32 (int32x2_t __a)
-{
-  return (int32x2_t)__builtin_neon_vclzv2si (__a, 1);
-}
+#define vclz_s32(__a) \
+  (int32x2_t)__builtin_neon_vclzv2si (__a, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vclz_u8 (uint8x8_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vclzv8qi ((int8x8_t) __a, 0);
-}
+#define vclz_u8(__a) \
+  (uint8x8_t)__builtin_neon_vclzv8qi (__a, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vclz_u16 (uint16x4_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vclzv4hi ((int16x4_t) __a, 0);
-}
+#define vclz_u16(__a) \
+  (uint16x4_t)__builtin_neon_vclzv4hi (__a, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vclz_u32 (uint32x2_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vclzv2si ((int32x2_t) __a, 0);
-}
+#define vclz_u32(__a) \
+  (uint32x2_t)__builtin_neon_vclzv2si (__a, 0);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vclzq_s8 (int8x16_t __a)
-{
-  return (int8x16_t)__builtin_neon_vclzv16qi (__a, 1);
-}
+#define vclzq_s8(__a) \
+  (int8x16_t)__builtin_neon_vclzv16qi (__a, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vclzq_s16 (int16x8_t __a)
-{
-  return (int16x8_t)__builtin_neon_vclzv8hi (__a, 1);
-}
+#define vclzq_s16(__a) \
+  (int16x8_t)__builtin_neon_vclzv8hi (__a, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vclzq_s32 (int32x4_t __a)
-{
-  return (int32x4_t)__builtin_neon_vclzv4si (__a, 1);
-}
+#define vclzq_s32(__a) \
+  (int32x4_t)__builtin_neon_vclzv4si (__a, 1);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vclzq_u8 (uint8x16_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vclzv16qi ((int8x16_t) __a, 0);
-}
+#define vclzq_u8(__a) \
+  (uint8x16_t)__builtin_neon_vclzv16qi (__a, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vclzq_u16 (uint16x8_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vclzv8hi ((int16x8_t) __a, 0);
-}
+#define vclzq_u16(__a) \
+  (uint16x8_t)__builtin_neon_vclzv8hi (__a, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vclzq_u32 (uint32x4_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vclzv4si ((int32x4_t) __a, 0);
-}
+#define vclzq_u32(__a) \
+  (uint32x4_t)__builtin_neon_vclzv4si (__a, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vcnt_s8 (int8x8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vcntv8qi (__a, 1);
-}
+#define vcnt_s8(__a) \
+  (int8x8_t)__builtin_neon_vcntv8qi (__a, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcnt_u8 (uint8x8_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vcntv8qi ((int8x8_t) __a, 0);
-}
+#define vcnt_u8(__a) \
+  (uint8x8_t)__builtin_neon_vcntv8qi (__a, 0);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vcnt_p8 (poly8x8_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vcntv8qi ((int8x8_t) __a, 4);
-}
+#define vcnt_p8(__a) \
+  (poly8x8_t)__builtin_neon_vcntv8qi (__a, 4);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vcntq_s8 (int8x16_t __a)
-{
-  return (int8x16_t)__builtin_neon_vcntv16qi (__a, 1);
-}
+#define vcntq_s8(__a) \
+  (int8x16_t)__builtin_neon_vcntv16qi (__a, 1);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcntq_u8 (uint8x16_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vcntv16qi ((int8x16_t) __a, 0);
-}
+#define vcntq_u8(__a) \
+  (uint8x16_t)__builtin_neon_vcntv16qi (__a, 0);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vcntq_p8 (poly8x16_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vcntv16qi ((int8x16_t) __a, 4);
-}
+#define vcntq_p8(__a) \
+  (poly8x16_t)__builtin_neon_vcntv16qi (__a, 4);
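
A usage sketch for the vclz/vcnt macros above (illustrative only, not part of the patch). Note that the macro bodies keep the trailing semicolon of the old function bodies, so each intrinsic can appear only as a complete statement; nesting one call inside another expression would not compile. vadd_u8 is assumed to be defined earlier in this header:

    #include <arm_neon.h>

    /* Per-lane leading-zero and set-bit counts, combined.  */
    uint8x8_t
    count_bits (uint8x8_t v)
    {
      uint8x8_t lz, pop;
      lz = vclz_u8 (v);    /* leading zeros in each 8-bit lane */
      pop = vcnt_u8 (v);   /* set bits in each 8-bit lane */
      return vadd_u8 (lz, pop);
    }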
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrecpe_f32 (float32x2_t __a)
-{
-  return (float32x2_t)__builtin_neon_vrecpev2sf (__a, 5);
-}
+#define vrecpe_f32(__a) \
+  (float32x2_t)__builtin_neon_vrecpev2sf (__a, 5);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrecpe_u32 (uint32x2_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vrecpev2si ((int32x2_t) __a, 0);
-}
+#define vrecpe_u32(__a) \
+  (uint32x2_t)__builtin_neon_vrecpev2si (__a, 0);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrecpeq_f32 (float32x4_t __a)
-{
-  return (float32x4_t)__builtin_neon_vrecpev4sf (__a, 5);
-}
+#define vrecpeq_f32(__a) \
+  (float32x4_t)__builtin_neon_vrecpev4sf (__a, 5);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrecpeq_u32 (uint32x4_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vrecpev4si ((int32x4_t) __a, 0);
-}
+#define vrecpeq_u32(__a) \
+  (uint32x4_t)__builtin_neon_vrecpev4si (__a, 0);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrsqrte_f32 (float32x2_t __a)
-{
-  return (float32x2_t)__builtin_neon_vrsqrtev2sf (__a, 5);
-}
+#define vrsqrte_f32(__a) \
+  (float32x2_t)__builtin_neon_vrsqrtev2sf (__a, 5);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrsqrte_u32 (uint32x2_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vrsqrtev2si ((int32x2_t) __a, 0);
-}
+#define vrsqrte_u32(__a) \
+  (uint32x2_t)__builtin_neon_vrsqrtev2si (__a, 0);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrsqrteq_f32 (float32x4_t __a)
-{
-  return (float32x4_t)__builtin_neon_vrsqrtev4sf (__a, 5);
-}
+#define vrsqrteq_f32(__a) \
+  (float32x4_t)__builtin_neon_vrsqrtev4sf (__a, 5);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrsqrteq_u32 (uint32x4_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vrsqrtev4si ((int32x4_t) __a, 0);
-}
+#define vrsqrteq_u32(__a) \
+  (uint32x4_t)__builtin_neon_vrsqrtev4si (__a, 0);
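
The vrecpe/vrsqrte macros produce only initial estimates. A sketch of the usual refinement, assuming vrecps_f32 and vmul_f32 are defined elsewhere in this header (one Newton-Raphson step roughly doubles the precision):

    #include <arm_neon.h>

    /* Approximate 1/x: initial estimate plus one refinement step.
       vrecps_f32 computes 2 - x*e, so e * (2 - x*e) is the refined
       estimate.  */
    float32x2_t
    approx_recip (float32x2_t x)
    {
      float32x2_t e, s;
      e = vrecpe_f32 (x);
      s = vrecps_f32 (x, e);
      return vmul_f32 (e, s);
    }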
 
-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-vget_lane_s8 (int8x8_t __a, const int __b)
-{
-  return (int8_t)__builtin_neon_vget_lanev8qi (__a, __b, 1);
-}
+#define vget_lane_s8(__a, __b) \
+  (int8_t)__builtin_neon_vget_lanev8qi (__a, __b, 1);
 
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vget_lane_s16 (int16x4_t __a, const int __b)
-{
-  return (int16_t)__builtin_neon_vget_lanev4hi (__a, __b, 1);
-}
+#define vget_lane_s16(__a, __b) \
+  (int16_t)__builtin_neon_vget_lanev4hi (__a, __b, 1);
 
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vget_lane_s32 (int32x2_t __a, const int __b)
-{
-  return (int32_t)__builtin_neon_vget_lanev2si (__a, __b, 1);
-}
+#define vget_lane_s32(__a, __b) \
+  (int32_t)__builtin_neon_vget_lanev2si (__a, __b, 1);
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vget_lane_f32 (float32x2_t __a, const int __b)
-{
-  return (float32_t)__builtin_neon_vget_lanev2sf (__a, __b, 5);
-}
+#define vget_lane_f32(__a, __b) \
+  (float32_t)__builtin_neon_vget_lanev2sf (__a, __b, 5);
 
-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-vget_lane_u8 (uint8x8_t __a, const int __b)
-{
-  return (uint8_t)__builtin_neon_vget_lanev8qi ((int8x8_t) __a, __b, 0);
-}
+#define vget_lane_u8(__a, __b) \
+  (uint8_t)__builtin_neon_vget_lanev8qi (__a, __b, 0);
 
-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-vget_lane_u16 (uint16x4_t __a, const int __b)
-{
-  return (uint16_t)__builtin_neon_vget_lanev4hi ((int16x4_t) __a, __b, 0);
-}
+#define vget_lane_u16(__a, __b) \
+  (uint16_t)__builtin_neon_vget_lanev4hi (__a, __b, 0);
 
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vget_lane_u32 (uint32x2_t __a, const int __b)
-{
-  return (uint32_t)__builtin_neon_vget_lanev2si ((int32x2_t) __a, __b, 0);
-}
+#define vget_lane_u32(__a, __b) \
+  (uint32_t)__builtin_neon_vget_lanev2si (__a, __b, 0);
 
-__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
-vget_lane_p8 (poly8x8_t __a, const int __b)
-{
-  return (poly8_t)__builtin_neon_vget_lanev8qi ((int8x8_t) __a, __b, 4);
-}
+#define vget_lane_p8(__a, __b) \
+  (poly8_t)__builtin_neon_vget_lanev8qi (__a, __b, 4);
 
-__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
-vget_lane_p16 (poly16x4_t __a, const int __b)
-{
-  return (poly16_t)__builtin_neon_vget_lanev4hi ((int16x4_t) __a, __b, 4);
-}
+#define vget_lane_p16(__a, __b) \
+  (poly16_t)__builtin_neon_vget_lanev4hi (__a, __b, 4);
 
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vget_lane_s64 (int64x1_t __a, const int __b)
-{
-  return (int64_t)__builtin_neon_vget_lanedi (__a, __b, 1);
-}
+#define vget_lane_s64(__a, __b) \
+  (int64_t)__builtin_neon_vget_lanedi (__a, __b, 1);
 
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vget_lane_u64 (uint64x1_t __a, const int __b)
-{
-  return (uint64_t)__builtin_neon_vget_lanedi ((int64x1_t) __a, __b, 0);
-}
+#define vget_lane_u64(__a, __b) \
+  (uint64_t)__builtin_neon_vget_lanedi (__a, __b, 0);
 
-__extension__ static __inline int8_t __attribute__ ((__always_inline__))
-vgetq_lane_s8 (int8x16_t __a, const int __b)
-{
-  return (int8_t)__builtin_neon_vget_lanev16qi (__a, __b, 1);
-}
+#define vgetq_lane_s8(__a, __b) \
+  (int8_t)__builtin_neon_vget_lanev16qi (__a, __b, 1);
 
-__extension__ static __inline int16_t __attribute__ ((__always_inline__))
-vgetq_lane_s16 (int16x8_t __a, const int __b)
-{
-  return (int16_t)__builtin_neon_vget_lanev8hi (__a, __b, 1);
-}
+#define vgetq_lane_s16(__a, __b) \
+  (int16_t)__builtin_neon_vget_lanev8hi (__a, __b, 1);
 
-__extension__ static __inline int32_t __attribute__ ((__always_inline__))
-vgetq_lane_s32 (int32x4_t __a, const int __b)
-{
-  return (int32_t)__builtin_neon_vget_lanev4si (__a, __b, 1);
-}
+#define vgetq_lane_s32(__a, __b) \
+  (int32_t)__builtin_neon_vget_lanev4si (__a, __b, 1);
 
-__extension__ static __inline float32_t __attribute__ ((__always_inline__))
-vgetq_lane_f32 (float32x4_t __a, const int __b)
-{
-  return (float32_t)__builtin_neon_vget_lanev4sf (__a, __b, 5);
-}
+#define vgetq_lane_f32(__a, __b) \
+  (float32_t)__builtin_neon_vget_lanev4sf (__a, __b, 5);
 
-__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
-vgetq_lane_u8 (uint8x16_t __a, const int __b)
-{
-  return (uint8_t)__builtin_neon_vget_lanev16qi ((int8x16_t) __a, __b, 0);
-}
+#define vgetq_lane_u8(__a, __b) \
+  (uint8_t)__builtin_neon_vget_lanev16qi (__a, __b, 0);
 
-__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
-vgetq_lane_u16 (uint16x8_t __a, const int __b)
-{
-  return (uint16_t)__builtin_neon_vget_lanev8hi ((int16x8_t) __a, __b, 0);
-}
+#define vgetq_lane_u16(__a, __b) \
+  (uint16_t)__builtin_neon_vget_lanev8hi (__a, __b, 0);
 
-__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
-vgetq_lane_u32 (uint32x4_t __a, const int __b)
-{
-  return (uint32_t)__builtin_neon_vget_lanev4si ((int32x4_t) __a, __b, 0);
-}
+#define vgetq_lane_u32(__a, __b) \
+  (uint32_t)__builtin_neon_vget_lanev4si (__a, __b, 0);
 
-__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
-vgetq_lane_p8 (poly8x16_t __a, const int __b)
-{
-  return (poly8_t)__builtin_neon_vget_lanev16qi ((int8x16_t) __a, __b, 4);
-}
+#define vgetq_lane_p8(__a, __b) \
+  (poly8_t)__builtin_neon_vget_lanev16qi (__a, __b, 4);
 
-__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
-vgetq_lane_p16 (poly16x8_t __a, const int __b)
-{
-  return (poly16_t)__builtin_neon_vget_lanev8hi ((int16x8_t) __a, __b, 4);
-}
+#define vgetq_lane_p16(__a, __b) \
+  (poly16_t)__builtin_neon_vget_lanev8hi (__a, __b, 4);
 
-__extension__ static __inline int64_t __attribute__ ((__always_inline__))
-vgetq_lane_s64 (int64x2_t __a, const int __b)
-{
-  return (int64_t)__builtin_neon_vget_lanev2di (__a, __b, 1);
-}
+#define vgetq_lane_s64(__a, __b) \
+  (int64_t)__builtin_neon_vget_lanev2di (__a, __b, 1);
 
-__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
-vgetq_lane_u64 (uint64x2_t __a, const int __b)
-{
-  return (uint64_t)__builtin_neon_vget_lanev2di ((int64x2_t) __a, __b, 0);
-}
+#define vgetq_lane_u64(__a, __b) \
+  (uint64_t)__builtin_neon_vget_lanev2di (__a, __b, 0);
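
For the lane accessors the macro form has a practical advantage: the lane number is substituted textually, so it reaches the builtin as the constant expression it must be. A minimal sketch:

    #include <stdint.h>
    #include <arm_neon.h>

    /* Extract lane 0; the lane index must be a compile-time
       constant in the range 0..3 for a 4-lane vector.  */
    int32_t
    first_lane (int32x4_t v)
    {
      return vgetq_lane_s32 (v, 0);
    }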
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vset_lane_s8 (int8_t __a, int8x8_t __b, const int __c)
-{
-  return (int8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, __b, __c);
-}
+#define vset_lane_s8(__a, __b, __c) \
+  (int8x8_t)__builtin_neon_vset_lanev8qi (__a, __b, __c);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vset_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
-{
-  return (int16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, __b, __c);
-}
+#define vset_lane_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vset_lanev4hi (__a, __b, __c);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
-{
-  return (int32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, __b, __c);
-}
+#define vset_lane_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vset_lanev2si (__a, __b, __c);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vset_lane_f32 (float32_t __a, float32x2_t __b, const int __c)
-{
-  return (float32x2_t)__builtin_neon_vset_lanev2sf (__a, __b, __c);
-}
+#define vset_lane_f32(__a, __b, __c) \
+  (float32x2_t)__builtin_neon_vset_lanev2sf (__a, __b, __c);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vset_lane_u8 (uint8_t __a, uint8x8_t __b, const int __c)
-{
-  return (uint8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, (int8x8_t) __b, __c);
-}
+#define vset_lane_u8(__a, __b, __c) \
+  (uint8x8_t)__builtin_neon_vset_lanev8qi (__a, __b, __c);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vset_lane_u16 (uint16_t __a, uint16x4_t __b, const int __c)
-{
-  return (uint16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, (int16x4_t) __b, __c);
-}
+#define vset_lane_u16(__a, __b, __c) \
+  (uint16x4_t)__builtin_neon_vset_lanev4hi (__a, __b, __c);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vset_lane_u32 (uint32_t __a, uint32x2_t __b, const int __c)
-{
-  return (uint32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, (int32x2_t) __b, __c);
-}
+#define vset_lane_u32(__a, __b, __c) \
+  (uint32x2_t)__builtin_neon_vset_lanev2si (__a, __b, __c);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vset_lane_p8 (poly8_t __a, poly8x8_t __b, const int __c)
-{
-  return (poly8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, (int8x8_t) __b, __c);
-}
+#define vset_lane_p8(__a, __b, __c) \
+  (poly8x8_t)__builtin_neon_vset_lanev8qi (__a, __b, __c);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vset_lane_p16 (poly16_t __a, poly16x4_t __b, const int __c)
-{
-  return (poly16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, (int16x4_t) __b, __c);
-}
+#define vset_lane_p16(__a, __b, __c) \
+  (poly16x4_t)__builtin_neon_vset_lanev4hi (__a, __b, __c);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vset_lane_s64 (int64_t __a, int64x1_t __b, const int __c)
-{
-  return (int64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, __b, __c);
-}
+#define vset_lane_s64(__a, __b, __c) \
+  (int64x1_t)__builtin_neon_vset_lanedi (__a, __b, __c);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vset_lane_u64 (uint64_t __a, uint64x1_t __b, const int __c)
-{
-  return (uint64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, (int64x1_t) __b, __c);
-}
+#define vset_lane_u64(__a, __b, __c) \
+  (uint64x1_t)__builtin_neon_vset_lanedi (__a, __b, __c);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vsetq_lane_s8 (int8_t __a, int8x16_t __b, const int __c)
-{
-  return (int8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, __b, __c);
-}
+#define vsetq_lane_s8(__a, __b, __c) \
+  (int8x16_t)__builtin_neon_vset_lanev16qi (__a, __b, __c);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vsetq_lane_s16 (int16_t __a, int16x8_t __b, const int __c)
-{
-  return (int16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, __b, __c);
-}
+#define vsetq_lane_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vset_lanev8hi (__a, __b, __c);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
-{
-  return (int32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, __b, __c);
-}
+#define vsetq_lane_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vset_lanev4si (__a, __b, __c);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vsetq_lane_f32 (float32_t __a, float32x4_t __b, const int __c)
-{
-  return (float32x4_t)__builtin_neon_vset_lanev4sf (__a, __b, __c);
-}
+#define vsetq_lane_f32(__a, __b, __c) \
+  (float32x4_t)__builtin_neon_vset_lanev4sf (__a, __b, __c);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vsetq_lane_u8 (uint8_t __a, uint8x16_t __b, const int __c)
-{
-  return (uint8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, (int8x16_t) __b, __c);
-}
+#define vsetq_lane_u8(__a, __b, __c) \
+  (uint8x16_t)__builtin_neon_vset_lanev16qi (__a, __b, __c);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vsetq_lane_u16 (uint16_t __a, uint16x8_t __b, const int __c)
-{
-  return (uint16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, (int16x8_t) __b, __c);
-}
+#define vsetq_lane_u16(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vset_lanev8hi (__a, __b, __c);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vsetq_lane_u32 (uint32_t __a, uint32x4_t __b, const int __c)
-{
-  return (uint32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, (int32x4_t) __b, __c);
-}
+#define vsetq_lane_u32(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vset_lanev4si (__a, __b, __c);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vsetq_lane_p8 (poly8_t __a, poly8x16_t __b, const int __c)
-{
-  return (poly8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, (int8x16_t) __b, __c);
-}
+#define vsetq_lane_p8(__a, __b, __c) \
+  (poly8x16_t)__builtin_neon_vset_lanev16qi (__a, __b, __c);
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vsetq_lane_p16 (poly16_t __a, poly16x8_t __b, const int __c)
-{
-  return (poly16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, (int16x8_t) __b, __c);
-}
+#define vsetq_lane_p16(__a, __b, __c) \
+  (poly16x8_t)__builtin_neon_vset_lanev8hi (__a, __b, __c);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vsetq_lane_s64 (int64_t __a, int64x2_t __b, const int __c)
-{
-  return (int64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, __b, __c);
-}
+#define vsetq_lane_s64(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vset_lanev2di (__a, __b, __c);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vsetq_lane_u64 (uint64_t __a, uint64x2_t __b, const int __c)
-{
-  return (uint64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, (int64x2_t) __b, __c);
-}
+#define vsetq_lane_u64(__a, __b, __c) \
+  (uint64x2_t)__builtin_neon_vset_lanev2di (__a, __b, __c);
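
The vset_lane forms take the scalar first, then the vector, then the constant lane index. A sketch:

    #include <arm_neon.h>

    /* Return a copy of v with lane 3 replaced by x.  */
    float32x4_t
    put_lane3 (float32x4_t v, float32_t x)
    {
      return vsetq_lane_f32 (x, v, 3);
    }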
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vcreate_s8 (uint64_t __a)
-{
-  return (int8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
-}
+#define vcreate_s8(__a) \
+  (int8x8_t)__builtin_neon_vcreatev8qi (__a);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vcreate_s16 (uint64_t __a)
-{
-  return (int16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
-}
+#define vcreate_s16(__a) \
+  (int16x4_t)__builtin_neon_vcreatev4hi (__a);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcreate_s32 (uint64_t __a)
-{
-  return (int32x2_t)__builtin_neon_vcreatev2si ((__builtin_neon_di) __a);
-}
+#define vcreate_s32(__a) \
+  (int32x2_t)__builtin_neon_vcreatev2si (__a);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vcreate_s64 (uint64_t __a)
-{
-  return (int64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
-}
+#define vcreate_s64(__a) \
+  (int64x1_t)__builtin_neon_vcreatedi (__a);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcreate_f32 (uint64_t __a)
-{
-  return (float32x2_t)__builtin_neon_vcreatev2sf ((__builtin_neon_di) __a);
-}
+#define vcreate_f32(__a) \
+  (float32x2_t)__builtin_neon_vcreatev2sf (__a);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vcreate_u8 (uint64_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
-}
+#define vcreate_u8(__a) \
+  (uint8x8_t)__builtin_neon_vcreatev8qi (__a);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vcreate_u16 (uint64_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
-}
+#define vcreate_u16(__a) \
+  (uint16x4_t)__builtin_neon_vcreatev4hi (__a);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcreate_u32 (uint64_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vcreatev2si ((__builtin_neon_di) __a);
-}
+#define vcreate_u32(__a) \
+  (uint32x2_t)__builtin_neon_vcreatev2si (__a);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vcreate_u64 (uint64_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
-}
+#define vcreate_u64(__a) \
+  (uint64x1_t)__builtin_neon_vcreatedi (__a);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vcreate_p8 (uint64_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
-}
+#define vcreate_p8(__a) \
+  (poly8x8_t)__builtin_neon_vcreatev8qi (__a);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vcreate_p16 (uint64_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
-}
+#define vcreate_p16(__a) \
+  (poly16x4_t)__builtin_neon_vcreatev4hi (__a);
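
vcreate reinterprets a 64-bit scalar as a d-register vector. A sketch; the lane-0-takes-the-low-byte ordering assumes a little-endian target:

    #include <arm_neon.h>

    /* Build the byte vector {0,1,2,...,7} from a 64-bit literal.  */
    uint8x8_t
    ramp (void)
    {
      return vcreate_u8 (0x0706050403020100ULL);
    }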
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vdup_n_s8 (int8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
-}
+#define vdup_n_s8(__a) \
+  (int8x8_t)__builtin_neon_vdup_nv8qi (__a);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vdup_n_s16 (int16_t __a)
-{
-  return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
-}
+#define vdup_n_s16(__a) \
+  (int16x4_t)__builtin_neon_vdup_nv4hi (__a);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vdup_n_s32 (int32_t __a)
-{
-  return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
-}
+#define vdup_n_s32(__a) \
+  (int32x2_t)__builtin_neon_vdup_nv2si (__a);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vdup_n_f32 (float32_t __a)
-{
-  return (float32x2_t)__builtin_neon_vdup_nv2sf (__a);
-}
+#define vdup_n_f32(__a) \
+  (float32x2_t)__builtin_neon_vdup_nv2sf (__a);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vdup_n_u8 (uint8_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
-}
+#define vdup_n_u8(__a) \
+  (uint8x8_t)__builtin_neon_vdup_nv8qi (__a);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vdup_n_u16 (uint16_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
-}
+#define vdup_n_u16(__a) \
+  (uint16x4_t)__builtin_neon_vdup_nv4hi (__a);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vdup_n_u32 (uint32_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
-}
+#define vdup_n_u32(__a) \
+  (uint32x2_t)__builtin_neon_vdup_nv2si (__a);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vdup_n_p8 (poly8_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
-}
+#define vdup_n_p8(__a) \
+  (poly8x8_t)__builtin_neon_vdup_nv8qi (__a);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vdup_n_p16 (poly16_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
-}
+#define vdup_n_p16(__a) \
+  (poly16x4_t)__builtin_neon_vdup_nv4hi (__a);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vdup_n_s64 (int64_t __a)
-{
-  return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
-}
+#define vdup_n_s64(__a) \
+  (int64x1_t)__builtin_neon_vdup_ndi (__a);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vdup_n_u64 (uint64_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
-}
+#define vdup_n_u64(__a) \
+  (uint64x1_t)__builtin_neon_vdup_ndi (__a);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vdupq_n_s8 (int8_t __a)
-{
-  return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
-}
+#define vdupq_n_s8(__a) \
+  (int8x16_t)__builtin_neon_vdup_nv16qi (__a);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vdupq_n_s16 (int16_t __a)
-{
-  return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
-}
+#define vdupq_n_s16(__a) \
+  (int16x8_t)__builtin_neon_vdup_nv8hi (__a);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vdupq_n_s32 (int32_t __a)
-{
-  return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
-}
+#define vdupq_n_s32(__a) \
+  (int32x4_t)__builtin_neon_vdup_nv4si (__a);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vdupq_n_f32 (float32_t __a)
-{
-  return (float32x4_t)__builtin_neon_vdup_nv4sf (__a);
-}
+#define vdupq_n_f32(__a) \
+  (float32x4_t)__builtin_neon_vdup_nv4sf (__a);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vdupq_n_u8 (uint8_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
-}
+#define vdupq_n_u8(__a) \
+  (uint8x16_t)__builtin_neon_vdup_nv16qi (__a);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vdupq_n_u16 (uint16_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
-}
+#define vdupq_n_u16(__a) \
+  (uint16x8_t)__builtin_neon_vdup_nv8hi (__a);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vdupq_n_u32 (uint32_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
-}
+#define vdupq_n_u32(__a) \
+  (uint32x4_t)__builtin_neon_vdup_nv4si (__a);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vdupq_n_p8 (poly8_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
-}
+#define vdupq_n_p8(__a) \
+  (poly8x16_t)__builtin_neon_vdup_nv16qi (__a);
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vdupq_n_p16 (poly16_t __a)
-{
-  return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
-}
+#define vdupq_n_p16(__a) \
+  (poly16x8_t)__builtin_neon_vdup_nv8hi (__a);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vdupq_n_s64 (int64_t __a)
-{
-  return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
-}
+#define vdupq_n_s64(__a) \
+  (int64x2_t)__builtin_neon_vdup_nv2di (__a);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vdupq_n_u64 (uint64_t __a)
-{
-  return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
-}
+#define vdupq_n_u64(__a) \
+  (uint64x2_t)__builtin_neon_vdup_nv2di (__a);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmov_n_s8 (int8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
-}
+#define vmov_n_s8(__a) \
+  (int8x8_t)__builtin_neon_vdup_nv8qi (__a);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmov_n_s16 (int16_t __a)
-{
-  return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
-}
+#define vmov_n_s16(__a) \
+  (int16x4_t)__builtin_neon_vdup_nv4hi (__a);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmov_n_s32 (int32_t __a)
-{
-  return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
-}
+#define vmov_n_s32(__a) \
+  (int32x2_t)__builtin_neon_vdup_nv2si (__a);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmov_n_f32 (float32_t __a)
-{
-  return (float32x2_t)__builtin_neon_vdup_nv2sf (__a);
-}
+#define vmov_n_f32(__a) \
+  (float32x2_t)__builtin_neon_vdup_nv2sf (__a);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmov_n_u8 (uint8_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
-}
+#define vmov_n_u8(__a) \
+  (uint8x8_t)__builtin_neon_vdup_nv8qi (__a);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmov_n_u16 (uint16_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
-}
+#define vmov_n_u16(__a) \
+  (uint16x4_t)__builtin_neon_vdup_nv4hi (__a);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmov_n_u32 (uint32_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
-}
+#define vmov_n_u32(__a) \
+  (uint32x2_t)__builtin_neon_vdup_nv2si (__a);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vmov_n_p8 (poly8_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
-}
+#define vmov_n_p8(__a) \
+  (poly8x8_t)__builtin_neon_vdup_nv8qi (__a);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vmov_n_p16 (poly16_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
-}
+#define vmov_n_p16(__a) \
+  (poly16x4_t)__builtin_neon_vdup_nv4hi (__a);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vmov_n_s64 (int64_t __a)
-{
-  return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
-}
+#define vmov_n_s64(__a) \
+  (int64x1_t)__builtin_neon_vdup_ndi (__a);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vmov_n_u64 (uint64_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
-}
+#define vmov_n_u64(__a) \
+  (uint64x1_t)__builtin_neon_vdup_ndi (__a);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vmovq_n_s8 (int8_t __a)
-{
-  return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
-}
+#define vmovq_n_s8(__a) \
+  (int8x16_t)__builtin_neon_vdup_nv16qi (__a);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmovq_n_s16 (int16_t __a)
-{
-  return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
-}
+#define vmovq_n_s16(__a) \
+  (int16x8_t)__builtin_neon_vdup_nv8hi (__a);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmovq_n_s32 (int32_t __a)
-{
-  return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
-}
+#define vmovq_n_s32(__a) \
+  (int32x4_t)__builtin_neon_vdup_nv4si (__a);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmovq_n_f32 (float32_t __a)
-{
-  return (float32x4_t)__builtin_neon_vdup_nv4sf (__a);
-}
+#define vmovq_n_f32(__a) \
+  (float32x4_t)__builtin_neon_vdup_nv4sf (__a);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vmovq_n_u8 (uint8_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
-}
+#define vmovq_n_u8(__a) \
+  (uint8x16_t)__builtin_neon_vdup_nv16qi (__a);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmovq_n_u16 (uint16_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
-}
+#define vmovq_n_u16(__a) \
+  (uint16x8_t)__builtin_neon_vdup_nv8hi (__a);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmovq_n_u32 (uint32_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
-}
+#define vmovq_n_u32(__a) \
+  (uint32x4_t)__builtin_neon_vdup_nv4si (__a);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vmovq_n_p8 (poly8_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
-}
+#define vmovq_n_p8(__a) \
+  (poly8x16_t)__builtin_neon_vdup_nv16qi (__a);
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vmovq_n_p16 (poly16_t __a)
-{
-  return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
-}
+#define vmovq_n_p16(__a) \
+  (poly16x8_t)__builtin_neon_vdup_nv8hi (__a);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmovq_n_s64 (int64_t __a)
-{
-  return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
-}
+#define vmovq_n_s64(__a) \
+  (int64x2_t)__builtin_neon_vdup_nv2di (__a);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmovq_n_u64 (uint64_t __a)
-{
-  return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
-}
+#define vmovq_n_u64(__a) \
+  (uint64x2_t)__builtin_neon_vdup_nv2di (__a);
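
As the identical expansions show, vmov_n_*/vmovq_n_* (below) are synonyms for the corresponding vdup_n_*/vdupq_n_* splats above. A sketch:

    #include <stdint.h>
    #include <arm_neon.h>

    /* Broadcast one scalar into all eight 16-bit lanes.  */
    int16x8_t
    splat16 (int16_t x)
    {
      return vdupq_n_s16 (x);
    }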
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vdup_lane_s8 (int8x8_t __a, const int __b)
-{
-  return (int8x8_t)__builtin_neon_vdup_lanev8qi (__a, __b);
-}
+#define vdup_lane_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vdup_lanev8qi (__a, __b);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vdup_lane_s16 (int16x4_t __a, const int __b)
-{
-  return (int16x4_t)__builtin_neon_vdup_lanev4hi (__a, __b);
-}
+#define vdup_lane_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vdup_lanev4hi (__a, __b);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vdup_lane_s32 (int32x2_t __a, const int __b)
-{
-  return (int32x2_t)__builtin_neon_vdup_lanev2si (__a, __b);
-}
+#define vdup_lane_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vdup_lanev2si (__a, __b);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vdup_lane_f32 (float32x2_t __a, const int __b)
-{
-  return (float32x2_t)__builtin_neon_vdup_lanev2sf (__a, __b);
-}
+#define vdup_lane_f32(__a, __b) \
+  (float32x2_t)__builtin_neon_vdup_lanev2sf (__a, __b);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vdup_lane_u8 (uint8x8_t __a, const int __b)
-{
-  return (uint8x8_t)__builtin_neon_vdup_lanev8qi ((int8x8_t) __a, __b);
-}
+#define vdup_lane_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vdup_lanev8qi (__a, __b);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vdup_lane_u16 (uint16x4_t __a, const int __b)
-{
-  return (uint16x4_t)__builtin_neon_vdup_lanev4hi ((int16x4_t) __a, __b);
-}
+#define vdup_lane_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vdup_lanev4hi (__a, __b);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vdup_lane_u32 (uint32x2_t __a, const int __b)
-{
-  return (uint32x2_t)__builtin_neon_vdup_lanev2si ((int32x2_t) __a, __b);
-}
+#define vdup_lane_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vdup_lanev2si (__a, __b);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vdup_lane_p8 (poly8x8_t __a, const int __b)
-{
-  return (poly8x8_t)__builtin_neon_vdup_lanev8qi ((int8x8_t) __a, __b);
-}
+#define vdup_lane_p8(__a, __b) \
+  (poly8x8_t)__builtin_neon_vdup_lanev8qi (__a, __b);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vdup_lane_p16 (poly16x4_t __a, const int __b)
-{
-  return (poly16x4_t)__builtin_neon_vdup_lanev4hi ((int16x4_t) __a, __b);
-}
+#define vdup_lane_p16(__a, __b) \
+  (poly16x4_t)__builtin_neon_vdup_lanev4hi (__a, __b);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vdup_lane_s64 (int64x1_t __a, const int __b)
-{
-  return (int64x1_t)__builtin_neon_vdup_lanedi (__a, __b);
-}
+#define vdup_lane_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vdup_lanedi (__a, __b);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vdup_lane_u64 (uint64x1_t __a, const int __b)
-{
-  return (uint64x1_t)__builtin_neon_vdup_lanedi ((int64x1_t) __a, __b);
-}
+#define vdup_lane_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vdup_lanedi (__a, __b);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vdupq_lane_s8 (int8x8_t __a, const int __b)
-{
-  return (int8x16_t)__builtin_neon_vdup_lanev16qi (__a, __b);
-}
+#define vdupq_lane_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vdup_lanev16qi (__a, __b);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vdupq_lane_s16 (int16x4_t __a, const int __b)
-{
-  return (int16x8_t)__builtin_neon_vdup_lanev8hi (__a, __b);
-}
+#define vdupq_lane_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vdup_lanev8hi (__a, __b);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vdupq_lane_s32 (int32x2_t __a, const int __b)
-{
-  return (int32x4_t)__builtin_neon_vdup_lanev4si (__a, __b);
-}
+#define vdupq_lane_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vdup_lanev4si (__a, __b);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vdupq_lane_f32 (float32x2_t __a, const int __b)
-{
-  return (float32x4_t)__builtin_neon_vdup_lanev4sf (__a, __b);
-}
+#define vdupq_lane_f32(__a, __b) \
+  (float32x4_t)__builtin_neon_vdup_lanev4sf (__a, __b);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vdupq_lane_u8 (uint8x8_t __a, const int __b)
-{
-  return (uint8x16_t)__builtin_neon_vdup_lanev16qi ((int8x8_t) __a, __b);
-}
+#define vdupq_lane_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vdup_lanev16qi (__a, __b);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vdupq_lane_u16 (uint16x4_t __a, const int __b)
-{
-  return (uint16x8_t)__builtin_neon_vdup_lanev8hi ((int16x4_t) __a, __b);
-}
+#define vdupq_lane_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vdup_lanev8hi (__a, __b);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vdupq_lane_u32 (uint32x2_t __a, const int __b)
-{
-  return (uint32x4_t)__builtin_neon_vdup_lanev4si ((int32x2_t) __a, __b);
-}
+#define vdupq_lane_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vdup_lanev4si (__a, __b);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vdupq_lane_p8 (poly8x8_t __a, const int __b)
-{
-  return (poly8x16_t)__builtin_neon_vdup_lanev16qi ((int8x8_t) __a, __b);
-}
+#define vdupq_lane_p8(__a, __b) \
+  (poly8x16_t)__builtin_neon_vdup_lanev16qi (__a, __b);
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vdupq_lane_p16 (poly16x4_t __a, const int __b)
-{
-  return (poly16x8_t)__builtin_neon_vdup_lanev8hi ((int16x4_t) __a, __b);
-}
+#define vdupq_lane_p16(__a, __b) \
+  (poly16x8_t)__builtin_neon_vdup_lanev8hi (__a, __b);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vdupq_lane_s64 (int64x1_t __a, const int __b)
-{
-  return (int64x2_t)__builtin_neon_vdup_lanev2di (__a, __b);
-}
+#define vdupq_lane_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vdup_lanev2di (__a, __b);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vdupq_lane_u64 (uint64x1_t __a, const int __b)
-{
-  return (uint64x2_t)__builtin_neon_vdup_lanev2di ((int64x1_t) __a, __b);
-}
+#define vdupq_lane_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vdup_lanev2di (__a, __b);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vcombine_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x16_t)__builtin_neon_vcombinev8qi (__a, __b);
-}
+#define vcombine_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vcombinev8qi (__a, __b);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vcombine_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x8_t)__builtin_neon_vcombinev4hi (__a, __b);
-}
+#define vcombine_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vcombinev4hi (__a, __b);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vcombine_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x4_t)__builtin_neon_vcombinev2si (__a, __b);
-}
+#define vcombine_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vcombinev2si (__a, __b);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vcombine_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x2_t)__builtin_neon_vcombinedi (__a, __b);
-}
+#define vcombine_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vcombinedi (__a, __b);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcombine_f32 (float32x2_t __a, float32x2_t __b)
-{
-  return (float32x4_t)__builtin_neon_vcombinev2sf (__a, __b);
-}
+#define vcombine_f32(__a, __b) \
+  (float32x4_t)__builtin_neon_vcombinev2sf (__a, __b);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vcombine_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vcombinev8qi ((int8x8_t) __a, (int8x8_t) __b);
-}
+#define vcombine_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vcombinev8qi (__a, __b);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vcombine_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vcombinev4hi ((int16x4_t) __a, (int16x4_t) __b);
-}
+#define vcombine_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vcombinev4hi (__a, __b);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcombine_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vcombinev2si ((int32x2_t) __a, (int32x2_t) __b);
-}
+#define vcombine_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcombinev2si (__a, __b);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vcombine_u64 (uint64x1_t __a, uint64x1_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vcombinedi ((int64x1_t) __a, (int64x1_t) __b);
-}
+#define vcombine_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vcombinedi (__a, __b);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vcombine_p8 (poly8x8_t __a, poly8x8_t __b)
-{
-  return (poly8x16_t)__builtin_neon_vcombinev8qi ((int8x8_t) __a, (int8x8_t) __b);
-}
+#define vcombine_p8(__a, __b) \
+  (poly8x16_t)__builtin_neon_vcombinev8qi (__a, __b);
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vcombine_p16 (poly16x4_t __a, poly16x4_t __b)
-{
-  return (poly16x8_t)__builtin_neon_vcombinev4hi ((int16x4_t) __a, (int16x4_t) __b);
-}
+#define vcombine_p16(__a, __b) \
+  (poly16x8_t)__builtin_neon_vcombinev4hi (__a, __b);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vget_high_s8 (int8x16_t __a)
-{
-  return (int8x8_t)__builtin_neon_vget_highv16qi (__a);
-}
+#define vget_high_s8(__a) \
+  (int8x8_t)__builtin_neon_vget_highv16qi (__a);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vget_high_s16 (int16x8_t __a)
-{
-  return (int16x4_t)__builtin_neon_vget_highv8hi (__a);
-}
+#define vget_high_s16(__a) \
+  (int16x4_t)__builtin_neon_vget_highv8hi (__a);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vget_high_s32 (int32x4_t __a)
-{
-  return (int32x2_t)__builtin_neon_vget_highv4si (__a);
-}
+#define vget_high_s32(__a) \
+  (int32x2_t)__builtin_neon_vget_highv4si (__a);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vget_high_s64 (int64x2_t __a)
-{
-  return (int64x1_t)__builtin_neon_vget_highv2di (__a);
-}
+#define vget_high_s64(__a) \
+  (int64x1_t)__builtin_neon_vget_highv2di (__a);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vget_high_f32 (float32x4_t __a)
-{
-  return (float32x2_t)__builtin_neon_vget_highv4sf (__a);
-}
+#define vget_high_f32(__a) \
+  (float32x2_t)__builtin_neon_vget_highv4sf (__a);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vget_high_u8 (uint8x16_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vget_highv16qi ((int8x16_t) __a);
-}
+#define vget_high_u8(__a) \
+  (uint8x8_t)__builtin_neon_vget_highv16qi (__a);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vget_high_u16 (uint16x8_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vget_highv8hi ((int16x8_t) __a);
-}
+#define vget_high_u16(__a) \
+  (uint16x4_t)__builtin_neon_vget_highv8hi (__a);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vget_high_u32 (uint32x4_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vget_highv4si ((int32x4_t) __a);
-}
+#define vget_high_u32(__a) \
+  (uint32x2_t)__builtin_neon_vget_highv4si (__a);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vget_high_u64 (uint64x2_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vget_highv2di ((int64x2_t) __a);
-}
+#define vget_high_u64(__a) \
+  (uint64x1_t)__builtin_neon_vget_highv2di (__a);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vget_high_p8 (poly8x16_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vget_highv16qi ((int8x16_t) __a);
-}
+#define vget_high_p8(__a) \
+  (poly8x8_t)__builtin_neon_vget_highv16qi (__a);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vget_high_p16 (poly16x8_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vget_highv8hi ((int16x8_t) __a);
-}
+#define vget_high_p16(__a) \
+  (poly16x4_t)__builtin_neon_vget_highv8hi (__a);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vget_low_s8 (int8x16_t __a)
-{
-  return (int8x8_t)__builtin_neon_vget_lowv16qi (__a);
-}
+#define vget_low_s8(__a) \
+  (int8x8_t)__builtin_neon_vget_lowv16qi (__a);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vget_low_s16 (int16x8_t __a)
-{
-  return (int16x4_t)__builtin_neon_vget_lowv8hi (__a);
-}
+#define vget_low_s16(__a) \
+  (int16x4_t)__builtin_neon_vget_lowv8hi (__a);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vget_low_s32 (int32x4_t __a)
-{
-  return (int32x2_t)__builtin_neon_vget_lowv4si (__a);
-}
+#define vget_low_s32(__a) \
+  (int32x2_t)__builtin_neon_vget_lowv4si (__a);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vget_low_s64 (int64x2_t __a)
-{
-  return (int64x1_t)__builtin_neon_vget_lowv2di (__a);
-}
+#define vget_low_s64(__a) \
+  (int64x1_t)__builtin_neon_vget_lowv2di (__a);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vget_low_f32 (float32x4_t __a)
-{
-  return (float32x2_t)__builtin_neon_vget_lowv4sf (__a);
-}
+#define vget_low_f32(__a) \
+  (float32x2_t)__builtin_neon_vget_lowv4sf (__a);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vget_low_u8 (uint8x16_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vget_lowv16qi ((int8x16_t) __a);
-}
+#define vget_low_u8(__a) \
+  (uint8x8_t)__builtin_neon_vget_lowv16qi (__a);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vget_low_u16 (uint16x8_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vget_lowv8hi ((int16x8_t) __a);
-}
+#define vget_low_u16(__a) \
+  (uint16x4_t)__builtin_neon_vget_lowv8hi (__a);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vget_low_u32 (uint32x4_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vget_lowv4si ((int32x4_t) __a);
-}
+#define vget_low_u32(__a) \
+  (uint32x2_t)__builtin_neon_vget_lowv4si (__a);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vget_low_u64 (uint64x2_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vget_lowv2di ((int64x2_t) __a);
-}
+#define vget_low_u64(__a) \
+  (uint64x1_t)__builtin_neon_vget_lowv2di (__a);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vget_low_p8 (poly8x16_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vget_lowv16qi ((int8x16_t) __a);
-}
+#define vget_low_p8(__a) \
+  (poly8x8_t)__builtin_neon_vget_lowv16qi (__a);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vget_low_p16 (poly16x8_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vget_lowv8hi ((int16x8_t) __a);
-}
+#define vget_low_p16(__a) \
+  (poly16x4_t)__builtin_neon_vget_lowv8hi (__a);
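
vget_low/vget_high split a q-register vector into its d-register halves, and the vcombine forms above rejoin them. A sketch:

    #include <arm_neon.h>

    /* Swap the low and high halves of a 4 x uint32 vector.  */
    uint32x4_t
    swap_halves (uint32x4_t v)
    {
      uint32x2_t lo, hi;
      lo = vget_low_u32 (v);
      hi = vget_high_u32 (v);
      return vcombine_u32 (hi, lo);
    }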
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcvt_s32_f32 (float32x2_t __a)
-{
-  return (int32x2_t)__builtin_neon_vcvtv2sf (__a, 1);
-}
+#define vcvt_s32_f32(__a) \
+  (int32x2_t)__builtin_neon_vcvtv2sf (__a, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvt_f32_s32 (int32x2_t __a)
-{
-  return (float32x2_t)__builtin_neon_vcvtv2si (__a, 1);
-}
+#define vcvt_f32_s32(__a) \
+  (float32x2_t)__builtin_neon_vcvtv2si (__a, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvt_f32_u32 (uint32x2_t __a)
-{
-  return (float32x2_t)__builtin_neon_vcvtv2si ((int32x2_t) __a, 0);
-}
+#define vcvt_f32_u32(__a) \
+  (float32x2_t)__builtin_neon_vcvtv2si (__a, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcvt_u32_f32 (float32x2_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vcvtv2sf (__a, 0);
-}
+#define vcvt_u32_f32(__a) \
+  (uint32x2_t)__builtin_neon_vcvtv2sf (__a, 0);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vcvtq_s32_f32 (float32x4_t __a)
-{
-  return (int32x4_t)__builtin_neon_vcvtv4sf (__a, 1);
-}
+#define vcvtq_s32_f32(__a) \
+  (int32x4_t)__builtin_neon_vcvtv4sf (__a, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvtq_f32_s32 (int32x4_t __a)
-{
-  return (float32x4_t)__builtin_neon_vcvtv4si (__a, 1);
-}
+#define vcvtq_f32_s32(__a) \
+  (float32x4_t)__builtin_neon_vcvtv4si (__a, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvtq_f32_u32 (uint32x4_t __a)
-{
-  return (float32x4_t)__builtin_neon_vcvtv4si ((int32x4_t) __a, 0);
-}
+#define vcvtq_f32_u32(__a) \
+  (float32x4_t)__builtin_neon_vcvtv4si (__a, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcvtq_u32_f32 (float32x4_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vcvtv4sf (__a, 0);
-}
+#define vcvtq_u32_f32(__a) \
+  (uint32x4_t)__builtin_neon_vcvtv4sf (__a, 0);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vcvt_n_s32_f32 (float32x2_t __a, const int __b)
-{
-  return (int32x2_t)__builtin_neon_vcvt_nv2sf (__a, __b, 1);
-}
+#define vcvt_n_s32_f32(__a, __b) \
+  (int32x2_t)__builtin_neon_vcvt_nv2sf (__a, __b, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvt_n_f32_s32 (int32x2_t __a, const int __b)
-{
-  return (float32x2_t)__builtin_neon_vcvt_nv2si (__a, __b, 1);
-}
+#define vcvt_n_f32_s32(__a, __b) \
+  (float32x2_t)__builtin_neon_vcvt_nv2si (__a, __b, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vcvt_n_f32_u32 (uint32x2_t __a, const int __b)
-{
-  return (float32x2_t)__builtin_neon_vcvt_nv2si ((int32x2_t) __a, __b, 0);
-}
+#define vcvt_n_f32_u32(__a, __b) \
+  (float32x2_t)__builtin_neon_vcvt_nv2si (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vcvt_n_u32_f32 (float32x2_t __a, const int __b)
-{
-  return (uint32x2_t)__builtin_neon_vcvt_nv2sf (__a, __b, 0);
-}
+#define vcvt_n_u32_f32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vcvt_nv2sf (__a, __b, 0);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vcvtq_n_s32_f32 (float32x4_t __a, const int __b)
-{
-  return (int32x4_t)__builtin_neon_vcvt_nv4sf (__a, __b, 1);
-}
+#define vcvtq_n_s32_f32(__a, __b) \
+  (int32x4_t)__builtin_neon_vcvt_nv4sf (__a, __b, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvtq_n_f32_s32 (int32x4_t __a, const int __b)
-{
-  return (float32x4_t)__builtin_neon_vcvt_nv4si (__a, __b, 1);
-}
+#define vcvtq_n_f32_s32(__a, __b) \
+  (float32x4_t)__builtin_neon_vcvt_nv4si (__a, __b, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vcvtq_n_f32_u32 (uint32x4_t __a, const int __b)
-{
-  return (float32x4_t)__builtin_neon_vcvt_nv4si ((int32x4_t) __a, __b, 0);
-}
+#define vcvtq_n_f32_u32(__a, __b) \
+  (float32x4_t)__builtin_neon_vcvt_nv4si (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vcvtq_n_u32_f32 (float32x4_t __a, const int __b)
-{
-  return (uint32x4_t)__builtin_neon_vcvt_nv4sf (__a, __b, 0);
-}
+#define vcvtq_n_u32_f32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vcvt_nv4sf (__a, __b, 0);
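
The vcvt_n forms convert between floating point and fixed point; the second operand is the number of fraction bits and, like a lane index, must be a constant, which the macro form again passes through unevaluated. A sketch using Q16.16:

    #include <arm_neon.h>

    /* Round-trip through signed Q16.16 fixed point; the fraction
       bit count must be a constant in the range 1..32.  */
    float32x4_t
    fixed_round_trip (float32x4_t v)
    {
      int32x4_t fx;
      fx = vcvtq_n_s32_f32 (v, 16);
      return vcvtq_n_f32_s32 (fx, 16);
    }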
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vmovn_s16 (int16x8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vmovnv8hi (__a, 1);
-}
+#define vmovn_s16(__a) \
+  (int8x8_t)__builtin_neon_vmovnv8hi (__a, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmovn_s32 (int32x4_t __a)
-{
-  return (int16x4_t)__builtin_neon_vmovnv4si (__a, 1);
-}
+#define vmovn_s32(__a) \
+  (int16x4_t)__builtin_neon_vmovnv4si (__a, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmovn_s64 (int64x2_t __a)
-{
-  return (int32x2_t)__builtin_neon_vmovnv2di (__a, 1);
-}
+#define vmovn_s64(__a) \
+  (int32x2_t)__builtin_neon_vmovnv2di (__a, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vmovn_u16 (uint16x8_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vmovnv8hi ((int16x8_t) __a, 0);
-}
+#define vmovn_u16(__a) \
+  (uint8x8_t)__builtin_neon_vmovnv8hi (__a, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmovn_u32 (uint32x4_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vmovnv4si ((int32x4_t) __a, 0);
-}
+#define vmovn_u32(__a) \
+  (uint16x4_t)__builtin_neon_vmovnv4si (__a, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmovn_u64 (uint64x2_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vmovnv2di ((int64x2_t) __a, 0);
-}
+#define vmovn_u64(__a) \
+  (uint32x2_t)__builtin_neon_vmovnv2di (__a, 0);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vqmovn_s16 (int16x8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vqmovnv8hi (__a, 1);
-}
+#define vqmovn_s16(__a) \
+  (int8x8_t)__builtin_neon_vqmovnv8hi (__a, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqmovn_s32 (int32x4_t __a)
-{
-  return (int16x4_t)__builtin_neon_vqmovnv4si (__a, 1);
-}
+#define vqmovn_s32(__a) \
+  (int16x4_t)__builtin_neon_vqmovnv4si (__a, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqmovn_s64 (int64x2_t __a)
-{
-  return (int32x2_t)__builtin_neon_vqmovnv2di (__a, 1);
-}
+#define vqmovn_s64(__a) \
+  (int32x2_t)__builtin_neon_vqmovnv2di (__a, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqmovn_u16 (uint16x8_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vqmovnv8hi ((int16x8_t) __a, 0);
-}
+#define vqmovn_u16(__a) \
+  (uint8x8_t)__builtin_neon_vqmovnv8hi (__a, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vqmovn_u32 (uint32x4_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vqmovnv4si ((int32x4_t) __a, 0);
-}
+#define vqmovn_u32(__a) \
+  (uint16x4_t)__builtin_neon_vqmovnv4si (__a, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vqmovn_u64 (uint64x2_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vqmovnv2di ((int64x2_t) __a, 0);
-}
+#define vqmovn_u64(__a) \
+  (uint32x2_t)__builtin_neon_vqmovnv2di (__a, 0);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vqmovun_s16 (int16x8_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vqmovunv8hi (__a, 1);
-}
+#define vqmovun_s16(__a) \
+  (uint8x8_t)__builtin_neon_vqmovunv8hi (__a, 1);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vqmovun_s32 (int32x4_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vqmovunv4si (__a, 1);
-}
+#define vqmovun_s32(__a) \
+  (uint16x4_t)__builtin_neon_vqmovunv4si (__a, 1);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vqmovun_s64 (int64x2_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vqmovunv2di (__a, 1);
-}
+#define vqmovun_s64(__a) \
+  (uint32x2_t)__builtin_neon_vqmovunv2di (__a, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmovl_s8 (int8x8_t __a)
-{
-  return (int16x8_t)__builtin_neon_vmovlv8qi (__a, 1);
-}
+#define vmovl_s8(__a) \
+  (int16x8_t)__builtin_neon_vmovlv8qi (__a, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmovl_s16 (int16x4_t __a)
-{
-  return (int32x4_t)__builtin_neon_vmovlv4hi (__a, 1);
-}
+#define vmovl_s16(__a) \
+  (int32x4_t)__builtin_neon_vmovlv4hi (__a, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmovl_s32 (int32x2_t __a)
-{
-  return (int64x2_t)__builtin_neon_vmovlv2si (__a, 1);
-}
+#define vmovl_s32(__a) \
+  (int64x2_t)__builtin_neon_vmovlv2si (__a, 1);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmovl_u8 (uint8x8_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vmovlv8qi ((int8x8_t) __a, 0);
-}
+#define vmovl_u8(__a) \
+  (uint16x8_t)__builtin_neon_vmovlv8qi (__a, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmovl_u16 (uint16x4_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vmovlv4hi ((int16x4_t) __a, 0);
-}
+#define vmovl_u16(__a) \
+  (uint32x4_t)__builtin_neon_vmovlv4hi (__a, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmovl_u32 (uint32x2_t __a)
-{
-  return (uint64x2_t)__builtin_neon_vmovlv2si ((int32x2_t) __a, 0);
-}
+#define vmovl_u32(__a) \
+  (uint64x2_t)__builtin_neon_vmovlv2si (__a, 0);
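
vmovl widens each lane and vqmovn narrows with saturation; together they give the usual widen-operate-narrow pattern. A sketch, assuming vaddq_u16 is defined elsewhere in this header (the result is equivalent to a single vqadd_u8, but it shows the pattern):

    #include <arm_neon.h>

    /* Saturating byte add via widen, add, narrow-with-saturation.  */
    uint8x8_t
    add_sat_u8 (uint8x8_t a, uint8x8_t b)
    {
      uint16x8_t wa, wb, sum;
      wa = vmovl_u8 (a);
      wb = vmovl_u8 (b);
      sum = vaddq_u16 (wa, wb);
      return vqmovn_u16 (sum);
    }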
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl1_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vtbl1v8qi (__a, __b);
-}
+#define vtbl1_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vtbl1v8qi (__a, __b);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbl1_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vtbl1v8qi ((int8x8_t) __a, (int8x8_t) __b);
-}
+#define vtbl1_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vtbl1v8qi (__a, __b);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbl1_p8 (poly8x8_t __a, uint8x8_t __b)
-{
-  return (poly8x8_t)__builtin_neon_vtbl1v8qi ((int8x8_t) __a, (int8x8_t) __b);
-}
+#define vtbl1_p8(__a, __b) \
+  (poly8x8_t)__builtin_neon_vtbl1v8qi (__a, __b);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl2_s8 (int8x8x2_t __a, int8x8_t __b)
-{
-  union { int8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a };
-  return (int8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, __b);
-}
+#define vtbl2_s8(__a, __b) \
+  ({ \
+     union { int8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a }; \
+     (int8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, __b); \
+   })
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbl2_u8 (uint8x8x2_t __a, uint8x8_t __b)
-{
-  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a };
-  return (uint8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, (int8x8_t) __b);
-}
+#define vtbl2_u8(__a, __b) \
+  ({ \
+     union { uint8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a }; \
+     (uint8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, __b); \
+   })
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbl2_p8 (poly8x8x2_t __a, uint8x8_t __b)
-{
-  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a };
-  return (poly8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, (int8x8_t) __b);
-}
+#define vtbl2_p8(__a, __b) \
+  ({ \
+     union { poly8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a }; \
+     (poly8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, __b); \
+   })
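
For the table-lookup intrinsics that take multi-register structure types
(int8x8x2_t and friends), a single cast expression is not enough, so the
macros switch to GNU statement expressions: the value of ({ ... }) is its
last expression, and the union type-puns the structure into the wide
scalar type (__builtin_neon_ti, presumably TImode) that the builtin
expects.  A minimal standalone sketch of the same idiom, with
hypothetical names, would be:

    /* GNU C statement expression; the union reinterprets the
       two-register struct as the wide type the builtin takes,
       without the aliasing problems of a pointer cast.  */
    #define WIDEN2(__a) \
      ({ \
         union { int8x8x2_t __i; __builtin_neon_ti __o; } __u = { __a }; \
         __u.__o; \
       })

Unlike the old inline functions, the macro's union initializer performs
no implicit conversion, so the argument must already have the exact
structure type.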
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl3_s8 (int8x8x3_t __a, int8x8_t __b)
-{
-  union { int8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a };
-  return (int8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, __b);
-}
+#define vtbl3_s8(__a, __b) \
+  ({ \
+     union { int8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a }; \
+     (int8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, __b); \
+   })
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbl3_u8 (uint8x8x3_t __a, uint8x8_t __b)
-{
-  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a };
-  return (uint8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, (int8x8_t) __b);
-}
+#define vtbl3_u8(__a, __b) \
+  ({ \
+     union { uint8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a }; \
+     (uint8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, __b); \
+   })
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbl3_p8 (poly8x8x3_t __a, uint8x8_t __b)
-{
-  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a };
-  return (poly8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, (int8x8_t) __b);
-}
+#define vtbl3_p8(__a, __b) \
+  ({ \
+     union { poly8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a }; \
+     (poly8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, __b); \
+   })
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbl4_s8 (int8x8x4_t __a, int8x8_t __b)
-{
-  union { int8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a };
-  return (int8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, __b);
-}
+#define vtbl4_s8(__a, __b) \
+  ({ \
+     union { int8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a }; \
+     (int8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, __b); \
+   })
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbl4_u8 (uint8x8x4_t __a, uint8x8_t __b)
-{
-  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a };
-  return (uint8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, (int8x8_t) __b);
-}
+#define vtbl4_u8(__a, __b) \
+  ({ \
+     union { uint8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a }; \
+     (uint8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, __b); \
+   })
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbl4_p8 (poly8x8x4_t __a, uint8x8_t __b)
-{
-  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a };
-  return (poly8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, (int8x8_t) __b);
-}
+#define vtbl4_p8(__a, __b) \
+  ({ \
+     union { poly8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a }; \
+     (poly8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, __b); \
+   })
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx1_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
-{
-  return (int8x8_t)__builtin_neon_vtbx1v8qi (__a, __b, __c);
-}
+#define vtbx1_s8(__a, __b, __c) \
+  (int8x8_t)__builtin_neon_vtbx1v8qi (__a, __b, __c);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx1_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
-{
-  return (uint8x8_t)__builtin_neon_vtbx1v8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
-}
+#define vtbx1_u8(__a, __b, __c) \
+  (uint8x8_t)__builtin_neon_vtbx1v8qi (__a, __b, __c);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx1_p8 (poly8x8_t __a, poly8x8_t __b, uint8x8_t __c)
-{
-  return (poly8x8_t)__builtin_neon_vtbx1v8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
-}
+#define vtbx1_p8(__a, __b, __c) \
+  (poly8x8_t)__builtin_neon_vtbx1v8qi (__a, __b, __c);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx2_s8 (int8x8_t __a, int8x8x2_t __b, int8x8_t __c)
-{
-  union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  return (int8x8_t)__builtin_neon_vtbx2v8qi (__a, __bu.__o, __c);
-}
+#define vtbx2_s8(__a, __b, __c) \
+  ({ \
+     union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     (int8x8_t)__builtin_neon_vtbx2v8qi (__a, __bu.__o, __c); \
+   })
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx2_u8 (uint8x8_t __a, uint8x8x2_t __b, uint8x8_t __c)
-{
-  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  return (uint8x8_t)__builtin_neon_vtbx2v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
-}
+#define vtbx2_u8(__a, __b, __c) \
+  ({ \
+     union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     (uint8x8_t)__builtin_neon_vtbx2v8qi (__a, __bu.__o, __c); \
+   })
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx2_p8 (poly8x8_t __a, poly8x8x2_t __b, uint8x8_t __c)
-{
-  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  return (poly8x8_t)__builtin_neon_vtbx2v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
-}
+#define vtbx2_p8(__a, __b, __c) \
+  ({ \
+     union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     (poly8x8_t)__builtin_neon_vtbx2v8qi (__a, __bu.__o, __c); \
+   })
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx3_s8 (int8x8_t __a, int8x8x3_t __b, int8x8_t __c)
-{
-  union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  return (int8x8_t)__builtin_neon_vtbx3v8qi (__a, __bu.__o, __c);
-}
+#define vtbx3_s8(__a, __b, __c) \
+  ({ \
+     union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     (int8x8_t)__builtin_neon_vtbx3v8qi (__a, __bu.__o, __c); \
+   })
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx3_u8 (uint8x8_t __a, uint8x8x3_t __b, uint8x8_t __c)
-{
-  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  return (uint8x8_t)__builtin_neon_vtbx3v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
-}
+#define vtbx3_u8(__a, __b, __c) \
+  ({ \
+     union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     (uint8x8_t)__builtin_neon_vtbx3v8qi (__a, __bu.__o, __c); \
+   })
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx3_p8 (poly8x8_t __a, poly8x8x3_t __b, uint8x8_t __c)
-{
-  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  return (poly8x8_t)__builtin_neon_vtbx3v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
-}
+#define vtbx3_p8(__a, __b, __c) \
+  ({ \
+     union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     (poly8x8_t)__builtin_neon_vtbx3v8qi (__a, __bu.__o, __c); \
+   })
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vtbx4_s8 (int8x8_t __a, int8x8x4_t __b, int8x8_t __c)
-{
-  union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  return (int8x8_t)__builtin_neon_vtbx4v8qi (__a, __bu.__o, __c);
-}
+#define vtbx4_s8(__a, __b, __c) \
+  ({ \
+     union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     (int8x8_t)__builtin_neon_vtbx4v8qi (__a, __bu.__o, __c); \
+   })
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vtbx4_u8 (uint8x8_t __a, uint8x8x4_t __b, uint8x8_t __c)
-{
-  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  return (uint8x8_t)__builtin_neon_vtbx4v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
-}
+#define vtbx4_u8(__a, __b, __c) \
+  ({ \
+     union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     (uint8x8_t)__builtin_neon_vtbx4v8qi (__a, __bu.__o, __c); \
+   })
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vtbx4_p8 (poly8x8_t __a, poly8x8x4_t __b, uint8x8_t __c)
-{
-  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  return (poly8x8_t)__builtin_neon_vtbx4v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
-}
+#define vtbx4_p8(__a, __b, __c) \
+  ({ \
+     union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     (poly8x8_t)__builtin_neon_vtbx4v8qi (__a, __bu.__o, __c); \
+   })
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-{
-  return (int16x4_t)__builtin_neon_vmul_lanev4hi (__a, __b, __c, 1);
-}
+#define vmul_lane_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vmul_lanev4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-{
-  return (int32x2_t)__builtin_neon_vmul_lanev2si (__a, __b, __c, 1);
-}
+#define vmul_lane_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vmul_lanev2si (__a, __b, __c, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __c)
-{
-  return (float32x2_t)__builtin_neon_vmul_lanev2sf (__a, __b, __c, 5);
-}
+#define vmul_lane_f32(__a, __b, __c) \
+  (float32x2_t)__builtin_neon_vmul_lanev2sf (__a, __b, __c, 5);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
-{
-  return (uint16x4_t)__builtin_neon_vmul_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 0);
-}
+#define vmul_lane_u16(__a, __b, __c) \
+  (uint16x4_t)__builtin_neon_vmul_lanev4hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
-{
-  return (uint32x2_t)__builtin_neon_vmul_lanev2si ((int32x2_t) __a, (int32x2_t) __b, __c, 0);
-}
+#define vmul_lane_u32(__a, __b, __c) \
+  (uint32x2_t)__builtin_neon_vmul_lanev2si (__a, __b, __c, 0);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
-{
-  return (int16x8_t)__builtin_neon_vmul_lanev8hi (__a, __b, __c, 1);
-}
+#define vmulq_lane_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vmul_lanev8hi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
-{
-  return (int32x4_t)__builtin_neon_vmul_lanev4si (__a, __b, __c, 1);
-}
+#define vmulq_lane_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vmul_lanev4si (__a, __b, __c, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __c)
-{
-  return (float32x4_t)__builtin_neon_vmul_lanev4sf (__a, __b, __c, 5);
-}
+#define vmulq_lane_f32(__a, __b, __c) \
+  (float32x4_t)__builtin_neon_vmul_lanev4sf (__a, __b, __c, 5);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c)
-{
-  return (uint16x8_t)__builtin_neon_vmul_lanev8hi ((int16x8_t) __a, (int16x4_t) __b, __c, 0);
-}
+#define vmulq_lane_u16(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vmul_lanev8hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c)
-{
-  return (uint32x4_t)__builtin_neon_vmul_lanev4si ((int32x4_t) __a, (int32x2_t) __b, __c, 0);
-}
+#define vmulq_lane_u32(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vmul_lanev4si (__a, __b, __c, 0);
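
The macro form matters most for the _lane intrinsics: the lane number
has to become an immediate operand of the instruction, and the macro
passes __c through verbatim, so vmul_lane_s16 (__a, __b, 2) becomes

    (int16x4_t)__builtin_neon_vmul_lanev4hi (__a, __b, 2, 1);

with the literal 2 still visible to the front end.  With the old
always_inline definitions that property depended on the const int
parameter being folded during inlining, which presumably could not be
relied on this early in the LLVM pipeline.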
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmla_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
-{
-  return (int16x4_t)__builtin_neon_vmla_lanev4hi (__a, __b, __c, __d, 1);
-}
+#define vmla_lane_s16(__a, __b, __c, __d) \
+  (int16x4_t)__builtin_neon_vmla_lanev4hi (__a, __b, __c, __d, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmla_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
-{
-  return (int32x2_t)__builtin_neon_vmla_lanev2si (__a, __b, __c, __d, 1);
-}
+#define vmla_lane_s32(__a, __b, __c, __d) \
+  (int32x2_t)__builtin_neon_vmla_lanev2si (__a, __b, __c, __d, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmla_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __d)
-{
-  return (float32x2_t)__builtin_neon_vmla_lanev2sf (__a, __b, __c, __d, 5);
-}
+#define vmla_lane_f32(__a, __b, __c, __d) \
+  (float32x2_t)__builtin_neon_vmla_lanev2sf (__a, __b, __c, __d, 5);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
-{
-  return (uint16x4_t)__builtin_neon_vmla_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0);
-}
+#define vmla_lane_u16(__a, __b, __c, __d) \
+  (uint16x4_t)__builtin_neon_vmla_lanev4hi (__a, __b, __c, __d, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
-{
-  return (uint32x2_t)__builtin_neon_vmla_lanev2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0);
-}
+#define vmla_lane_u32(__a, __b, __c, __d) \
+  (uint32x2_t)__builtin_neon_vmla_lanev2si (__a, __b, __c, __d, 0);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
-{
-  return (int16x8_t)__builtin_neon_vmla_lanev8hi (__a, __b, __c, __d, 1);
-}
+#define vmlaq_lane_s16(__a, __b, __c, __d) \
+  (int16x8_t)__builtin_neon_vmla_lanev8hi (__a, __b, __c, __d, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
-{
-  return (int32x4_t)__builtin_neon_vmla_lanev4si (__a, __b, __c, __d, 1);
-}
+#define vmlaq_lane_s32(__a, __b, __c, __d) \
+  (int32x4_t)__builtin_neon_vmla_lanev4si (__a, __b, __c, __d, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __d)
-{
-  return (float32x4_t)__builtin_neon_vmla_lanev4sf (__a, __b, __c, __d, 5);
-}
+#define vmlaq_lane_f32(__a, __b, __c, __d) \
+  (float32x4_t)__builtin_neon_vmla_lanev4sf (__a, __b, __c, __d, 5);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d)
-{
-  return (uint16x8_t)__builtin_neon_vmla_lanev8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x4_t) __c, __d, 0);
-}
+#define vmlaq_lane_u16(__a, __b, __c, __d) \
+  (uint16x8_t)__builtin_neon_vmla_lanev8hi (__a, __b, __c, __d, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d)
-{
-  return (uint32x4_t)__builtin_neon_vmla_lanev4si ((int32x4_t) __a, (int32x4_t) __b, (int32x2_t) __c, __d, 0);
-}
+#define vmlaq_lane_u32(__a, __b, __c, __d) \
+  (uint32x4_t)__builtin_neon_vmla_lanev4si (__a, __b, __c, __d, 0);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
-{
-  return (int32x4_t)__builtin_neon_vmlal_lanev4hi (__a, __b, __c, __d, 1);
-}
+#define vmlal_lane_s16(__a, __b, __c, __d) \
+  (int32x4_t)__builtin_neon_vmlal_lanev4hi (__a, __b, __c, __d, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
-{
-  return (int64x2_t)__builtin_neon_vmlal_lanev2si (__a, __b, __c, __d, 1);
-}
+#define vmlal_lane_s32(__a, __b, __c, __d) \
+  (int64x2_t)__builtin_neon_vmlal_lanev2si (__a, __b, __c, __d, 1);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlal_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
-{
-  return (uint32x4_t)__builtin_neon_vmlal_lanev4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0);
-}
+#define vmlal_lane_u16(__a, __b, __c, __d) \
+  (uint32x4_t)__builtin_neon_vmlal_lanev4hi (__a, __b, __c, __d, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmlal_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
-{
-  return (uint64x2_t)__builtin_neon_vmlal_lanev2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0);
-}
+#define vmlal_lane_u32(__a, __b, __c, __d) \
+  (uint64x2_t)__builtin_neon_vmlal_lanev2si (__a, __b, __c, __d, 0);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
-{
-  return (int32x4_t)__builtin_neon_vqdmlal_lanev4hi (__a, __b, __c, __d, 1);
-}
+#define vqdmlal_lane_s16(__a, __b, __c, __d) \
+  (int32x4_t)__builtin_neon_vqdmlal_lanev4hi (__a, __b, __c, __d, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
-{
-  return (int64x2_t)__builtin_neon_vqdmlal_lanev2si (__a, __b, __c, __d, 1);
-}
+#define vqdmlal_lane_s32(__a, __b, __c, __d) \
+  (int64x2_t)__builtin_neon_vqdmlal_lanev2si (__a, __b, __c, __d, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmls_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
-{
-  return (int16x4_t)__builtin_neon_vmls_lanev4hi (__a, __b, __c, __d, 1);
-}
+#define vmls_lane_s16(__a, __b, __c, __d) \
+  (int16x4_t)__builtin_neon_vmls_lanev4hi (__a, __b, __c, __d, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmls_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
-{
-  return (int32x2_t)__builtin_neon_vmls_lanev2si (__a, __b, __c, __d, 1);
-}
+#define vmls_lane_s32(__a, __b, __c, __d) \
+  (int32x2_t)__builtin_neon_vmls_lanev2si (__a, __b, __c, __d, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmls_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __d)
-{
-  return (float32x2_t)__builtin_neon_vmls_lanev2sf (__a, __b, __c, __d, 5);
-}
+#define vmls_lane_f32(__a, __b, __c, __d) \
+  (float32x2_t)__builtin_neon_vmls_lanev2sf (__a, __b, __c, __d, 5);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
-{
-  return (uint16x4_t)__builtin_neon_vmls_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0);
-}
+#define vmls_lane_u16(__a, __b, __c, __d) \
+  (uint16x4_t)__builtin_neon_vmls_lanev4hi (__a, __b, __c, __d, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
-{
-  return (uint32x2_t)__builtin_neon_vmls_lanev2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0);
-}
+#define vmls_lane_u32(__a, __b, __c, __d) \
+  (uint32x2_t)__builtin_neon_vmls_lanev2si (__a, __b, __c, __d, 0);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
-{
-  return (int16x8_t)__builtin_neon_vmls_lanev8hi (__a, __b, __c, __d, 1);
-}
+#define vmlsq_lane_s16(__a, __b, __c, __d) \
+  (int16x8_t)__builtin_neon_vmls_lanev8hi (__a, __b, __c, __d, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
-{
-  return (int32x4_t)__builtin_neon_vmls_lanev4si (__a, __b, __c, __d, 1);
-}
+#define vmlsq_lane_s32(__a, __b, __c, __d) \
+  (int32x4_t)__builtin_neon_vmls_lanev4si (__a, __b, __c, __d, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __d)
-{
-  return (float32x4_t)__builtin_neon_vmls_lanev4sf (__a, __b, __c, __d, 5);
-}
+#define vmlsq_lane_f32(__a, __b, __c, __d) \
+  (float32x4_t)__builtin_neon_vmls_lanev4sf (__a, __b, __c, __d, 5);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d)
-{
-  return (uint16x8_t)__builtin_neon_vmls_lanev8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x4_t) __c, __d, 0);
-}
+#define vmlsq_lane_u16(__a, __b, __c, __d) \
+  (uint16x8_t)__builtin_neon_vmls_lanev8hi (__a, __b, __c, __d, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d)
-{
-  return (uint32x4_t)__builtin_neon_vmls_lanev4si ((int32x4_t) __a, (int32x4_t) __b, (int32x2_t) __c, __d, 0);
-}
+#define vmlsq_lane_u32(__a, __b, __c, __d) \
+  (uint32x4_t)__builtin_neon_vmls_lanev4si (__a, __b, __c, __d, 0);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
-{
-  return (int32x4_t)__builtin_neon_vmlsl_lanev4hi (__a, __b, __c, __d, 1);
-}
+#define vmlsl_lane_s16(__a, __b, __c, __d) \
+  (int32x4_t)__builtin_neon_vmlsl_lanev4hi (__a, __b, __c, __d, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
-{
-  return (int64x2_t)__builtin_neon_vmlsl_lanev2si (__a, __b, __c, __d, 1);
-}
+#define vmlsl_lane_s32(__a, __b, __c, __d) \
+  (int64x2_t)__builtin_neon_vmlsl_lanev2si (__a, __b, __c, __d, 1);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlsl_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
-{
-  return (uint32x4_t)__builtin_neon_vmlsl_lanev4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0);
-}
+#define vmlsl_lane_u16(__a, __b, __c, __d) \
+  (uint32x4_t)__builtin_neon_vmlsl_lanev4hi (__a, __b, __c, __d, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmlsl_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
-{
-  return (uint64x2_t)__builtin_neon_vmlsl_lanev2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0);
-}
+#define vmlsl_lane_u32(__a, __b, __c, __d) \
+  (uint64x2_t)__builtin_neon_vmlsl_lanev2si (__a, __b, __c, __d, 0);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
-{
-  return (int32x4_t)__builtin_neon_vqdmlsl_lanev4hi (__a, __b, __c, __d, 1);
-}
+#define vqdmlsl_lane_s16(__a, __b, __c, __d) \
+  (int32x4_t)__builtin_neon_vqdmlsl_lanev4hi (__a, __b, __c, __d, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
-{
-  return (int64x2_t)__builtin_neon_vqdmlsl_lanev2si (__a, __b, __c, __d, 1);
-}
+#define vqdmlsl_lane_s32(__a, __b, __c, __d) \
+  (int64x2_t)__builtin_neon_vqdmlsl_lanev2si (__a, __b, __c, __d, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-{
-  return (int32x4_t)__builtin_neon_vmull_lanev4hi (__a, __b, __c, 1);
-}
+#define vmull_lane_s16(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vmull_lanev4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-{
-  return (int64x2_t)__builtin_neon_vmull_lanev2si (__a, __b, __c, 1);
-}
+#define vmull_lane_s32(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vmull_lanev2si (__a, __b, __c, 1);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
-{
-  return (uint32x4_t)__builtin_neon_vmull_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 0);
-}
+#define vmull_lane_u16(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vmull_lanev4hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
-{
-  return (uint64x2_t)__builtin_neon_vmull_lanev2si ((int32x2_t) __a, (int32x2_t) __b, __c, 0);
-}
+#define vmull_lane_u32(__a, __b, __c) \
+  (uint64x2_t)__builtin_neon_vmull_lanev2si (__a, __b, __c, 0);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-{
-  return (int32x4_t)__builtin_neon_vqdmull_lanev4hi (__a, __b, __c, 1);
-}
+#define vqdmull_lane_s16(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vqdmull_lanev4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-{
-  return (int64x2_t)__builtin_neon_vqdmull_lanev2si (__a, __b, __c, 1);
-}
+#define vqdmull_lane_s32(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vqdmull_lanev2si (__a, __b, __c, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
-{
-  return (int16x8_t)__builtin_neon_vqdmulh_lanev8hi (__a, __b, __c, 1);
-}
+#define vqdmulhq_lane_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vqdmulh_lanev8hi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
-{
-  return (int32x4_t)__builtin_neon_vqdmulh_lanev4si (__a, __b, __c, 1);
-}
+#define vqdmulhq_lane_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vqdmulh_lanev4si (__a, __b, __c, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-{
-  return (int16x4_t)__builtin_neon_vqdmulh_lanev4hi (__a, __b, __c, 1);
-}
+#define vqdmulh_lane_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vqdmulh_lanev4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-{
-  return (int32x2_t)__builtin_neon_vqdmulh_lanev2si (__a, __b, __c, 1);
-}
+#define vqdmulh_lane_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vqdmulh_lanev2si (__a, __b, __c, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
-{
-  return (int16x8_t)__builtin_neon_vqdmulh_lanev8hi (__a, __b, __c, 3);
-}
+#define vqrdmulhq_lane_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vqdmulh_lanev8hi (__a, __b, __c, 3);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
-{
-  return (int32x4_t)__builtin_neon_vqdmulh_lanev4si (__a, __b, __c, 3);
-}
+#define vqrdmulhq_lane_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vqdmulh_lanev4si (__a, __b, __c, 3);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-{
-  return (int16x4_t)__builtin_neon_vqdmulh_lanev4hi (__a, __b, __c, 3);
-}
+#define vqrdmulh_lane_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vqdmulh_lanev4hi (__a, __b, __c, 3);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-{
-  return (int32x2_t)__builtin_neon_vqdmulh_lanev2si (__a, __b, __c, 3);
-}
+#define vqrdmulh_lane_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vqdmulh_lanev2si (__a, __b, __c, 3);
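
The trailing integer constant on each builtin call encodes the element
flavor rather than data.  Reading across this file, 1 accompanies signed
variants, 0 unsigned, 4 polynomial, and 5 float, while 3 selects the
rounding form (vqrdmulh* passes 3 where vqdmulh* passes 1).  A
hypothetical key, inferred from usage here and with illustrative names
only:

    enum neon_type_flavor {
      NEON_UNSIGNED = 0,
      NEON_SIGNED   = 1,
      NEON_ROUNDING = 3,   /* signed, rounding: vqrdmulh vs. vqdmulh */
      NEON_POLY     = 4,
      NEON_FLOAT    = 5
    };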
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmul_n_s16 (int16x4_t __a, int16_t __b)
-{
-  return (int16x4_t)__builtin_neon_vmul_nv4hi (__a, (__builtin_neon_hi) __b, 1);
-}
+#define vmul_n_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vmul_nv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmul_n_s32 (int32x2_t __a, int32_t __b)
-{
-  return (int32x2_t)__builtin_neon_vmul_nv2si (__a, (__builtin_neon_si) __b, 1);
-}
+#define vmul_n_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vmul_nv2si (__a, __b, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmul_n_f32 (float32x2_t __a, float32_t __b)
-{
-  return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, __b, 5);
-}
+#define vmul_n_f32(__a, __b) \
+  (float32x2_t)__builtin_neon_vmul_nv2sf (__a, __b, 5);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmul_n_u16 (uint16x4_t __a, uint16_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vmul_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b, 0);
-}
+#define vmul_n_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vmul_nv4hi (__a, __b, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmul_n_u32 (uint32x2_t __a, uint32_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vmul_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b, 0);
-}
+#define vmul_n_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vmul_nv2si (__a, __b, 0);
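
The _n (vector-by-scalar) variants also drop the explicit
(__builtin_neon_hi) and (__builtin_neon_si) casts on the scalar operand;
the macro passes __b through unchanged and leaves the narrowing to the
builtin's own prototype.  Assuming the prototype still declares the
parameter with the narrow type, the two spellings are equivalent:

    /* old inline body: explicit narrowing */
    (int16x4_t)__builtin_neon_vmul_nv4hi (__a, (__builtin_neon_hi) __b, 1);
    /* new macro body: the same conversion, now implicit at the call */
    (int16x4_t)__builtin_neon_vmul_nv4hi (__a, __b, 1);

That equivalence is an assumption the patch makes; the preprocessor
cannot enforce it.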
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmulq_n_s16 (int16x8_t __a, int16_t __b)
-{
-  return (int16x8_t)__builtin_neon_vmul_nv8hi (__a, (__builtin_neon_hi) __b, 1);
-}
+#define vmulq_n_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vmul_nv8hi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmulq_n_s32 (int32x4_t __a, int32_t __b)
-{
-  return (int32x4_t)__builtin_neon_vmul_nv4si (__a, (__builtin_neon_si) __b, 1);
-}
+#define vmulq_n_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vmul_nv4si (__a, __b, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmulq_n_f32 (float32x4_t __a, float32_t __b)
-{
-  return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, __b, 5);
-}
+#define vmulq_n_f32(__a, __b) \
+  (float32x4_t)__builtin_neon_vmul_nv4sf (__a, __b, 5);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vmul_nv8hi ((int16x8_t) __a, (__builtin_neon_hi) __b, 0);
-}
+#define vmulq_n_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vmul_nv8hi (__a, __b, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vmul_nv4si ((int32x4_t) __a, (__builtin_neon_si) __b, 0);
-}
+#define vmulq_n_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vmul_nv4si (__a, __b, 0);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmull_n_s16 (int16x4_t __a, int16_t __b)
-{
-  return (int32x4_t)__builtin_neon_vmull_nv4hi (__a, (__builtin_neon_hi) __b, 1);
-}
+#define vmull_n_s16(__a, __b) \
+  (int32x4_t)__builtin_neon_vmull_nv4hi (__a, __b, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmull_n_s32 (int32x2_t __a, int32_t __b)
-{
-  return (int64x2_t)__builtin_neon_vmull_nv2si (__a, (__builtin_neon_si) __b, 1);
-}
+#define vmull_n_s32(__a, __b) \
+  (int64x2_t)__builtin_neon_vmull_nv2si (__a, __b, 1);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmull_n_u16 (uint16x4_t __a, uint16_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vmull_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b, 0);
-}
+#define vmull_n_u16(__a, __b) \
+  (uint32x4_t)__builtin_neon_vmull_nv4hi (__a, __b, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmull_n_u32 (uint32x2_t __a, uint32_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vmull_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b, 0);
-}
+#define vmull_n_u32(__a, __b) \
+  (uint64x2_t)__builtin_neon_vmull_nv2si (__a, __b, 0);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmull_n_s16 (int16x4_t __a, int16_t __b)
-{
-  return (int32x4_t)__builtin_neon_vqdmull_nv4hi (__a, (__builtin_neon_hi) __b, 1);
-}
+#define vqdmull_n_s16(__a, __b) \
+  (int32x4_t)__builtin_neon_vqdmull_nv4hi (__a, __b, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmull_n_s32 (int32x2_t __a, int32_t __b)
-{
-  return (int64x2_t)__builtin_neon_vqdmull_nv2si (__a, (__builtin_neon_si) __b, 1);
-}
+#define vqdmull_n_s32(__a, __b) \
+  (int64x2_t)__builtin_neon_vqdmull_nv2si (__a, __b, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqdmulhq_n_s16 (int16x8_t __a, int16_t __b)
-{
-  return (int16x8_t)__builtin_neon_vqdmulh_nv8hi (__a, (__builtin_neon_hi) __b, 1);
-}
+#define vqdmulhq_n_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vqdmulh_nv8hi (__a, __b, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmulhq_n_s32 (int32x4_t __a, int32_t __b)
-{
-  return (int32x4_t)__builtin_neon_vqdmulh_nv4si (__a, (__builtin_neon_si) __b, 1);
-}
+#define vqdmulhq_n_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vqdmulh_nv4si (__a, __b, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqdmulh_n_s16 (int16x4_t __a, int16_t __b)
-{
-  return (int16x4_t)__builtin_neon_vqdmulh_nv4hi (__a, (__builtin_neon_hi) __b, 1);
-}
+#define vqdmulh_n_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vqdmulh_nv4hi (__a, __b, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqdmulh_n_s32 (int32x2_t __a, int32_t __b)
-{
-  return (int32x2_t)__builtin_neon_vqdmulh_nv2si (__a, (__builtin_neon_si) __b, 1);
-}
+#define vqdmulh_n_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vqdmulh_nv2si (__a, __b, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vqrdmulhq_n_s16 (int16x8_t __a, int16_t __b)
-{
-  return (int16x8_t)__builtin_neon_vqdmulh_nv8hi (__a, (__builtin_neon_hi) __b, 3);
-}
+#define vqrdmulhq_n_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vqdmulh_nv8hi (__a, __b, 3);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqrdmulhq_n_s32 (int32x4_t __a, int32_t __b)
-{
-  return (int32x4_t)__builtin_neon_vqdmulh_nv4si (__a, (__builtin_neon_si) __b, 3);
-}
+#define vqrdmulhq_n_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vqdmulh_nv4si (__a, __b, 3);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vqrdmulh_n_s16 (int16x4_t __a, int16_t __b)
-{
-  return (int16x4_t)__builtin_neon_vqdmulh_nv4hi (__a, (__builtin_neon_hi) __b, 3);
-}
+#define vqrdmulh_n_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vqdmulh_nv4hi (__a, __b, 3);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vqrdmulh_n_s32 (int32x2_t __a, int32_t __b)
-{
-  return (int32x2_t)__builtin_neon_vqdmulh_nv2si (__a, (__builtin_neon_si) __b, 3);
-}
+#define vqrdmulh_n_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vqdmulh_nv2si (__a, __b, 3);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmla_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c)
-{
-  return (int16x4_t)__builtin_neon_vmla_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
-}
+#define vmla_n_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vmla_nv4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmla_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
-{
-  return (int32x2_t)__builtin_neon_vmla_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
-}
+#define vmla_n_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vmla_nv2si (__a, __b, __c, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmla_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
-{
-  return (float32x2_t)__builtin_neon_vmla_nv2sf (__a, __b, __c, 5);
-}
+#define vmla_n_f32(__a, __b, __c) \
+  (float32x2_t)__builtin_neon_vmla_nv2sf (__a, __b, __c, 5);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmla_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c)
-{
-  return (uint16x4_t)__builtin_neon_vmla_nv4hi ((int16x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0);
-}
+#define vmla_n_u16(__a, __b, __c) \
+  (uint16x4_t)__builtin_neon_vmla_nv4hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmla_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c)
-{
-  return (uint32x2_t)__builtin_neon_vmla_nv2si ((int32x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0);
-}
+#define vmla_n_u32(__a, __b, __c) \
+  (uint32x2_t)__builtin_neon_vmla_nv2si (__a, __b, __c, 0);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmlaq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c)
-{
-  return (int16x8_t)__builtin_neon_vmla_nv8hi (__a, __b, (__builtin_neon_hi) __c, 1);
-}
+#define vmlaq_n_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vmla_nv8hi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlaq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c)
-{
-  return (int32x4_t)__builtin_neon_vmla_nv4si (__a, __b, (__builtin_neon_si) __c, 1);
-}
+#define vmlaq_n_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vmla_nv4si (__a, __b, __c, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmlaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
-{
-  return (float32x4_t)__builtin_neon_vmla_nv4sf (__a, __b, __c, 5);
-}
+#define vmlaq_n_f32(__a, __b, __c) \
+  (float32x4_t)__builtin_neon_vmla_nv4sf (__a, __b, __c, 5);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmlaq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c)
-{
-  return (uint16x8_t)__builtin_neon_vmla_nv8hi ((int16x8_t) __a, (int16x8_t) __b, (__builtin_neon_hi) __c, 0);
-}
+#define vmlaq_n_u16(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vmla_nv8hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlaq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c)
-{
-  return (uint32x4_t)__builtin_neon_vmla_nv4si ((int32x4_t) __a, (int32x4_t) __b, (__builtin_neon_si) __c, 0);
-}
+#define vmlaq_n_u32(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vmla_nv4si (__a, __b, __c, 0);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
-{
-  return (int32x4_t)__builtin_neon_vmlal_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
-}
+#define vmlal_n_s16(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vmlal_nv4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
-{
-  return (int64x2_t)__builtin_neon_vmlal_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
-}
+#define vmlal_n_s32(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vmlal_nv2si (__a, __b, __c, 1);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlal_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c)
-{
-  return (uint32x4_t)__builtin_neon_vmlal_nv4hi ((int32x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0);
-}
+#define vmlal_n_u16(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vmlal_nv4hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmlal_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c)
-{
-  return (uint64x2_t)__builtin_neon_vmlal_nv2si ((int64x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0);
-}
+#define vmlal_n_u32(__a, __b, __c) \
+  (uint64x2_t)__builtin_neon_vmlal_nv2si (__a, __b, __c, 0);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
-{
-  return (int32x4_t)__builtin_neon_vqdmlal_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
-}
+#define vqdmlal_n_s16(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vqdmlal_nv4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
-{
-  return (int64x2_t)__builtin_neon_vqdmlal_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
-}
+#define vqdmlal_n_s32(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vqdmlal_nv2si (__a, __b, __c, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vmls_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c)
-{
-  return (int16x4_t)__builtin_neon_vmls_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
-}
+#define vmls_n_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vmls_nv4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vmls_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
-{
-  return (int32x2_t)__builtin_neon_vmls_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
-}
+#define vmls_n_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vmls_nv2si (__a, __b, __c, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vmls_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
-{
-  return (float32x2_t)__builtin_neon_vmls_nv2sf (__a, __b, __c, 5);
-}
+#define vmls_n_f32(__a, __b, __c) \
+  (float32x2_t)__builtin_neon_vmls_nv2sf (__a, __b, __c, 5);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vmls_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c)
-{
-  return (uint16x4_t)__builtin_neon_vmls_nv4hi ((int16x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0);
-}
+#define vmls_n_u16(__a, __b, __c) \
+  (uint16x4_t)__builtin_neon_vmls_nv4hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vmls_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c)
-{
-  return (uint32x2_t)__builtin_neon_vmls_nv2si ((int32x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0);
-}
+#define vmls_n_u32(__a, __b, __c) \
+  (uint32x2_t)__builtin_neon_vmls_nv2si (__a, __b, __c, 0);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vmlsq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c)
-{
-  return (int16x8_t)__builtin_neon_vmls_nv8hi (__a, __b, (__builtin_neon_hi) __c, 1);
-}
+#define vmlsq_n_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vmls_nv8hi (__a, __b, __c, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlsq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c)
-{
-  return (int32x4_t)__builtin_neon_vmls_nv4si (__a, __b, (__builtin_neon_si) __c, 1);
-}
+#define vmlsq_n_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vmls_nv4si (__a, __b, __c, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vmlsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
-{
-  return (float32x4_t)__builtin_neon_vmls_nv4sf (__a, __b, __c, 5);
-}
+#define vmlsq_n_f32(__a, __b, __c) \
+  (float32x4_t)__builtin_neon_vmls_nv4sf (__a, __b, __c, 5);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vmlsq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c)
-{
-  return (uint16x8_t)__builtin_neon_vmls_nv8hi ((int16x8_t) __a, (int16x8_t) __b, (__builtin_neon_hi) __c, 0);
-}
+#define vmlsq_n_u16(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vmls_nv8hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlsq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c)
-{
-  return (uint32x4_t)__builtin_neon_vmls_nv4si ((int32x4_t) __a, (int32x4_t) __b, (__builtin_neon_si) __c, 0);
-}
+#define vmlsq_n_u32(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vmls_nv4si (__a, __b, __c, 0);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
-{
-  return (int32x4_t)__builtin_neon_vmlsl_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
-}
+#define vmlsl_n_s16(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vmlsl_nv4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
-{
-  return (int64x2_t)__builtin_neon_vmlsl_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
-}
+#define vmlsl_n_s32(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vmlsl_nv2si (__a, __b, __c, 1);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vmlsl_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c)
-{
-  return (uint32x4_t)__builtin_neon_vmlsl_nv4hi ((int32x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0);
-}
+#define vmlsl_n_u16(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vmlsl_nv4hi (__a, __b, __c, 0);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vmlsl_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c)
-{
-  return (uint64x2_t)__builtin_neon_vmlsl_nv2si ((int64x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0);
-}
+#define vmlsl_n_u32(__a, __b, __c) \
+  (uint64x2_t)__builtin_neon_vmlsl_nv2si (__a, __b, __c, 0);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
-{
-  return (int32x4_t)__builtin_neon_vqdmlsl_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
-}
+#define vqdmlsl_n_s16(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vqdmlsl_nv4hi (__a, __b, __c, 1);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
-{
-  return (int64x2_t)__builtin_neon_vqdmlsl_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
-}
+#define vqdmlsl_n_s32(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vqdmlsl_nv2si (__a, __b, __c, 1);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vext_s8 (int8x8_t __a, int8x8_t __b, const int __c)
-{
-  return (int8x8_t)__builtin_neon_vextv8qi (__a, __b, __c);
-}
+#define vext_s8(__a, __b, __c) \
+  (int8x8_t)__builtin_neon_vextv8qi (__a, __b, __c);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vext_s16 (int16x4_t __a, int16x4_t __b, const int __c)
-{
-  return (int16x4_t)__builtin_neon_vextv4hi (__a, __b, __c);
-}
+#define vext_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vextv4hi (__a, __b, __c);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vext_s32 (int32x2_t __a, int32x2_t __b, const int __c)
-{
-  return (int32x2_t)__builtin_neon_vextv2si (__a, __b, __c);
-}
+#define vext_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vextv2si (__a, __b, __c);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vext_s64 (int64x1_t __a, int64x1_t __b, const int __c)
-{
-  return (int64x1_t)__builtin_neon_vextdi (__a, __b, __c);
-}
+#define vext_s64(__a, __b, __c) \
+  (int64x1_t)__builtin_neon_vextdi (__a, __b, __c);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vext_f32 (float32x2_t __a, float32x2_t __b, const int __c)
-{
-  return (float32x2_t)__builtin_neon_vextv2sf (__a, __b, __c);
-}
+#define vext_f32(__a, __b, __c) \
+  (float32x2_t)__builtin_neon_vextv2sf (__a, __b, __c);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vext_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
-{
-  return (uint8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
-}
+#define vext_u8(__a, __b, __c) \
+  (uint8x8_t)__builtin_neon_vextv8qi (__a, __b, __c);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vext_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
-{
-  return (uint16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
-}
+#define vext_u16(__a, __b, __c) \
+  (uint16x4_t)__builtin_neon_vextv4hi (__a, __b, __c);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vext_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
-{
-  return (uint32x2_t)__builtin_neon_vextv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
-}
+#define vext_u32(__a, __b, __c) \
+  (uint32x2_t)__builtin_neon_vextv2si (__a, __b, __c);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vext_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
-{
-  return (uint64x1_t)__builtin_neon_vextdi ((int64x1_t) __a, (int64x1_t) __b, __c);
-}
+#define vext_u64(__a, __b, __c) \
+  (uint64x1_t)__builtin_neon_vextdi (__a, __b, __c);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vext_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
-{
-  return (poly8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
-}
+#define vext_p8(__a, __b, __c) \
+  (poly8x8_t)__builtin_neon_vextv8qi (__a, __b, __c);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vext_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
-{
-  return (poly16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
-}
+#define vext_p16(__a, __b, __c) \
+  (poly16x4_t)__builtin_neon_vextv4hi (__a, __b, __c);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vextq_s8 (int8x16_t __a, int8x16_t __b, const int __c)
-{
-  return (int8x16_t)__builtin_neon_vextv16qi (__a, __b, __c);
-}
+#define vextq_s8(__a, __b, __c) \
+  (int8x16_t)__builtin_neon_vextv16qi (__a, __b, __c);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vextq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
-{
-  return (int16x8_t)__builtin_neon_vextv8hi (__a, __b, __c);
-}
+#define vextq_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vextv8hi (__a, __b, __c);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vextq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
-{
-  return (int32x4_t)__builtin_neon_vextv4si (__a, __b, __c);
-}
+#define vextq_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vextv4si (__a, __b, __c);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vextq_s64 (int64x2_t __a, int64x2_t __b, const int __c)
-{
-  return (int64x2_t)__builtin_neon_vextv2di (__a, __b, __c);
-}
+#define vextq_s64(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vextv2di (__a, __b, __c);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vextq_f32 (float32x4_t __a, float32x4_t __b, const int __c)
-{
-  return (float32x4_t)__builtin_neon_vextv4sf (__a, __b, __c);
-}
+#define vextq_f32(__a, __b, __c) \
+  (float32x4_t)__builtin_neon_vextv4sf (__a, __b, __c);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vextq_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
-{
-  return (uint8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
-}
+#define vextq_u8(__a, __b, __c) \
+  (uint8x16_t)__builtin_neon_vextv16qi (__a, __b, __c);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vextq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
-{
-  return (uint16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
-}
+#define vextq_u16(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vextv8hi (__a, __b, __c);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vextq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
-{
-  return (uint32x4_t)__builtin_neon_vextv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
-}
+#define vextq_u32(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vextv4si (__a, __b, __c);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vextq_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
-{
-  return (uint64x2_t)__builtin_neon_vextv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
-}
+#define vextq_u64(__a, __b, __c) \
+  (uint64x2_t)__builtin_neon_vextv2di (__a, __b, __c);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vextq_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
-{
-  return (poly8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
-}
+#define vextq_p8(__a, __b, __c) \
+  (poly8x16_t)__builtin_neon_vextv16qi (__a, __b, __c);
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vextq_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
-{
-  return (poly16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
-}
+#define vextq_p16(__a, __b, __c) \
+  (poly16x8_t)__builtin_neon_vextv8hi (__a, __b, __c);
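
vext concatenates its two operands and extracts a vector starting __c
elements into the first one, the standard idiom for a sliding window
over loaded data.  For example,

    /* bytes 3..10 of the 16-byte concatenation {__a, __b} */
    uint8x8_t __w = vext_u8 (__a, __b, 3);

and the macro forwards the literal 3 straight to
__builtin_neon_vextv8qi as the immediate operand.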
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrev64_s8 (int8x8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vrev64v8qi (__a, 1);
-}
+#define vrev64_s8(__a) \
+  (int8x8_t)__builtin_neon_vrev64v8qi (__a, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrev64_s16 (int16x4_t __a)
-{
-  return (int16x4_t)__builtin_neon_vrev64v4hi (__a, 1);
-}
+#define vrev64_s16(__a) \
+  (int16x4_t)__builtin_neon_vrev64v4hi (__a, 1);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vrev64_s32 (int32x2_t __a)
-{
-  return (int32x2_t)__builtin_neon_vrev64v2si (__a, 1);
-}
+#define vrev64_s32(__a) \
+  (int32x2_t)__builtin_neon_vrev64v2si (__a, 1);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vrev64_f32 (float32x2_t __a)
-{
-  return (float32x2_t)__builtin_neon_vrev64v2sf (__a, 5);
-}
+#define vrev64_f32(__a) \
+  (float32x2_t)__builtin_neon_vrev64v2sf (__a, 5);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrev64_u8 (uint8x8_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 0);
-}
+#define vrev64_u8(__a) \
+  (uint8x8_t)__builtin_neon_vrev64v8qi (__a, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrev64_u16 (uint16x4_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 0);
-}
+#define vrev64_u16(__a) \
+  (uint16x4_t)__builtin_neon_vrev64v4hi (__a, 0);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vrev64_u32 (uint32x2_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vrev64v2si ((int32x2_t) __a, 0);
-}
+#define vrev64_u32(__a) \
+  (uint32x2_t)__builtin_neon_vrev64v2si (__a, 0);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vrev64_p8 (poly8x8_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 4);
-}
+#define vrev64_p8(__a) \
+  (poly8x8_t)__builtin_neon_vrev64v8qi (__a, 4);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vrev64_p16 (poly16x4_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 4);
-}
+#define vrev64_p16(__a) \
+  (poly16x4_t)__builtin_neon_vrev64v4hi (__a, 4);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrev64q_s8 (int8x16_t __a)
-{
-  return (int8x16_t)__builtin_neon_vrev64v16qi (__a, 1);
-}
+#define vrev64q_s8(__a) \
+  (int8x16_t)__builtin_neon_vrev64v16qi (__a, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrev64q_s16 (int16x8_t __a)
-{
-  return (int16x8_t)__builtin_neon_vrev64v8hi (__a, 1);
-}
+#define vrev64q_s16(__a) \
+  (int16x8_t)__builtin_neon_vrev64v8hi (__a, 1);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vrev64q_s32 (int32x4_t __a)
-{
-  return (int32x4_t)__builtin_neon_vrev64v4si (__a, 1);
-}
+#define vrev64q_s32(__a) \
+  (int32x4_t)__builtin_neon_vrev64v4si (__a, 1);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vrev64q_f32 (float32x4_t __a)
-{
-  return (float32x4_t)__builtin_neon_vrev64v4sf (__a, 5);
-}
+#define vrev64q_f32(__a) \
+  (float32x4_t)__builtin_neon_vrev64v4sf (__a, 5);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrev64q_u8 (uint8x16_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 0);
-}
+#define vrev64q_u8(__a) \
+  (uint8x16_t)__builtin_neon_vrev64v16qi (__a, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrev64q_u16 (uint16x8_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 0);
-}
+#define vrev64q_u16(__a) \
+  (uint16x8_t)__builtin_neon_vrev64v8hi (__a, 0);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vrev64q_u32 (uint32x4_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vrev64v4si ((int32x4_t) __a, 0);
-}
+#define vrev64q_u32(__a) \
+  (uint32x4_t)__builtin_neon_vrev64v4si (__a, 0);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vrev64q_p8 (poly8x16_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 4);
-}
+#define vrev64q_p8(__a) \
+  (poly8x16_t)__builtin_neon_vrev64v16qi (__a, 4);
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vrev64q_p16 (poly16x8_t __a)
-{
-  return (poly16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 4);
-}
+#define vrev64q_p16(__a) \
+  (poly16x8_t)__builtin_neon_vrev64v8hi (__a, 4);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrev32_s8 (int8x8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vrev32v8qi (__a, 1);
-}
+#define vrev32_s8(__a) \
+  (int8x8_t)__builtin_neon_vrev32v8qi (__a, 1);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vrev32_s16 (int16x4_t __a)
-{
-  return (int16x4_t)__builtin_neon_vrev32v4hi (__a, 1);
-}
+#define vrev32_s16(__a) \
+  (int16x4_t)__builtin_neon_vrev32v4hi (__a, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrev32_u8 (uint8x8_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 0);
-}
+#define vrev32_u8(__a) \
+  (uint8x8_t)__builtin_neon_vrev32v8qi (__a, 0);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vrev32_u16 (uint16x4_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 0);
-}
+#define vrev32_u16(__a) \
+  (uint16x4_t)__builtin_neon_vrev32v4hi (__a, 0);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vrev32_p8 (poly8x8_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 4);
-}
+#define vrev32_p8(__a) \
+  (poly8x8_t)__builtin_neon_vrev32v8qi (__a, 4);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vrev32_p16 (poly16x4_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 4);
-}
+#define vrev32_p16(__a) \
+  (poly16x4_t)__builtin_neon_vrev32v4hi (__a, 4);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrev32q_s8 (int8x16_t __a)
-{
-  return (int8x16_t)__builtin_neon_vrev32v16qi (__a, 1);
-}
+#define vrev32q_s8(__a) \
+  (int8x16_t)__builtin_neon_vrev32v16qi (__a, 1);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vrev32q_s16 (int16x8_t __a)
-{
-  return (int16x8_t)__builtin_neon_vrev32v8hi (__a, 1);
-}
+#define vrev32q_s16(__a) \
+  (int16x8_t)__builtin_neon_vrev32v8hi (__a, 1);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrev32q_u8 (uint8x16_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 0);
-}
+#define vrev32q_u8(__a) \
+  (uint8x16_t)__builtin_neon_vrev32v16qi (__a, 0);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vrev32q_u16 (uint16x8_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 0);
-}
+#define vrev32q_u16(__a) \
+  (uint16x8_t)__builtin_neon_vrev32v8hi (__a, 0);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vrev32q_p8 (poly8x16_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 4);
-}
+#define vrev32q_p8(__a) \
+  (poly8x16_t)__builtin_neon_vrev32v16qi (__a, 4);
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vrev32q_p16 (poly16x8_t __a)
-{
-  return (poly16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 4);
-}
+#define vrev32q_p16(__a) \
+  (poly16x8_t)__builtin_neon_vrev32v8hi (__a, 4);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vrev16_s8 (int8x8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vrev16v8qi (__a, 1);
-}
+#define vrev16_s8(__a) \
+  (int8x8_t)__builtin_neon_vrev16v8qi (__a, 1);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vrev16_u8 (uint8x8_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 0);
-}
+#define vrev16_u8(__a) \
+  (uint8x8_t)__builtin_neon_vrev16v8qi (__a, 0);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vrev16_p8 (poly8x8_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 4);
-}
+#define vrev16_p8(__a) \
+  (poly8x8_t)__builtin_neon_vrev16v8qi (__a, 4);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vrev16q_s8 (int8x16_t __a)
-{
-  return (int8x16_t)__builtin_neon_vrev16v16qi (__a, 1);
-}
+#define vrev16q_s8(__a) \
+  (int8x16_t)__builtin_neon_vrev16v16qi (__a, 1);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vrev16q_u8 (uint8x16_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 0);
-}
+#define vrev16q_u8(__a) \
+  (uint8x16_t)__builtin_neon_vrev16v16qi (__a, 0);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vrev16q_p8 (poly8x16_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 4);
-}
+#define vrev16q_p8(__a) \
+  (poly8x16_t)__builtin_neon_vrev16v16qi (__a, 4);
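
[Usage sketch, not in the patch: vrev64/vrev32/vrev16 reverse the
elements within each 64-, 32-, or 16-bit group, so vrev32q_u8
byte-swaps every 32-bit word in a q register.  Function name is
illustrative.]

#include <arm_neon.h>

/* Endianness swap of four packed 32-bit words at once.  */
uint8x16_t bswap32x4 (uint8x16_t __v)
{
  return vrev32q_u8 (__v);
}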
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vbsl_s8 (uint8x8_t __a, int8x8_t __b, int8x8_t __c)
-{
-  return (int8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, __b, __c);
-}
+#define vbsl_s8(__a, __b, __c) \
+  (int8x8_t)__builtin_neon_vbslv8qi (__a, __b, __c);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vbsl_s16 (uint16x4_t __a, int16x4_t __b, int16x4_t __c)
-{
-  return (int16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, __b, __c);
-}
+#define vbsl_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vbslv4hi (__a, __b, __c);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vbsl_s32 (uint32x2_t __a, int32x2_t __b, int32x2_t __c)
-{
-  return (int32x2_t)__builtin_neon_vbslv2si ((int32x2_t) __a, __b, __c);
-}
+#define vbsl_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vbslv2si (__a, __b, __c);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vbsl_s64 (uint64x1_t __a, int64x1_t __b, int64x1_t __c)
-{
-  return (int64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, __b, __c);
-}
+#define vbsl_s64(__a, __b, __c) \
+  (int64x1_t)__builtin_neon_vbsldi (__a, __b, __c);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vbsl_f32 (uint32x2_t __a, float32x2_t __b, float32x2_t __c)
-{
-  return (float32x2_t)__builtin_neon_vbslv2sf ((int32x2_t) __a, __b, __c);
-}
+#define vbsl_f32(__a, __b, __c) \
+  (float32x2_t)__builtin_neon_vbslv2sf (__a, __b, __c);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vbsl_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
-{
-  return (uint8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
-}
+#define vbsl_u8(__a, __b, __c) \
+  (uint8x8_t)__builtin_neon_vbslv8qi (__a, __b, __c);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vbsl_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
-{
-  return (uint16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
-}
+#define vbsl_u16(__a, __b, __c) \
+  (uint16x4_t)__builtin_neon_vbslv4hi (__a, __b, __c);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vbsl_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
-{
-  return (uint32x2_t)__builtin_neon_vbslv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
-}
+#define vbsl_u32(__a, __b, __c) \
+  (uint32x2_t)__builtin_neon_vbslv2si (__a, __b, __c);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vbsl_u64 (uint64x1_t __a, uint64x1_t __b, uint64x1_t __c)
-{
-  return (uint64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, (int64x1_t) __b, (int64x1_t) __c);
-}
+#define vbsl_u64(__a, __b, __c) \
+  (uint64x1_t)__builtin_neon_vbsldi (__a, __b, __c);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vbsl_p8 (uint8x8_t __a, poly8x8_t __b, poly8x8_t __c)
-{
-  return (poly8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
-}
+#define vbsl_p8(__a, __b, __c) \
+  (poly8x8_t)__builtin_neon_vbslv8qi (__a, __b, __c);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vbsl_p16 (uint16x4_t __a, poly16x4_t __b, poly16x4_t __c)
-{
-  return (poly16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
-}
+#define vbsl_p16(__a, __b, __c) \
+  (poly16x4_t)__builtin_neon_vbslv4hi (__a, __b, __c);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vbslq_s8 (uint8x16_t __a, int8x16_t __b, int8x16_t __c)
-{
-  return (int8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, __b, __c);
-}
+#define vbslq_s8(__a, __b, __c) \
+  (int8x16_t)__builtin_neon_vbslv16qi (__a, __b, __c);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vbslq_s16 (uint16x8_t __a, int16x8_t __b, int16x8_t __c)
-{
-  return (int16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, __b, __c);
-}
+#define vbslq_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vbslv8hi (__a, __b, __c);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vbslq_s32 (uint32x4_t __a, int32x4_t __b, int32x4_t __c)
-{
-  return (int32x4_t)__builtin_neon_vbslv4si ((int32x4_t) __a, __b, __c);
-}
+#define vbslq_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vbslv4si (__a, __b, __c);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vbslq_s64 (uint64x2_t __a, int64x2_t __b, int64x2_t __c)
-{
-  return (int64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, __b, __c);
-}
+#define vbslq_s64(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vbslv2di (__a, __b, __c);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vbslq_f32 (uint32x4_t __a, float32x4_t __b, float32x4_t __c)
-{
-  return (float32x4_t)__builtin_neon_vbslv4sf ((int32x4_t) __a, __b, __c);
-}
+#define vbslq_f32(__a, __b, __c) \
+  (float32x4_t)__builtin_neon_vbslv4sf (__a, __b, __c);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vbslq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
-{
-  return (uint8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
-}
+#define vbslq_u8(__a, __b, __c) \
+  (uint8x16_t)__builtin_neon_vbslv16qi (__a, __b, __c);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vbslq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
-{
-  return (uint16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
-}
+#define vbslq_u16(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vbslv8hi (__a, __b, __c);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vbslq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
-{
-  return (uint32x4_t)__builtin_neon_vbslv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c);
-}
+#define vbslq_u32(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vbslv4si (__a, __b, __c);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vbslq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c)
-{
-  return (uint64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, (int64x2_t) __b, (int64x2_t) __c);
-}
+#define vbslq_u64(__a, __b, __c) \
+  (uint64x2_t)__builtin_neon_vbslv2di (__a, __b, __c);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vbslq_p8 (uint8x16_t __a, poly8x16_t __b, poly8x16_t __c)
-{
-  return (poly8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
-}
+#define vbslq_p8(__a, __b, __c) \
+  (poly8x16_t)__builtin_neon_vbslv16qi (__a, __b, __c);
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vbslq_p16 (uint16x8_t __a, poly16x8_t __b, poly16x8_t __c)
-{
-  return (poly16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
-}
+#define vbslq_p16(__a, __b, __c) \
+  (poly16x8_t)__builtin_neon_vbslv8hi (__a, __b, __c);
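
[Usage sketch, not in the patch: vbsl is a bitwise select -- result
bits come from the second operand where the mask bit is 1 and from
the third where it is 0 -- handy for branchless per-lane selection.
Function name is illustrative.]

#include <arm_neon.h>

/* Branchless per-lane max: comparison lanes are all-ones where __x > __y.  */
float32x4_t max4 (float32x4_t __x, float32x4_t __y)
{
  uint32x4_t __m = vcgtq_f32 (__x, __y);
  return vbslq_f32 (__m, __x, __y);
}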
 
-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-vtrn_s8 (int8x8_t __a, int8x8_t __b)
-{
-  int8x8x2_t __rv;
-  __builtin_neon_vtrnv8qi (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vtrn_s8(__a, __b) \
+  ({ \
+     int8x8x2_t __rv; \
+     __builtin_neon_vtrnv8qi (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
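
[Note, not in the patch: the transpose/zip/unzip intrinsics return a
two-vector struct, so their macro versions use GNU C statement
expressions -- ({ ... }) evaluates to its final expression -- to
yield the struct by value.  A sketch with an illustrative function
name:]

#include <arm_neon.h>

/* __t.val[0] and __t.val[1] receive the two transposed results.  */
int8x8x2_t transpose8 (int8x8_t __a, int8x8_t __b)
{
  int8x8x2_t __t = vtrn_s8 (__a, __b);
  return __t;
}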
 
-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-vtrn_s16 (int16x4_t __a, int16x4_t __b)
-{
-  int16x4x2_t __rv;
-  __builtin_neon_vtrnv4hi (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vtrn_s16(__a, __b) \
+  ({ \
+     int16x4x2_t __rv; \
+     __builtin_neon_vtrnv4hi (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-vtrn_s32 (int32x2_t __a, int32x2_t __b)
-{
-  int32x2x2_t __rv;
-  __builtin_neon_vtrnv2si (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vtrn_s32(__a, __b) \
+  ({ \
+     int32x2x2_t __rv; \
+     __builtin_neon_vtrnv2si (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-vtrn_f32 (float32x2_t __a, float32x2_t __b)
-{
-  float32x2x2_t __rv;
-  __builtin_neon_vtrnv2sf (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vtrn_f32(__a, __b) \
+  ({ \
+     float32x2x2_t __rv; \
+     __builtin_neon_vtrnv2sf (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-vtrn_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  uint8x8x2_t __rv;
-  __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
-  return __rv;
-}
+#define vtrn_u8(__a, __b) \
+  ({ \
+     uint8x8x2_t __rv; \
+     __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-vtrn_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  uint16x4x2_t __rv;
-  __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
-  return __rv;
-}
+#define vtrn_u16(__a, __b) \
+  ({ \
+     uint16x4x2_t __rv; \
+     __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-vtrn_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  uint32x2x2_t __rv;
-  __builtin_neon_vtrnv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
-  return __rv;
-}
+#define vtrn_u32(__a, __b) \
+  ({ \
+     uint32x2x2_t __rv; \
+     __builtin_neon_vtrnv2si ((int32x2_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-vtrn_p8 (poly8x8_t __a, poly8x8_t __b)
-{
-  poly8x8x2_t __rv;
-  __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
-  return __rv;
-}
+#define vtrn_p8(__a, __b) \
+  ({ \
+     poly8x8x2_t __rv; \
+     __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-vtrn_p16 (poly16x4_t __a, poly16x4_t __b)
-{
-  poly16x4x2_t __rv;
-  __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
-  return __rv;
-}
+#define vtrn_p16(__a, __b) \
+  ({ \
+     poly16x4x2_t __rv; \
+     __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
-vtrnq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  int8x16x2_t __rv;
-  __builtin_neon_vtrnv16qi (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vtrnq_s8(__a, __b) \
+  ({ \
+     int8x16x2_t __rv; \
+     __builtin_neon_vtrnv16qi (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
-vtrnq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  int16x8x2_t __rv;
-  __builtin_neon_vtrnv8hi (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vtrnq_s16(__a, __b) \
+  ({ \
+     int16x8x2_t __rv; \
+     __builtin_neon_vtrnv8hi (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
-vtrnq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  int32x4x2_t __rv;
-  __builtin_neon_vtrnv4si (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vtrnq_s32(__a, __b) \
+  ({ \
+     int32x4x2_t __rv; \
+     __builtin_neon_vtrnv4si (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
-vtrnq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  float32x4x2_t __rv;
-  __builtin_neon_vtrnv4sf (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vtrnq_f32(__a, __b) \
+  ({ \
+     float32x4x2_t __rv; \
+     __builtin_neon_vtrnv4sf (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
-vtrnq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  uint8x16x2_t __rv;
-  __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
-  return __rv;
-}
+#define vtrnq_u8(__a, __b) \
+  ({ \
+     uint8x16x2_t __rv; \
+     __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
-vtrnq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  uint16x8x2_t __rv;
-  __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
-  return __rv;
-}
+#define vtrnq_u16(__a, __b) \
+  ({ \
+     uint16x8x2_t __rv; \
+     __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
-vtrnq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  uint32x4x2_t __rv;
-  __builtin_neon_vtrnv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
-  return __rv;
-}
+#define vtrnq_u32(__a, __b) \
+  ({ \
+     uint32x4x2_t __rv; \
+     __builtin_neon_vtrnv4si ((int32x4_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
-vtrnq_p8 (poly8x16_t __a, poly8x16_t __b)
-{
-  poly8x16x2_t __rv;
-  __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
-  return __rv;
-}
+#define vtrnq_p8(__a, __b) \
+  ({ \
+     poly8x16x2_t __rv; \
+     __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
-vtrnq_p16 (poly16x8_t __a, poly16x8_t __b)
-{
-  poly16x8x2_t __rv;
-  __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
-  return __rv;
-}
+#define vtrnq_p16(__a, __b) \
+  ({ \
+     poly16x8x2_t __rv; \
+     __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-vzip_s8 (int8x8_t __a, int8x8_t __b)
-{
-  int8x8x2_t __rv;
-  __builtin_neon_vzipv8qi (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vzip_s8(__a, __b) \
+  ({ \
+     int8x8x2_t __rv; \
+     __builtin_neon_vzipv8qi (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-vzip_s16 (int16x4_t __a, int16x4_t __b)
-{
-  int16x4x2_t __rv;
-  __builtin_neon_vzipv4hi (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vzip_s16(__a, __b) \
+  ({ \
+     int16x4x2_t __rv; \
+     __builtin_neon_vzipv4hi (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-vzip_s32 (int32x2_t __a, int32x2_t __b)
-{
-  int32x2x2_t __rv;
-  __builtin_neon_vzipv2si (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vzip_s32(__a, __b) \
+  ({ \
+     int32x2x2_t __rv; \
+     __builtin_neon_vzipv2si (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-vzip_f32 (float32x2_t __a, float32x2_t __b)
-{
-  float32x2x2_t __rv;
-  __builtin_neon_vzipv2sf (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vzip_f32(__a, __b) \
+  ({ \
+     float32x2x2_t __rv; \
+     __builtin_neon_vzipv2sf (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-vzip_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  uint8x8x2_t __rv;
-  __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
-  return __rv;
-}
+#define vzip_u8(__a, __b) \
+  ({ \
+     uint8x8x2_t __rv; \
+     __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-vzip_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  uint16x4x2_t __rv;
-  __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
-  return __rv;
-}
+#define vzip_u16(__a, __b) \
+  ({ \
+     uint16x4x2_t __rv; \
+     __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-vzip_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  uint32x2x2_t __rv;
-  __builtin_neon_vzipv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
-  return __rv;
-}
+#define vzip_u32(__a, __b) \
+  ({ \
+     uint32x2x2_t __rv; \
+     __builtin_neon_vzipv2si ((int32x2_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-vzip_p8 (poly8x8_t __a, poly8x8_t __b)
-{
-  poly8x8x2_t __rv;
-  __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
-  return __rv;
-}
+#define vzip_p8(__a, __b) \
+  ({ \
+     poly8x8x2_t __rv; \
+     __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-vzip_p16 (poly16x4_t __a, poly16x4_t __b)
-{
-  poly16x4x2_t __rv;
-  __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
-  return __rv;
-}
+#define vzip_p16(__a, __b) \
+  ({ \
+     poly16x4x2_t __rv; \
+     __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
-vzipq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  int8x16x2_t __rv;
-  __builtin_neon_vzipv16qi (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vzipq_s8(__a, __b) \
+  ({ \
+     int8x16x2_t __rv; \
+     __builtin_neon_vzipv16qi (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
-vzipq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  int16x8x2_t __rv;
-  __builtin_neon_vzipv8hi (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vzipq_s16(__a, __b) \
+  ({ \
+     int16x8x2_t __rv; \
+     __builtin_neon_vzipv8hi (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
-vzipq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  int32x4x2_t __rv;
-  __builtin_neon_vzipv4si (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vzipq_s32(__a, __b) \
+  ({ \
+     int32x4x2_t __rv; \
+     __builtin_neon_vzipv4si (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
-vzipq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  float32x4x2_t __rv;
-  __builtin_neon_vzipv4sf (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vzipq_f32(__a, __b) \
+  ({ \
+     float32x4x2_t __rv; \
+     __builtin_neon_vzipv4sf (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
-vzipq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  uint8x16x2_t __rv;
-  __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
-  return __rv;
-}
+#define vzipq_u8(__a, __b) \
+  ({ \
+     uint8x16x2_t __rv; \
+     __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
-vzipq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  uint16x8x2_t __rv;
-  __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
-  return __rv;
-}
+#define vzipq_u16(__a, __b) \
+  ({ \
+     uint16x8x2_t __rv; \
+     __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
-vzipq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  uint32x4x2_t __rv;
-  __builtin_neon_vzipv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
-  return __rv;
-}
+#define vzipq_u32(__a, __b) \
+  ({ \
+     uint32x4x2_t __rv; \
+     __builtin_neon_vzipv4si ((int32x4_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
-vzipq_p8 (poly8x16_t __a, poly8x16_t __b)
-{
-  poly8x16x2_t __rv;
-  __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
-  return __rv;
-}
+#define vzipq_p8(__a, __b) \
+  ({ \
+     poly8x16x2_t __rv; \
+     __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
-vzipq_p16 (poly16x8_t __a, poly16x8_t __b)
-{
-  poly16x8x2_t __rv;
-  __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
-  return __rv;
-}
+#define vzipq_p16(__a, __b) \
+  ({ \
+     poly16x8x2_t __rv; \
+     __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-vuzp_s8 (int8x8_t __a, int8x8_t __b)
-{
-  int8x8x2_t __rv;
-  __builtin_neon_vuzpv8qi (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vuzp_s8(__a, __b) \
+  ({ \
+     int8x8x2_t __rv; \
+     __builtin_neon_vuzpv8qi (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-vuzp_s16 (int16x4_t __a, int16x4_t __b)
-{
-  int16x4x2_t __rv;
-  __builtin_neon_vuzpv4hi (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vuzp_s16(__a, __b) \
+  ({ \
+     int16x4x2_t __rv; \
+     __builtin_neon_vuzpv4hi (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-vuzp_s32 (int32x2_t __a, int32x2_t __b)
-{
-  int32x2x2_t __rv;
-  __builtin_neon_vuzpv2si (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vuzp_s32(__a, __b) \
+  ({ \
+     int32x2x2_t __rv; \
+     __builtin_neon_vuzpv2si (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-vuzp_f32 (float32x2_t __a, float32x2_t __b)
-{
-  float32x2x2_t __rv;
-  __builtin_neon_vuzpv2sf (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vuzp_f32(__a, __b) \
+  ({ \
+     float32x2x2_t __rv; \
+     __builtin_neon_vuzpv2sf (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-vuzp_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  uint8x8x2_t __rv;
-  __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
-  return __rv;
-}
+#define vuzp_u8(__a, __b) \
+  ({ \
+     uint8x8x2_t __rv; \
+     __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-vuzp_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  uint16x4x2_t __rv;
-  __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
-  return __rv;
-}
+#define vuzp_u16(__a, __b) \
+  ({ \
+     uint16x4x2_t __rv; \
+     __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-vuzp_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  uint32x2x2_t __rv;
-  __builtin_neon_vuzpv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
-  return __rv;
-}
+#define vuzp_u32(__a, __b) \
+  ({ \
+     uint32x2x2_t __rv; \
+     __builtin_neon_vuzpv2si ((int32x2_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-vuzp_p8 (poly8x8_t __a, poly8x8_t __b)
-{
-  poly8x8x2_t __rv;
-  __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
-  return __rv;
-}
+#define vuzp_p8(__a, __b) \
+  ({ \
+     poly8x8x2_t __rv; \
+     __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-vuzp_p16 (poly16x4_t __a, poly16x4_t __b)
-{
-  poly16x4x2_t __rv;
-  __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
-  return __rv;
-}
+#define vuzp_p16(__a, __b) \
+  ({ \
+     poly16x4x2_t __rv; \
+     __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
-vuzpq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  int8x16x2_t __rv;
-  __builtin_neon_vuzpv16qi (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vuzpq_s8(__a, __b) \
+  ({ \
+     int8x16x2_t __rv; \
+     __builtin_neon_vuzpv16qi (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
-vuzpq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  int16x8x2_t __rv;
-  __builtin_neon_vuzpv8hi (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vuzpq_s16(__a, __b) \
+  ({ \
+     int16x8x2_t __rv; \
+     __builtin_neon_vuzpv8hi (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
-vuzpq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  int32x4x2_t __rv;
-  __builtin_neon_vuzpv4si (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vuzpq_s32(__a, __b) \
+  ({ \
+     int32x4x2_t __rv; \
+     __builtin_neon_vuzpv4si (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
-vuzpq_f32 (float32x4_t __a, float32x4_t __b)
-{
-  float32x4x2_t __rv;
-  __builtin_neon_vuzpv4sf (&__rv.val[0], __a, __b);
-  return __rv;
-}
+#define vuzpq_f32(__a, __b) \
+  ({ \
+     float32x4x2_t __rv; \
+     __builtin_neon_vuzpv4sf (&__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
-vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  uint8x16x2_t __rv;
-  __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
-  return __rv;
-}
+#define vuzpq_u8(__a, __b) \
+  ({ \
+     uint8x16x2_t __rv; \
+     __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
-vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  uint16x8x2_t __rv;
-  __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
-  return __rv;
-}
+#define vuzpq_u16(__a, __b) \
+  ({ \
+     uint16x8x2_t __rv; \
+     __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
-vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  uint32x4x2_t __rv;
-  __builtin_neon_vuzpv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
-  return __rv;
-}
+#define vuzpq_u32(__a, __b) \
+  ({ \
+     uint32x4x2_t __rv; \
+     __builtin_neon_vuzpv4si ((int32x4_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
-vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
-{
-  poly8x16x2_t __rv;
-  __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
-  return __rv;
-}
+#define vuzpq_p8(__a, __b) \
+  ({ \
+     poly8x16x2_t __rv; \
+     __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
 
-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
-vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
-{
-  poly16x8x2_t __rv;
-  __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
-  return __rv;
-}
+#define vuzpq_p16(__a, __b) \
+  ({ \
+     poly16x8x2_t __rv; \
+     __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], __a, __b); \
+     __rv; \
+   })
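
[Usage sketch, not in the patch: vzip interleaves two vectors and
vuzp de-interleaves them, e.g. separating L/R-interleaved stereo
samples.  Function name is illustrative.]

#include <arm_neon.h>

/* val[0] collects the even-indexed (left) samples, val[1] the odd (right).  */
int16x4x2_t split_stereo (int16x4_t __lo, int16x4_t __hi)
{
  return vuzp_s16 (__lo, __hi);
}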
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vld1_s8 (const int8_t * __a)
-{
-  return (int8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
-}
+#define vld1_s8(__a) \
+  (int8x8_t)__builtin_neon_vld1v8qi (__a);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vld1_s16 (const int16_t * __a)
-{
-  return (int16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
-}
+#define vld1_s16(__a) \
+  (int16x4_t)__builtin_neon_vld1v4hi (__a);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vld1_s32 (const int32_t * __a)
-{
-  return (int32x2_t)__builtin_neon_vld1v2si ((const __builtin_neon_si *) __a);
-}
+#define vld1_s32(__a) \
+  (int32x2_t)__builtin_neon_vld1v2si (__a);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vld1_s64 (const int64_t * __a)
-{
-  return (int64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
-}
+#define vld1_s64(__a) \
+  (int64x1_t)__builtin_neon_vld1di (__a);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vld1_f32 (const float32_t * __a)
-{
-  return (float32x2_t)__builtin_neon_vld1v2sf (__a);
-}
+#define vld1_f32(__a) \
+  (float32x2_t)__builtin_neon_vld1v2sf (__a);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vld1_u8 (const uint8_t * __a)
-{
-  return (uint8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
-}
+#define vld1_u8(__a) \
+  (uint8x8_t)__builtin_neon_vld1v8qi (__a);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vld1_u16 (const uint16_t * __a)
-{
-  return (uint16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
-}
+#define vld1_u16(__a) \
+  (uint16x4_t)__builtin_neon_vld1v4hi (__a);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vld1_u32 (const uint32_t * __a)
-{
-  return (uint32x2_t)__builtin_neon_vld1v2si ((const __builtin_neon_si *) __a);
-}
+#define vld1_u32(__a) \
+  (uint32x2_t)__builtin_neon_vld1v2si (__a);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vld1_u64 (const uint64_t * __a)
-{
-  return (uint64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
-}
+#define vld1_u64(__a) \
+  (uint64x1_t)__builtin_neon_vld1di (__a);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vld1_p8 (const poly8_t * __a)
-{
-  return (poly8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
-}
+#define vld1_p8(__a) \
+  (poly8x8_t)__builtin_neon_vld1v8qi (__a);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vld1_p16 (const poly16_t * __a)
-{
-  return (poly16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
-}
+#define vld1_p16(__a) \
+  (poly16x4_t)__builtin_neon_vld1v4hi (__a);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vld1q_s8 (const int8_t * __a)
-{
-  return (int8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a);
-}
+#define vld1q_s8(__a) \
+  (int8x16_t)__builtin_neon_vld1v16qi (__a);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vld1q_s16 (const int16_t * __a)
-{
-  return (int16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a);
-}
+#define vld1q_s16(__a) \
+  (int16x8_t)__builtin_neon_vld1v8hi (__a);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vld1q_s32 (const int32_t * __a)
-{
-  return (int32x4_t)__builtin_neon_vld1v4si ((const __builtin_neon_si *) __a);
-}
+#define vld1q_s32(__a) \
+  (int32x4_t)__builtin_neon_vld1v4si (__a);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vld1q_s64 (const int64_t * __a)
-{
-  return (int64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a);
-}
+#define vld1q_s64(__a) \
+  (int64x2_t)__builtin_neon_vld1v2di (__a);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vld1q_f32 (const float32_t * __a)
-{
-  return (float32x4_t)__builtin_neon_vld1v4sf (__a);
-}
+#define vld1q_f32(__a) \
+  (float32x4_t)__builtin_neon_vld1v4sf (__a);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vld1q_u8 (const uint8_t * __a)
-{
-  return (uint8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a);
-}
+#define vld1q_u8(__a) \
+  (uint8x16_t)__builtin_neon_vld1v16qi (__a);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vld1q_u16 (const uint16_t * __a)
-{
-  return (uint16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a);
-}
+#define vld1q_u16(__a) \
+  (uint16x8_t)__builtin_neon_vld1v8hi (__a);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vld1q_u32 (const uint32_t * __a)
-{
-  return (uint32x4_t)__builtin_neon_vld1v4si ((const __builtin_neon_si *) __a);
-}
+#define vld1q_u32(__a) \
+  (uint32x4_t)__builtin_neon_vld1v4si (__a);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vld1q_u64 (const uint64_t * __a)
-{
-  return (uint64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a);
-}
+#define vld1q_u64(__a) \
+  (uint64x2_t)__builtin_neon_vld1v2di (__a);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vld1q_p8 (const poly8_t * __a)
-{
-  return (poly8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a);
-}
+#define vld1q_p8(__a) \
+  (poly8x16_t)__builtin_neon_vld1v16qi (__a);
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vld1q_p16 (const poly16_t * __a)
-{
-  return (poly16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a);
-}
+#define vld1q_p16(__a) \
+  (poly16x8_t)__builtin_neon_vld1v8hi (__a);
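
[Usage sketch, not in the patch: vld1/vld1q load 64 or 128 bits of
consecutive elements from memory.  Function name is illustrative.]

#include <arm_neon.h>

/* Load four floats into a q register.  */
float32x4_t load4 (const float32_t *__p)
{
  return vld1q_f32 (__p);
}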
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vld1_lane_s8 (const int8_t * __a, int8x8_t __b, const int __c)
-{
-  return (int8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, __b, __c);
-}
+#define vld1_lane_s8(__a, __b, __c) \
+  (int8x8_t)__builtin_neon_vld1_lanev8qi (__a, __b, __c);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vld1_lane_s16 (const int16_t * __a, int16x4_t __b, const int __c)
-{
-  return (int16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, __b, __c);
-}
+#define vld1_lane_s16(__a, __b, __c) \
+  (int16x4_t)__builtin_neon_vld1_lanev4hi (__a, __b, __c);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vld1_lane_s32 (const int32_t * __a, int32x2_t __b, const int __c)
-{
-  return (int32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, __b, __c);
-}
+#define vld1_lane_s32(__a, __b, __c) \
+  (int32x2_t)__builtin_neon_vld1_lanev2si (__a, __b, __c);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vld1_lane_f32 (const float32_t * __a, float32x2_t __b, const int __c)
-{
-  return (float32x2_t)__builtin_neon_vld1_lanev2sf (__a, __b, __c);
-}
+#define vld1_lane_f32(__a, __b, __c) \
+  (float32x2_t)__builtin_neon_vld1_lanev2sf (__a, __b, __c);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vld1_lane_u8 (const uint8_t * __a, uint8x8_t __b, const int __c)
-{
-  return (uint8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, (int8x8_t) __b, __c);
-}
+#define vld1_lane_u8(__a, __b, __c) \
+  (uint8x8_t)__builtin_neon_vld1_lanev8qi (__a, __b, __c);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vld1_lane_u16 (const uint16_t * __a, uint16x4_t __b, const int __c)
-{
-  return (uint16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, (int16x4_t) __b, __c);
-}
+#define vld1_lane_u16(__a, __b, __c) \
+  (uint16x4_t)__builtin_neon_vld1_lanev4hi (__a, __b, __c);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vld1_lane_u32 (const uint32_t * __a, uint32x2_t __b, const int __c)
-{
-  return (uint32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, (int32x2_t) __b, __c);
-}
+#define vld1_lane_u32(__a, __b, __c) \
+  (uint32x2_t)__builtin_neon_vld1_lanev2si (__a, __b, __c);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vld1_lane_p8 (const poly8_t * __a, poly8x8_t __b, const int __c)
-{
-  return (poly8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, (int8x8_t) __b, __c);
-}
+#define vld1_lane_p8(__a, __b, __c) \
+  (poly8x8_t)__builtin_neon_vld1_lanev8qi (__a, __b, __c);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vld1_lane_p16 (const poly16_t * __a, poly16x4_t __b, const int __c)
-{
-  return (poly16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, (int16x4_t) __b, __c);
-}
+#define vld1_lane_p16(__a, __b, __c) \
+  (poly16x4_t)__builtin_neon_vld1_lanev4hi (__a, __b, __c);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vld1_lane_s64 (const int64_t * __a, int64x1_t __b, const int __c)
-{
-  return (int64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, __b, __c);
-}
+#define vld1_lane_s64(__a, __b, __c) \
+  (int64x1_t)__builtin_neon_vld1_lanedi (__a, __b, __c);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vld1_lane_u64 (const uint64_t * __a, uint64x1_t __b, const int __c)
-{
-  return (uint64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, (int64x1_t) __b, __c);
-}
+#define vld1_lane_u64(__a, __b, __c) \
+  (uint64x1_t)__builtin_neon_vld1_lanedi (__a, __b, __c);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vld1q_lane_s8 (const int8_t * __a, int8x16_t __b, const int __c)
-{
-  return (int8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, __b, __c);
-}
+#define vld1q_lane_s8(__a, __b, __c) \
+  (int8x16_t)__builtin_neon_vld1_lanev16qi (__a, __b, __c);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vld1q_lane_s16 (const int16_t * __a, int16x8_t __b, const int __c)
-{
-  return (int16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, __b, __c);
-}
+#define vld1q_lane_s16(__a, __b, __c) \
+  (int16x8_t)__builtin_neon_vld1_lanev8hi (__a, __b, __c);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vld1q_lane_s32 (const int32_t * __a, int32x4_t __b, const int __c)
-{
-  return (int32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, __b, __c);
-}
+#define vld1q_lane_s32(__a, __b, __c) \
+  (int32x4_t)__builtin_neon_vld1_lanev4si (__a, __b, __c);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vld1q_lane_f32 (const float32_t * __a, float32x4_t __b, const int __c)
-{
-  return (float32x4_t)__builtin_neon_vld1_lanev4sf (__a, __b, __c);
-}
+#define vld1q_lane_f32(__a, __b, __c) \
+  (float32x4_t)__builtin_neon_vld1_lanev4sf (__a, __b, __c);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vld1q_lane_u8 (const uint8_t * __a, uint8x16_t __b, const int __c)
-{
-  return (uint8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, (int8x16_t) __b, __c);
-}
+#define vld1q_lane_u8(__a, __b, __c) \
+  (uint8x16_t)__builtin_neon_vld1_lanev16qi (__a, __b, __c);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vld1q_lane_u16 (const uint16_t * __a, uint16x8_t __b, const int __c)
-{
-  return (uint16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, (int16x8_t) __b, __c);
-}
+#define vld1q_lane_u16(__a, __b, __c) \
+  (uint16x8_t)__builtin_neon_vld1_lanev8hi (__a, __b, __c);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vld1q_lane_u32 (const uint32_t * __a, uint32x4_t __b, const int __c)
-{
-  return (uint32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, (int32x4_t) __b, __c);
-}
+#define vld1q_lane_u32(__a, __b, __c) \
+  (uint32x4_t)__builtin_neon_vld1_lanev4si (__a, __b, __c);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vld1q_lane_p8 (const poly8_t * __a, poly8x16_t __b, const int __c)
-{
-  return (poly8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, (int8x16_t) __b, __c);
-}
+#define vld1q_lane_p8(__a, __b, __c) \
+  (poly8x16_t)__builtin_neon_vld1_lanev16qi (__a, __b, __c);
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vld1q_lane_p16 (const poly16_t * __a, poly16x8_t __b, const int __c)
-{
-  return (poly16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, (int16x8_t) __b, __c);
-}
+#define vld1q_lane_p16(__a, __b, __c) \
+  (poly16x8_t)__builtin_neon_vld1_lanev8hi (__a, __b, __c);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vld1q_lane_s64 (const int64_t * __a, int64x2_t __b, const int __c)
-{
-  return (int64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, __b, __c);
-}
+#define vld1q_lane_s64(__a, __b, __c) \
+  (int64x2_t)__builtin_neon_vld1_lanev2di (__a, __b, __c);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vld1q_lane_u64 (const uint64_t * __a, uint64x2_t __b, const int __c)
-{
-  return (uint64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, (int64x2_t) __b, __c);
-}
+#define vld1q_lane_u64(__a, __b, __c) \
+  (uint64x2_t)__builtin_neon_vld1_lanev2di (__a, __b, __c);
 
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vld1_dup_s8 (const int8_t * __a)
-{
-  return (int8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a);
-}
+#define vld1_dup_s8(__a) \
+  (int8x8_t)__builtin_neon_vld1_dupv8qi (__a);
 
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vld1_dup_s16 (const int16_t * __a)
-{
-  return (int16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a);
-}
+#define vld1_dup_s16(__a) \
+  (int16x4_t)__builtin_neon_vld1_dupv4hi (__a);
 
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vld1_dup_s32 (const int32_t * __a)
-{
-  return (int32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a);
-}
+#define vld1_dup_s32(__a) \
+  (int32x2_t)__builtin_neon_vld1_dupv2si (__a);
 
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vld1_dup_f32 (const float32_t * __a)
-{
-  return (float32x2_t)__builtin_neon_vld1_dupv2sf (__a);
-}
+#define vld1_dup_f32(__a) \
+  (float32x2_t)__builtin_neon_vld1_dupv2sf (__a);
 
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vld1_dup_u8 (const uint8_t * __a)
-{
-  return (uint8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a);
-}
+#define vld1_dup_u8(__a) \
+  (uint8x8_t)__builtin_neon_vld1_dupv8qi (__a);
 
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vld1_dup_u16 (const uint16_t * __a)
-{
-  return (uint16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a);
-}
+#define vld1_dup_u16(__a) \
+  (uint16x4_t)__builtin_neon_vld1_dupv4hi (__a);
 
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vld1_dup_u32 (const uint32_t * __a)
-{
-  return (uint32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a);
-}
+#define vld1_dup_u32(__a) \
+  (uint32x2_t)__builtin_neon_vld1_dupv2si (__a);
 
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vld1_dup_p8 (const poly8_t * __a)
-{
-  return (poly8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a);
-}
+#define vld1_dup_p8(__a) \
+  (poly8x8_t)__builtin_neon_vld1_dupv8qi (__a);
 
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vld1_dup_p16 (const poly16_t * __a)
-{
-  return (poly16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a);
-}
+#define vld1_dup_p16(__a) \
+  (poly16x4_t)__builtin_neon_vld1_dupv4hi (__a);
 
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vld1_dup_s64 (const int64_t * __a)
-{
-  return (int64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a);
-}
+#define vld1_dup_s64(__a) \
+  (int64x1_t)__builtin_neon_vld1_dupdi (__a);
 
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vld1_dup_u64 (const uint64_t * __a)
-{
-  return (uint64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a);
-}
+#define vld1_dup_u64(__a) \
+  (uint64x1_t)__builtin_neon_vld1_dupdi (__a);
 
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vld1q_dup_s8 (const int8_t * __a)
-{
-  return (int8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a);
-}
+#define vld1q_dup_s8(__a) \
+  (int8x16_t)__builtin_neon_vld1_dupv16qi (__a);
 
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vld1q_dup_s16 (const int16_t * __a)
-{
-  return (int16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a);
-}
+#define vld1q_dup_s16(__a) \
+  (int16x8_t)__builtin_neon_vld1_dupv8hi (__a);
 
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vld1q_dup_s32 (const int32_t * __a)
-{
-  return (int32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a);
-}
+#define vld1q_dup_s32(__a) \
+  (int32x4_t)__builtin_neon_vld1_dupv4si (__a);
 
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vld1q_dup_f32 (const float32_t * __a)
-{
-  return (float32x4_t)__builtin_neon_vld1_dupv4sf (__a);
-}
+#define vld1q_dup_f32(__a) \
+  (float32x4_t)__builtin_neon_vld1_dupv4sf (__a);
 
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vld1q_dup_u8 (const uint8_t * __a)
-{
-  return (uint8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a);
-}
+#define vld1q_dup_u8(__a) \
+  (uint8x16_t)__builtin_neon_vld1_dupv16qi (__a);
 
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vld1q_dup_u16 (const uint16_t * __a)
-{
-  return (uint16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a);
-}
+#define vld1q_dup_u16(__a) \
+  (uint16x8_t)__builtin_neon_vld1_dupv8hi (__a);
 
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vld1q_dup_u32 (const uint32_t * __a)
-{
-  return (uint32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a);
-}
+#define vld1q_dup_u32(__a) \
+  (uint32x4_t)__builtin_neon_vld1_dupv4si (__a);
 
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vld1q_dup_p8 (const poly8_t * __a)
-{
-  return (poly8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a);
-}
+#define vld1q_dup_p8(__a) \
+  (poly8x16_t)__builtin_neon_vld1_dupv16qi (__a);
 
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vld1q_dup_p16 (const poly16_t * __a)
-{
-  return (poly16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a);
-}
+#define vld1q_dup_p16(__a) \
+  (poly16x8_t)__builtin_neon_vld1_dupv8hi (__a);
 
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vld1q_dup_s64 (const int64_t * __a)
-{
-  return (int64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a);
-}
+#define vld1q_dup_s64(__a) \
+  (int64x2_t)__builtin_neon_vld1_dupv2di (__a);
 
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vld1q_dup_u64 (const uint64_t * __a)
-{
-  return (uint64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a);
-}
+#define vld1q_dup_u64(__a) \
+  (uint64x2_t)__builtin_neon_vld1_dupv2di (__a);
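
[Usage sketch, not in the patch: vld1_dup broadcasts one element to
every lane and vld1_lane replaces a single lane; the lane number must
be a constant expression.  Function name is illustrative.]

#include <arm_neon.h>

/* Broadcast *__p to all four lanes, then overwrite lane 1 with *__q.  */
int16x4_t splat_then_patch (const int16_t *__p, const int16_t *__q)
{
  int16x4_t __v = vld1_dup_s16 (__p);
  return vld1_lane_s16 (__q, __v, 1);
}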
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_s8 (int8_t * __a, int8x8_t __b)
-{
-  __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, __b);
-}
+#define vst1_s8(__a, __b) \
+  __builtin_neon_vst1v8qi (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_s16 (int16_t * __a, int16x4_t __b)
-{
-  __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, __b);
-}
+#define vst1_s16(__a, __b) \
+  __builtin_neon_vst1v4hi (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_s32 (int32_t * __a, int32x2_t __b)
-{
-  __builtin_neon_vst1v2si ((__builtin_neon_si *) __a, __b);
-}
+#define vst1_s32(__a, __b) \
+  __builtin_neon_vst1v2si (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_s64 (int64_t * __a, int64x1_t __b)
-{
-  __builtin_neon_vst1di ((__builtin_neon_di *) __a, __b);
-}
+#define vst1_s64(__a, __b) \
+  __builtin_neon_vst1di (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_f32 (float32_t * __a, float32x2_t __b)
-{
+#define vst1_f32(__a, __b) \
   __builtin_neon_vst1v2sf (__a, __b);
-}
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_u8 (uint8_t * __a, uint8x8_t __b)
-{
-  __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b);
-}
+#define vst1_u8(__a, __b) \
+  __builtin_neon_vst1v8qi (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_u16 (uint16_t * __a, uint16x4_t __b)
-{
-  __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b);
-}
+#define vst1_u16(__a, __b) \
+  __builtin_neon_vst1v4hi (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_u32 (uint32_t * __a, uint32x2_t __b)
-{
-  __builtin_neon_vst1v2si ((__builtin_neon_si *) __a, (int32x2_t) __b);
-}
+#define vst1_u32(__a, __b) \
+  __builtin_neon_vst1v2si (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_u64 (uint64_t * __a, uint64x1_t __b)
-{
-  __builtin_neon_vst1di ((__builtin_neon_di *) __a, (int64x1_t) __b);
-}
+#define vst1_u64(__a, __b) \
+  __builtin_neon_vst1di (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_p8 (poly8_t * __a, poly8x8_t __b)
-{
-  __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b);
-}
+#define vst1_p8(__a, __b) \
+  __builtin_neon_vst1v8qi (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_p16 (poly16_t * __a, poly16x4_t __b)
-{
-  __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b);
-}
+#define vst1_p16(__a, __b) \
+  __builtin_neon_vst1v4hi (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_s8 (int8_t * __a, int8x16_t __b)
-{
-  __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, __b);
-}
+#define vst1q_s8(__a, __b) \
+  __builtin_neon_vst1v16qi (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_s16 (int16_t * __a, int16x8_t __b)
-{
-  __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, __b);
-}
+#define vst1q_s16(__a, __b) \
+  __builtin_neon_vst1v8hi (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_s32 (int32_t * __a, int32x4_t __b)
-{
-  __builtin_neon_vst1v4si ((__builtin_neon_si *) __a, __b);
-}
+#define vst1q_s32(__a, __b) \
+  __builtin_neon_vst1v4si (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_s64 (int64_t * __a, int64x2_t __b)
-{
-  __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, __b);
-}
+#define vst1q_s64(__a, __b) \
+  __builtin_neon_vst1v2di (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_f32 (float32_t * __a, float32x4_t __b)
-{
+#define vst1q_f32(__a, __b) \
   __builtin_neon_vst1v4sf (__a, __b);
-}
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_u8 (uint8_t * __a, uint8x16_t __b)
-{
-  __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b);
-}
+#define vst1q_u8(__a, __b) \
+  __builtin_neon_vst1v16qi (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_u16 (uint16_t * __a, uint16x8_t __b)
-{
-  __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b);
-}
+#define vst1q_u16(__a, __b) \
+  __builtin_neon_vst1v8hi (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_u32 (uint32_t * __a, uint32x4_t __b)
-{
-  __builtin_neon_vst1v4si ((__builtin_neon_si *) __a, (int32x4_t) __b);
-}
+#define vst1q_u32(__a, __b) \
+  __builtin_neon_vst1v4si (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_u64 (uint64_t * __a, uint64x2_t __b)
-{
-  __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, (int64x2_t) __b);
-}
+#define vst1q_u64(__a, __b) \
+  __builtin_neon_vst1v2di (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_p8 (poly8_t * __a, poly8x16_t __b)
-{
-  __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b);
-}
+#define vst1q_p8(__a, __b) \
+  __builtin_neon_vst1v16qi (__a, __b);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_p16 (poly16_t * __a, poly16x8_t __b)
-{
-  __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b);
-}
+#define vst1q_p16(__a, __b) \
+  __builtin_neon_vst1v8hi (__a, __b);
 
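The store wrappers expand the same way, and the retained semicolon has one syntactic consequence worth knowing. A sketch (hypothetical call site, not part of the patch):

    if (flag)
      vst1_u8 (out, vec);   /* expands to: __builtin_neon_vst1v8qi (out, vec);; */
    else                    /* error: the macro's own ';' already closed the
                               if body, so this 'else' has no matching 'if' */
      vst1_u8 (out2, vec);

The usual way to make a statement-like macro safe in unbraced if/else is the do { ... } while (0) idiom; these generated wrappers do not use it, so such call sites need braces.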
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_lane_s8 (int8_t * __a, int8x8_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, __b, __c);
-}
+#define vst1_lane_s8(__a, __b, __c) \
+  __builtin_neon_vst1_lanev8qi (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_lane_s16 (int16_t * __a, int16x4_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, __b, __c);
-}
+#define vst1_lane_s16(__a, __b, __c) \
+  __builtin_neon_vst1_lanev4hi (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_lane_s32 (int32_t * __a, int32x2_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev2si ((__builtin_neon_si *) __a, __b, __c);
-}
+#define vst1_lane_s32(__a, __b, __c) \
+  __builtin_neon_vst1_lanev2si (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_lane_f32 (float32_t * __a, float32x2_t __b, const int __c)
-{
+#define vst1_lane_f32(__a, __b, __c) \
   __builtin_neon_vst1_lanev2sf (__a, __b, __c);
-}
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_lane_u8 (uint8_t * __a, uint8x8_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b, __c);
-}
+#define vst1_lane_u8(__a, __b, __c) \
+  __builtin_neon_vst1_lanev8qi (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_lane_u16 (uint16_t * __a, uint16x4_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b, __c);
-}
+#define vst1_lane_u16(__a, __b, __c) \
+  __builtin_neon_vst1_lanev4hi (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_lane_u32 (uint32_t * __a, uint32x2_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev2si ((__builtin_neon_si *) __a, (int32x2_t) __b, __c);
-}
+#define vst1_lane_u32(__a, __b, __c) \
+  __builtin_neon_vst1_lanev2si (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_lane_p8 (poly8_t * __a, poly8x8_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b, __c);
-}
+#define vst1_lane_p8(__a, __b, __c) \
+  __builtin_neon_vst1_lanev8qi (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_lane_p16 (poly16_t * __a, poly16x4_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b, __c);
-}
+#define vst1_lane_p16(__a, __b, __c) \
+  __builtin_neon_vst1_lanev4hi (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_lane_s64 (int64_t * __a, int64x1_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, __b, __c);
-}
+#define vst1_lane_s64(__a, __b, __c) \
+  __builtin_neon_vst1_lanedi (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1_lane_u64 (uint64_t * __a, uint64x1_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, (int64x1_t) __b, __c);
-}
+#define vst1_lane_u64(__a, __b, __c) \
+  __builtin_neon_vst1_lanedi (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_lane_s8 (int8_t * __a, int8x16_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, __b, __c);
-}
+#define vst1q_lane_s8(__a, __b, __c) \
+  __builtin_neon_vst1_lanev16qi (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_lane_s16 (int16_t * __a, int16x8_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, __b, __c);
-}
+#define vst1q_lane_s16(__a, __b, __c) \
+  __builtin_neon_vst1_lanev8hi (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_lane_s32 (int32_t * __a, int32x4_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev4si ((__builtin_neon_si *) __a, __b, __c);
-}
+#define vst1q_lane_s32(__a, __b, __c) \
+  __builtin_neon_vst1_lanev4si (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_lane_f32 (float32_t * __a, float32x4_t __b, const int __c)
-{
+#define vst1q_lane_f32(__a, __b, __c) \
   __builtin_neon_vst1_lanev4sf (__a, __b, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_lane_u8 (uint8_t * __a, uint8x16_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_lane_u16 (uint16_t * __a, uint16x8_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_lane_u32 (uint32_t * __a, uint32x4_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev4si ((__builtin_neon_si *) __a, (int32x4_t) __b, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_lane_p8 (poly8_t * __a, poly8x16_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_lane_p16 (poly16_t * __a, poly16x8_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_lane_s64 (int64_t * __a, int64x2_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, __b, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst1q_lane_u64 (uint64_t * __a, uint64x2_t __b, const int __c)
-{
-  __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, (int64x2_t) __b, __c);
-}
-
-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-vld2_s8 (const int8_t * __a)
-{
-  union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-vld2_s16 (const int16_t * __a)
-{
-  union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-vld2_s32 (const int32_t * __a)
-{
-  union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v2si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-vld2_f32 (const float32_t * __a)
-{
-  union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v2sf (__a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-vld2_u8 (const uint8_t * __a)
-{
-  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-vld2_u16 (const uint16_t * __a)
-{
-  union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-vld2_u32 (const uint32_t * __a)
-{
-  union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v2si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-vld2_p8 (const poly8_t * __a)
-{
-  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-vld2_p16 (const poly16_t * __a)
-{
-  union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
 
-__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
-vld2_s64 (const int64_t * __a)
-{
-  union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2di ((const __builtin_neon_di *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
-vld2_u64 (const uint64_t * __a)
-{
-  union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2di ((const __builtin_neon_di *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
-vld2q_s8 (const int8_t * __a)
-{
-  union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v16qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
-vld2q_s16 (const int16_t * __a)
-{
-  union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v8hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
-vld2q_s32 (const int32_t * __a)
-{
-  union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v4si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
-vld2q_f32 (const float32_t * __a)
-{
-  union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v4sf (__a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
-vld2q_u8 (const uint8_t * __a)
-{
-  union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v16qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
-vld2q_u16 (const uint16_t * __a)
-{
-  union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v8hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
-vld2q_u32 (const uint32_t * __a)
-{
-  union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v4si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
-vld2q_p8 (const poly8_t * __a)
-{
-  union { poly8x16x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v16qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
-vld2q_p16 (const poly16_t * __a)
-{
-  union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2v8hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-vld2_lane_s8 (const int8_t * __a, int8x8x2_t __b, const int __c)
-{
-  union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-vld2_lane_s16 (const int16_t * __a, int16x4x2_t __b, const int __c)
-{
-  union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-vld2_lane_s32 (const int32_t * __a, int32x2x2_t __b, const int __c)
-{
-  union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-vld2_lane_f32 (const float32_t * __a, float32x2x2_t __b, const int __c)
-{
-  union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_lanev2sf (__a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-vld2_lane_u8 (const uint8_t * __a, uint8x8x2_t __b, const int __c)
-{
-  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-vld2_lane_u16 (const uint16_t * __a, uint16x4x2_t __b, const int __c)
-{
-  union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-vld2_lane_u32 (const uint32_t * __a, uint32x2x2_t __b, const int __c)
-{
-  union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-vld2_lane_p8 (const poly8_t * __a, poly8x8x2_t __b, const int __c)
-{
-  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-vld2_lane_p16 (const poly16_t * __a, poly16x4x2_t __b, const int __c)
-{
-  union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
-vld2q_lane_s16 (const int16_t * __a, int16x8x2_t __b, const int __c)
-{
-  union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
-vld2q_lane_s32 (const int32_t * __a, int32x4x2_t __b, const int __c)
-{
-  union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
-vld2q_lane_f32 (const float32_t * __a, float32x4x2_t __b, const int __c)
-{
-  union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_lanev4sf (__a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
-vld2q_lane_u16 (const uint16_t * __a, uint16x8x2_t __b, const int __c)
-{
-  union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
-vld2q_lane_u32 (const uint32_t * __a, uint32x4x2_t __b, const int __c)
-{
-  union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
-vld2q_lane_p16 (const poly16_t * __a, poly16x8x2_t __b, const int __c)
-{
-  union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
-vld2_dup_s8 (const int8_t * __a)
-{
-  union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_dupv8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
-vld2_dup_s16 (const int16_t * __a)
-{
-  union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_dupv4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
-vld2_dup_s32 (const int32_t * __a)
-{
-  union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_dupv2si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
-vld2_dup_f32 (const float32_t * __a)
-{
-  union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_dupv2sf (__a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
-vld2_dup_u8 (const uint8_t * __a)
-{
-  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_dupv8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
-vld2_dup_u16 (const uint16_t * __a)
-{
-  union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_dupv4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
-vld2_dup_u32 (const uint32_t * __a)
-{
-  union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_dupv2si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
-vld2_dup_p8 (const poly8_t * __a)
-{
-  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_dupv8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
-vld2_dup_p16 (const poly16_t * __a)
-{
-  union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_dupv4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
-vld2_dup_s64 (const int64_t * __a)
-{
-  union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_dupdi ((const __builtin_neon_di *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
-vld2_dup_u64 (const uint64_t * __a)
-{
-  union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv;
-  __rv.__o = __builtin_neon_vld2_dupdi ((const __builtin_neon_di *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_s8 (int8_t * __a, int8x8x2_t __b)
-{
-  union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_s16 (int16_t * __a, int16x4x2_t __b)
-{
-  union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
+#define vst1q_lane_u8(__a, __b, __c) \
+  __builtin_neon_vst1_lanev16qi (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_s32 (int32_t * __a, int32x2x2_t __b)
-{
-  union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2v2si ((__builtin_neon_si *) __a, __bu.__o);
-}
+#define vst1q_lane_u16(__a, __b, __c) \
+  __builtin_neon_vst1_lanev8hi (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_f32 (float32_t * __a, float32x2x2_t __b)
-{
-  union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2v2sf (__a, __bu.__o);
-}
+#define vst1q_lane_u32(__a, __b, __c) \
+  __builtin_neon_vst1_lanev4si (__a, __b, __c);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_u8 (uint8_t * __a, uint8x8x2_t __b)
-{
-  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
+#define vst1q_lane_p8(__a, __b, __c) \
+  __builtin_neon_vst1_lanev16qi (__a, __b, __c);
+
+#define vst1q_lane_p16(__a, __b, __c) \
+  __builtin_neon_vst1_lanev8hi (__a, __b, __c);
+
+#define vst1q_lane_s64(__a, __b, __c) \
+  __builtin_neon_vst1_lanev2di (__a, __b, __c);
+
+#define vst1q_lane_u64(__a, __b, __c) \
+  __builtin_neon_vst1_lanev2di (__a, __b, __c);
+
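The vld2 and later multi-vector loads that follow cannot be plain one-line macros, because the builtin returns all of the registers in a single wide integer mode (__builtin_neon_ti here) that has to be reinterpreted as the user-visible NEON struct type. The generated code does this with a GNU statement expression plus a union pun. Below is a minimal stand-alone analogue of that pattern; pair_t, carrier_t, fake_vld2, and my_vld2 are all hypothetical stand-in names, since the real NEON builtin types exist only when compiling for NEON:

    /* Stand-alone analogue of the vld2-style macros; compile with GCC or
       Clang, since statement expressions are a GNU extension.  */
    #include <stdio.h>

    typedef struct { int __val[2]; } pair_t;     /* stand-in for int8x8x2_t */
    typedef struct { int __raw[2]; } carrier_t;  /* stand-in for __builtin_neon_ti */

    /* Stand-in for __builtin_neon_vld2v8qi: load two adjacent values.  */
    static carrier_t
    fake_vld2 (const int *p)
    {
      carrier_t c = { { p[0], p[1] } };
      return c;
    }

    /* Same shape as the generated macros: the union reinterprets the
       builtin's wide result as the user-visible struct, and the last
       expression of the ({ ... }) block becomes the macro's value.  */
    #define my_vld2(__a) \
      ({ \
         union { pair_t __i; carrier_t __o; } __rv; \
         __rv.__o = fake_vld2 (__a); \
         __rv.__i; \
       })

    int
    main (void)
    {
      int buf[2] = { 1, 2 };
      pair_t v = my_vld2 (buf);
      printf ("%d %d\n", v.__val[0], v.__val[1]);
      return 0;
    }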
+#define vld2_s8(__a) \
+  ({ \
+     union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_s16(__a) \
+  ({ \
+     union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_s32(__a) \
+  ({ \
+     union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v2si (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_f32(__a) \
+  ({ \
+     union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v2sf (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_u8(__a) \
+  ({ \
+     union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_u16(__a) \
+  ({ \
+     union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_u32(__a) \
+  ({ \
+     union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v2si (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_p8(__a) \
+  ({ \
+     union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_p16(__a) \
+  ({ \
+     union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_s64(__a) \
+  ({ \
+     union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2di (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_u64(__a) \
+  ({ \
+     union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2di (__a); \
+     __rv.__i; \
+   })
+
+#define vld2q_s8(__a) \
+  ({ \
+     union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v16qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2q_s16(__a) \
+  ({ \
+     union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v8hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2q_s32(__a) \
+  ({ \
+     union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v4si (__a); \
+     __rv.__i; \
+   })
+
+#define vld2q_f32(__a) \
+  ({ \
+     union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v4sf (__a); \
+     __rv.__i; \
+   })
+
+#define vld2q_u8(__a) \
+  ({ \
+     union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v16qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2q_u16(__a) \
+  ({ \
+     union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v8hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2q_u32(__a) \
+  ({ \
+     union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v4si (__a); \
+     __rv.__i; \
+   })
+
+#define vld2q_p8(__a) \
+  ({ \
+     union { poly8x16x2_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v16qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2q_p16(__a) \
+  ({ \
+     union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2v8hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_lane_s8(__a, __b, __c) \
+  ({ \
+     union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_lanev8qi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld2_lane_s16(__a, __b, __c) \
+  ({ \
+     union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_lanev4hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld2_lane_s32(__a, __b, __c) \
+  ({ \
+     union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_lanev2si (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld2_lane_f32(__a, __b, __c) \
+  ({ \
+     union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_lanev2sf (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld2_lane_u8(__a, __b, __c) \
+  ({ \
+     union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_lanev8qi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld2_lane_u16(__a, __b, __c) \
+  ({ \
+     union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_lanev4hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld2_lane_u32(__a, __b, __c) \
+  ({ \
+     union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_lanev2si (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld2_lane_p8(__a, __b, __c) \
+  ({ \
+     union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_lanev8qi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld2_lane_p16(__a, __b, __c) \
+  ({ \
+     union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_lanev4hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld2q_lane_s16(__a, __b, __c) \
+  ({ \
+     union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_lanev8hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld2q_lane_s32(__a, __b, __c) \
+  ({ \
+     union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_lanev4si (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld2q_lane_f32(__a, __b, __c) \
+  ({ \
+     union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_lanev4sf (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld2q_lane_u16(__a, __b, __c) \
+  ({ \
+     union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_lanev8hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld2q_lane_u32(__a, __b, __c) \
+  ({ \
+     union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_lanev4si (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld2q_lane_p16(__a, __b, __c) \
+  ({ \
+     union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_lanev8hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld2_dup_s8(__a) \
+  ({ \
+     union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_dupv8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_dup_s16(__a) \
+  ({ \
+     union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_dupv4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_dup_s32(__a) \
+  ({ \
+     union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_dupv2si (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_dup_f32(__a) \
+  ({ \
+     union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_dupv2sf (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_dup_u8(__a) \
+  ({ \
+     union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_dupv8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_dup_u16(__a) \
+  ({ \
+     union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_dupv4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_dup_u32(__a) \
+  ({ \
+     union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_dupv2si (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_dup_p8(__a) \
+  ({ \
+     union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_dupv8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_dup_p16(__a) \
+  ({ \
+     union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_dupv4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_dup_s64(__a) \
+  ({ \
+     union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_dupdi (__a); \
+     __rv.__i; \
+   })
+
+#define vld2_dup_u64(__a) \
+  ({ \
+     union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv; \
+     __rv.__o = __builtin_neon_vld2_dupdi (__a); \
+     __rv.__i; \
+   })
+
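The vst2 and later multi-vector stores that follow use the mirror image of that load pattern: the user-visible struct is placed into the union first, and the builtin is handed the wide carrier member. A brief stand-alone analogue, reusing the hypothetical stand-in types (pair_t, carrier_t) from the load sketch above:

    /* Mirror of the load analogue: pack the struct into the union,
       then pass the carrier member to the (stand-in) builtin.  */
    static void
    fake_vst2 (int *p, carrier_t c)
    {
      p[0] = c.__raw[0];
      p[1] = c.__raw[1];
    }

    #define my_vst2(__a, __b) \
      ({ \
         union { pair_t __i; carrier_t __o; } __bu = { __b }; \
         fake_vst2 (__a, __bu.__o); \
       })

    /* Usage: pair_t v = { { 7, 9 } }; int out[2]; my_vst2 (out, v); */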
+#define vst2_s8(__a, __b) \
+  ({ \
+     union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2v8qi (__a, __bu.__o); \
+   })
+
+#define vst2_s16(__a, __b) \
+  ({ \
+     union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2v4hi (__a, __bu.__o); \
+   })
+
+#define vst2_s32(__a, __b) \
+  ({ \
+     union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2v2si (__a, __bu.__o); \
+   })
+
+#define vst2_f32(__a, __b) \
+  ({ \
+     union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2v2sf (__a, __bu.__o); \
+   })
+
+#define vst2_u8(__a, __b) \
+  ({ \
+     union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2v8qi (__a, __bu.__o); \
+   })
+
+#define vst2_u16(__a, __b) \
+  ({ \
+     union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2v4hi (__a, __bu.__o); \
+   })
+
+#define vst2_u32(__a, __b) \
+  ({ \
+     union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2v2si (__a, __bu.__o); \
+   })
+
+#define vst2_p8(__a, __b) \
+  ({ \
+     union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2v8qi (__a, __bu.__o); \
+   })
+
+#define vst2_p16(__a, __b) \
+  ({ \
+     union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2v4hi (__a, __bu.__o); \
+   })
+
+#define vst2_s64(__a, __b) \
+  ({ \
+     union { int64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2di (__a, __bu.__o); \
+   })
+
+#define vst2_u64(__a, __b) \
+  ({ \
+     union { uint64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2di (__a, __bu.__o); \
+   })
+
+#define vst2q_s8(__a, __b) \
+  ({ \
+     union { int8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst2v16qi (__a, __bu.__o); \
+   })
+
+#define vst2q_s16(__a, __b) \
+  ({ \
+     union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst2v8hi (__a, __bu.__o); \
+   })
+
+#define vst2q_s32(__a, __b) \
+  ({ \
+     union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst2v4si (__a, __bu.__o); \
+   })
+
+#define vst2q_f32(__a, __b) \
+  ({ \
+     union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst2v4sf (__a, __bu.__o); \
+   })
+
+#define vst2q_u8(__a, __b) \
+  ({ \
+     union { uint8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst2v16qi (__a, __bu.__o); \
+   })
+
+#define vst2q_u16(__a, __b) \
+  ({ \
+     union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst2v8hi (__a, __bu.__o); \
+   })
+
+#define vst2q_u32(__a, __b) \
+  ({ \
+     union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst2v4si (__a, __bu.__o); \
+   })
+
+#define vst2q_p8(__a, __b) \
+  ({ \
+     union { poly8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst2v16qi (__a, __bu.__o); \
+   })
+
+#define vst2q_p16(__a, __b) \
+  ({ \
+     union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst2v8hi (__a, __bu.__o); \
+   })
+
+#define vst2_lane_s8(__a, __b, __c) \
+  ({ \
+     union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2_lanev8qi (__a, __bu.__o, __c); \
+   })
+
+#define vst2_lane_s16(__a, __b, __c) \
+  ({ \
+     union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2_lanev4hi (__a, __bu.__o, __c); \
+   })
+
+#define vst2_lane_s32(__a, __b, __c) \
+  ({ \
+     union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2_lanev2si (__a, __bu.__o, __c); \
+   })
+
+#define vst2_lane_f32(__a, __b, __c) \
+  ({ \
+     union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2_lanev2sf (__a, __bu.__o, __c); \
+   })
+
+#define vst2_lane_u8(__a, __b, __c) \
+  ({ \
+     union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2_lanev8qi (__a, __bu.__o, __c); \
+   })
+
+#define vst2_lane_u16(__a, __b, __c) \
+  ({ \
+     union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2_lanev4hi (__a, __bu.__o, __c); \
+   })
+
+#define vst2_lane_u32(__a, __b, __c) \
+  ({ \
+     union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2_lanev2si (__a, __bu.__o, __c); \
+   })
+
+#define vst2_lane_p8(__a, __b, __c) \
+  ({ \
+     union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2_lanev8qi (__a, __bu.__o, __c); \
+   })
+
+#define vst2_lane_p16(__a, __b, __c) \
+  ({ \
+     union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b }; \
+     __builtin_neon_vst2_lanev4hi (__a, __bu.__o, __c); \
+   })
+
+#define vst2q_lane_s16(__a, __b, __c) \
+  ({ \
+     union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst2_lanev8hi (__a, __bu.__o, __c); \
+   })
+
+#define vst2q_lane_s32(__a, __b, __c) \
+  ({ \
+     union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst2_lanev4si (__a, __bu.__o, __c); \
+   })
+
+#define vst2q_lane_f32(__a, __b, __c) \
+  ({ \
+     union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst2_lanev4sf (__a, __bu.__o, __c); \
+   })
+
+#define vst2q_lane_u16(__a, __b, __c) \
+  ({ \
+     union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst2_lanev8hi (__a, __bu.__o, __c); \
+   })
+
+#define vst2q_lane_u32(__a, __b, __c) \
+  ({ \
+     union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst2_lanev4si (__a, __bu.__o, __c); \
+   })
+
+#define vst2q_lane_p16(__a, __b, __c) \
+  ({ \
+     union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst2_lanev8hi (__a, __bu.__o, __c); \
+   })
+
+#define vld3_s8(__a) \
+  ({ \
+     union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_s16(__a) \
+  ({ \
+     union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_s32(__a) \
+  ({ \
+     union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v2si (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_f32(__a) \
+  ({ \
+     union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v2sf (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_u8(__a) \
+  ({ \
+     union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_u16(__a) \
+  ({ \
+     union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_u32(__a) \
+  ({ \
+     union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v2si (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_p8(__a) \
+  ({ \
+     union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_p16(__a) \
+  ({ \
+     union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_s64(__a) \
+  ({ \
+     union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3di (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_u64(__a) \
+  ({ \
+     union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3di (__a); \
+     __rv.__i; \
+   })
+
+#define vld3q_s8(__a) \
+  ({ \
+     union { int8x16x3_t __i; __builtin_neon_ci __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v16qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3q_s16(__a) \
+  ({ \
+     union { int16x8x3_t __i; __builtin_neon_ci __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v8hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3q_s32(__a) \
+  ({ \
+     union { int32x4x3_t __i; __builtin_neon_ci __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v4si (__a); \
+     __rv.__i; \
+   })
+
+#define vld3q_f32(__a) \
+  ({ \
+     union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v4sf (__a); \
+     __rv.__i; \
+   })
+
+#define vld3q_u8(__a) \
+  ({ \
+     union { uint8x16x3_t __i; __builtin_neon_ci __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v16qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3q_u16(__a) \
+  ({ \
+     union { uint16x8x3_t __i; __builtin_neon_ci __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v8hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3q_u32(__a) \
+  ({ \
+     union { uint32x4x3_t __i; __builtin_neon_ci __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v4si (__a); \
+     __rv.__i; \
+   })
+
+#define vld3q_p8(__a) \
+  ({ \
+     union { poly8x16x3_t __i; __builtin_neon_ci __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v16qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3q_p16(__a) \
+  ({ \
+     union { poly16x8x3_t __i; __builtin_neon_ci __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3v8hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_lane_s8(__a, __b, __c) \
+  ({ \
+     union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_lanev8qi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld3_lane_s16(__a, __b, __c) \
+  ({ \
+     union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_lanev4hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld3_lane_s32(__a, __b, __c) \
+  ({ \
+     union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_lanev2si (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld3_lane_f32(__a, __b, __c) \
+  ({ \
+     union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_lanev2sf (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld3_lane_u8(__a, __b, __c) \
+  ({ \
+     union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_lanev8qi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld3_lane_u16(__a, __b, __c) \
+  ({ \
+     union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_lanev4hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld3_lane_u32(__a, __b, __c) \
+  ({ \
+     union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_lanev2si (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld3_lane_p8(__a, __b, __c) \
+  ({ \
+     union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_lanev8qi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld3_lane_p16(__a, __b, __c) \
+  ({ \
+     union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_lanev4hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld3q_lane_s16(__a, __b, __c) \
+  ({ \
+     union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     union { int16x8x3_t __i; __builtin_neon_ci __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_lanev8hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld3q_lane_s32(__a, __b, __c) \
+  ({ \
+     union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     union { int32x4x3_t __i; __builtin_neon_ci __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_lanev4si (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld3q_lane_f32(__a, __b, __c) \
+  ({ \
+     union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_lanev4sf (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld3q_lane_u16(__a, __b, __c) \
+  ({ \
+     union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     union { uint16x8x3_t __i; __builtin_neon_ci __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_lanev8hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld3q_lane_u32(__a, __b, __c) \
+  ({ \
+     union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     union { uint32x4x3_t __i; __builtin_neon_ci __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_lanev4si (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld3q_lane_p16(__a, __b, __c) \
+  ({ \
+     union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     union { poly16x8x3_t __i; __builtin_neon_ci __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_lanev8hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld3_dup_s8(__a) \
+  ({ \
+     union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_dupv8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_dup_s16(__a) \
+  ({ \
+     union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_dupv4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_dup_s32(__a) \
+  ({ \
+     union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_dupv2si (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_dup_f32(__a) \
+  ({ \
+     union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_dupv2sf (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_dup_u8(__a) \
+  ({ \
+     union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_dupv8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_dup_u16(__a) \
+  ({ \
+     union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_dupv4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_dup_u32(__a) \
+  ({ \
+     union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_dupv2si (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_dup_p8(__a) \
+  ({ \
+     union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_dupv8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_dup_p16(__a) \
+  ({ \
+     union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_dupv4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_dup_s64(__a) \
+  ({ \
+     union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_dupdi (__a); \
+     __rv.__i; \
+   })
+
+#define vld3_dup_u64(__a) \
+  ({ \
+     union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv; \
+     __rv.__o = __builtin_neon_vld3_dupdi (__a); \
+     __rv.__i; \
+   })
+
+#define vst3_s8(__a, __b) \
+  ({ \
+     union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3v8qi (__a, __bu.__o); \
+   })
+
+#define vst3_s16(__a, __b) \
+  ({ \
+     union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3v4hi (__a, __bu.__o); \
+   })
+
+#define vst3_s32(__a, __b) \
+  ({ \
+     union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3v2si (__a, __bu.__o); \
+   })
+
+#define vst3_f32(__a, __b) \
+  ({ \
+     union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3v2sf (__a, __bu.__o); \
+   })
+
+#define vst3_u8(__a, __b) \
+  ({ \
+     union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3v8qi (__a, __bu.__o); \
+   })
+
+#define vst3_u16(__a, __b) \
+  ({ \
+     union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3v4hi (__a, __bu.__o); \
+   })
+
+#define vst3_u32(__a, __b) \
+  ({ \
+     union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3v2si (__a, __bu.__o); \
+   })
+
+#define vst3_p8(__a, __b) \
+  ({ \
+     union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3v8qi (__a, __bu.__o); \
+   })
+
+#define vst3_p16(__a, __b) \
+  ({ \
+     union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3v4hi (__a, __bu.__o); \
+   })
+
+#define vst3_s64(__a, __b) \
+  ({ \
+     union { int64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3di (__a, __bu.__o); \
+   })
+
+#define vst3_u64(__a, __b) \
+  ({ \
+     union { uint64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3di (__a, __bu.__o); \
+   })
+
+#define vst3q_s8(__a, __b) \
+  ({ \
+     union { int8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     __builtin_neon_vst3v16qi (__a, __bu.__o); \
+   })
+
+#define vst3q_s16(__a, __b) \
+  ({ \
+     union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     __builtin_neon_vst3v8hi (__a, __bu.__o); \
+   })
+
+#define vst3q_s32(__a, __b) \
+  ({ \
+     union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     __builtin_neon_vst3v4si (__a, __bu.__o); \
+   })
+
+#define vst3q_f32(__a, __b) \
+  ({ \
+     union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     __builtin_neon_vst3v4sf (__a, __bu.__o); \
+   })
+
+#define vst3q_u8(__a, __b) \
+  ({ \
+     union { uint8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     __builtin_neon_vst3v16qi (__a, __bu.__o); \
+   })
+
+#define vst3q_u16(__a, __b) \
+  ({ \
+     union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     __builtin_neon_vst3v8hi (__a, __bu.__o); \
+   })
+
+#define vst3q_u32(__a, __b) \
+  ({ \
+     union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     __builtin_neon_vst3v4si (__a, __bu.__o); \
+   })
+
+#define vst3q_p8(__a, __b) \
+  ({ \
+     union { poly8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     __builtin_neon_vst3v16qi (__a, __bu.__o); \
+   })
+
+#define vst3q_p16(__a, __b) \
+  ({ \
+     union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     __builtin_neon_vst3v8hi (__a, __bu.__o); \
+   })
+
+#define vst3_lane_s8(__a, __b, __c) \
+  ({ \
+     union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3_lanev8qi (__a, __bu.__o, __c); \
+   })
+
+#define vst3_lane_s16(__a, __b, __c) \
+  ({ \
+     union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3_lanev4hi (__a, __bu.__o, __c); \
+   })
+
+#define vst3_lane_s32(__a, __b, __c) \
+  ({ \
+     union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3_lanev2si (__a, __bu.__o, __c); \
+   })
+
+#define vst3_lane_f32(__a, __b, __c) \
+  ({ \
+     union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3_lanev2sf (__a, __bu.__o, __c); \
+   })
+
+#define vst3_lane_u8(__a, __b, __c) \
+  ({ \
+     union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3_lanev8qi (__a, __bu.__o, __c); \
+   })
+
+#define vst3_lane_u16(__a, __b, __c) \
+  ({ \
+     union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3_lanev4hi (__a, __bu.__o, __c); \
+   })
+
+#define vst3_lane_u32(__a, __b, __c) \
+  ({ \
+     union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3_lanev2si (__a, __bu.__o, __c); \
+   })
+
+#define vst3_lane_p8(__a, __b, __c) \
+  ({ \
+     union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3_lanev8qi (__a, __bu.__o, __c); \
+   })
+
+#define vst3_lane_p16(__a, __b, __c) \
+  ({ \
+     union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b }; \
+     __builtin_neon_vst3_lanev4hi (__a, __bu.__o, __c); \
+   })
+
+#define vst3q_lane_s16(__a, __b, __c) \
+  ({ \
+     union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     __builtin_neon_vst3_lanev8hi (__a, __bu.__o, __c); \
+   })
+
+#define vst3q_lane_s32(__a, __b, __c) \
+  ({ \
+     union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     __builtin_neon_vst3_lanev4si (__a, __bu.__o, __c); \
+   })
+
+#define vst3q_lane_f32(__a, __b, __c) \
+  ({ \
+     union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     __builtin_neon_vst3_lanev4sf (__a, __bu.__o, __c); \
+   })
+
+#define vst3q_lane_u16(__a, __b, __c) \
+  ({ \
+     union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     __builtin_neon_vst3_lanev8hi (__a, __bu.__o, __c); \
+   })
+
+#define vst3q_lane_u32(__a, __b, __c) \
+  ({ \
+     union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     __builtin_neon_vst3_lanev4si (__a, __bu.__o, __c); \
+   })
+
+#define vst3q_lane_p16(__a, __b, __c) \
+  ({ \
+     union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b }; \
+     __builtin_neon_vst3_lanev8hi (__a, __bu.__o, __c); \
+   })
+
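Note on the pattern above: each vst3q/vst3q_lane macro funnels the three-vector
struct through an anonymous union into the opaque __builtin_neon_ci type, inside
a GNU statement expression ( ({ ... }) ) so the temporary union can live in
expression context. A minimal usage sketch, assuming a NEON-enabled ARMv7 build;
the function and variable names below are illustrative, not part of the patch:

  #include <arm_neon.h>

  /* Interleave three planes into dst as r0 g0 b0 r1 g1 b1 ...  */
  void store_rgb (float32_t *dst, float32x4_t r, float32x4_t g, float32x4_t b)
  {
    float32x4x3_t v = { { r, g, b } };  /* fills v.val[0..2] */
    vst3q_f32 (dst, v);                 /* expands to the macro above */
  }
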
+#define vld4_s8(__a) \
+  ({ \
+     union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_s16(__a) \
+  ({ \
+     union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_s32(__a) \
+  ({ \
+     union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v2si (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_f32(__a) \
+  ({ \
+     union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v2sf (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_u8(__a) \
+  ({ \
+     union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_u16(__a) \
+  ({ \
+     union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_u32(__a) \
+  ({ \
+     union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v2si (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_p8(__a) \
+  ({ \
+     union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_p16(__a) \
+  ({ \
+     union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_s64(__a) \
+  ({ \
+     union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4di (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_u64(__a) \
+  ({ \
+     union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4di (__a); \
+     __rv.__i; \
+   })
+
+#define vld4q_s8(__a) \
+  ({ \
+     union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v16qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4q_s16(__a) \
+  ({ \
+     union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v8hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4q_s32(__a) \
+  ({ \
+     union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v4si (__a); \
+     __rv.__i; \
+   })
+
+#define vld4q_f32(__a) \
+  ({ \
+     union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v4sf (__a); \
+     __rv.__i; \
+   })
+
+#define vld4q_u8(__a) \
+  ({ \
+     union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v16qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4q_u16(__a) \
+  ({ \
+     union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v8hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4q_u32(__a) \
+  ({ \
+     union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v4si (__a); \
+     __rv.__i; \
+   })
+
+#define vld4q_p8(__a) \
+  ({ \
+     union { poly8x16x4_t __i; __builtin_neon_xi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v16qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4q_p16(__a) \
+  ({ \
+     union { poly16x8x4_t __i; __builtin_neon_xi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4v8hi (__a); \
+     __rv.__i; \
+   })
+
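The vld4/vld4q loads run the same punning in reverse: the builtin returns an
opaque aggregate sized to the whole register list (__builtin_neon_oi for four
D registers, __builtin_neon_xi for four Q registers), and __rv converts it back
to the element-typed struct. A hedged call-site sketch; the helper name and the
RGBA reading of the data are illustrative only:

  #include <arm_neon.h>

  /* De-interleave 32 bytes of packed RGBA into four 8-lane vectors.  */
  uint8x8_t alpha_plane (const uint8_t *src)
  {
    uint8x8x4_t px = vld4_u8 (src);  /* val[0]=R ... val[3]=A for RGBA data */
    return px.val[3];
  }
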
+#define vld4_lane_s8(__a, __b, __c) \
+  ({ \
+     union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_lanev8qi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld4_lane_s16(__a, __b, __c) \
+  ({ \
+     union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_lanev4hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld4_lane_s32(__a, __b, __c) \
+  ({ \
+     union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_lanev2si (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld4_lane_f32(__a, __b, __c) \
+  ({ \
+     union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_lanev2sf (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld4_lane_u8(__a, __b, __c) \
+  ({ \
+     union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_lanev8qi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld4_lane_u16(__a, __b, __c) \
+  ({ \
+     union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_lanev4hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld4_lane_u32(__a, __b, __c) \
+  ({ \
+     union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_lanev2si (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld4_lane_p8(__a, __b, __c) \
+  ({ \
+     union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_lanev8qi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld4_lane_p16(__a, __b, __c) \
+  ({ \
+     union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_lanev4hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld4q_lane_s16(__a, __b, __c) \
+  ({ \
+     union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_lanev8hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld4q_lane_s32(__a, __b, __c) \
+  ({ \
+     union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_lanev4si (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld4q_lane_f32(__a, __b, __c) \
+  ({ \
+     union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_lanev4sf (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld4q_lane_u16(__a, __b, __c) \
+  ({ \
+     union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_lanev8hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld4q_lane_u32(__a, __b, __c) \
+  ({ \
+     union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_lanev4si (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
+#define vld4q_lane_p16(__a, __b, __c) \
+  ({ \
+     union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     union { poly16x8x4_t __i; __builtin_neon_xi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_lanev8hi (__a, __bu.__o, __c); \
+     __rv.__i; \
+   })
+
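The _lane loads both consume and produce the struct: __b supplies the lanes
that stay untouched, the builtin overwrites lane __c of each vector from
memory, and the merged result comes back through __rv. Since these are plain
macros, the lane index is substituted textually, and the builtins require it
to be a compile-time constant. Sketch, with hypothetical names:

  #include <arm_neon.h>

  /* Refresh only lane 0 of four float32x2 vectors from memory.  */
  float32x2x4_t reload_lane0 (const float32_t *src, float32x2x4_t cur)
  {
    return vld4_lane_f32 (src, cur, 0);
  }
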
+#define vld4_dup_s8(__a) \
+  ({ \
+     union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_dupv8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_dup_s16(__a) \
+  ({ \
+     union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_dupv4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_dup_s32(__a) \
+  ({ \
+     union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_dupv2si (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_dup_f32(__a) \
+  ({ \
+     union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_dupv2sf (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_dup_u8(__a) \
+  ({ \
+     union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_dupv8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_dup_u16(__a) \
+  ({ \
+     union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_dupv4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_dup_u32(__a) \
+  ({ \
+     union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_dupv2si (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_dup_p8(__a) \
+  ({ \
+     union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_dupv8qi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_dup_p16(__a) \
+  ({ \
+     union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_dupv4hi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_dup_s64(__a) \
+  ({ \
+     union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_dupdi (__a); \
+     __rv.__i; \
+   })
+
+#define vld4_dup_u64(__a) \
+  ({ \
+     union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv; \
+     __rv.__o = __builtin_neon_vld4_dupdi (__a); \
+     __rv.__i; \
+   })
+
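vld4_dup loads one element per destination register and splats it across every
lane; the s64/u64 variants degenerate to a plain four-element load, since those
vectors have only a single lane. For example (sketch, illustrative names):

  #include <arm_neon.h>

  /* Broadcast four consecutive coefficients, one per vector.  */
  int16x4x4_t splat_coeffs (const int16_t *c)
  {
    return vld4_dup_s16 (c);  /* val[i] = { c[i], c[i], c[i], c[i] } */
  }
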
+#define vst4_s8(__a, __b) \
+  ({ \
+     union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v8qi (__a, __bu.__o); \
+   })
+
+#define vst4_s16(__a, __b) \
+  ({ \
+     union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v4hi (__a, __bu.__o); \
+   })
+
+#define vst4_s32(__a, __b) \
+  ({ \
+     union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v2si (__a, __bu.__o); \
+   })
+
+#define vst4_f32(__a, __b) \
+  ({ \
+     union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v2sf (__a, __bu.__o); \
+   })
+
+#define vst4_u8(__a, __b) \
+  ({ \
+     union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v8qi (__a, __bu.__o); \
+   })
+
+#define vst4_u16(__a, __b) \
+  ({ \
+     union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v4hi (__a, __bu.__o); \
+   })
+
+#define vst4_u32(__a, __b) \
+  ({ \
+     union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v2si (__a, __bu.__o); \
+   })
+
+#define vst4_p8(__a, __b) \
+  ({ \
+     union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v8qi (__a, __bu.__o); \
+   })
+
+#define vst4_p16(__a, __b) \
+  ({ \
+     union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v4hi (__a, __bu.__o); \
+   })
+
+#define vst4_s64(__a, __b) \
+  ({ \
+     union { int64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4di (__a, __bu.__o); \
+   })
+
+#define vst4_u64(__a, __b) \
+  ({ \
+     union { uint64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4di (__a, __bu.__o); \
+   })
+
+#define vst4q_s8(__a, __b) \
+  ({ \
+     union { int8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v16qi (__a, __bu.__o); \
+   })
+
+#define vst4q_s16(__a, __b) \
+  ({ \
+     union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v8hi (__a, __bu.__o); \
+   })
+
+#define vst4q_s32(__a, __b) \
+  ({ \
+     union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v4si (__a, __bu.__o); \
+   })
+
+#define vst4q_f32(__a, __b) \
+  ({ \
+     union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v4sf (__a, __bu.__o); \
+   })
+
+#define vst4q_u8(__a, __b) \
+  ({ \
+     union { uint8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v16qi (__a, __bu.__o); \
+   })
+
+#define vst4q_u16(__a, __b) \
+  ({ \
+     union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v8hi (__a, __bu.__o); \
+   })
+
+#define vst4q_u32(__a, __b) \
+  ({ \
+     union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v4si (__a, __bu.__o); \
+   })
+
+#define vst4q_p8(__a, __b) \
+  ({ \
+     union { poly8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v16qi (__a, __bu.__o); \
+   })
+
+#define vst4q_p16(__a, __b) \
+  ({ \
+     union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     __builtin_neon_vst4v8hi (__a, __bu.__o); \
+   })
+
+#define vst4_lane_s8(__a, __b, __c) \
+  ({ \
+     union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4_lanev8qi (__a, __bu.__o, __c); \
+   })
+
+#define vst4_lane_s16(__a, __b, __c) \
+  ({ \
+     union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4_lanev4hi (__a, __bu.__o, __c); \
+   })
+
+#define vst4_lane_s32(__a, __b, __c) \
+  ({ \
+     union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4_lanev2si (__a, __bu.__o, __c); \
+   })
+
+#define vst4_lane_f32(__a, __b, __c) \
+  ({ \
+     union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4_lanev2sf (__a, __bu.__o, __c); \
+   })
+
+#define vst4_lane_u8(__a, __b, __c) \
+  ({ \
+     union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4_lanev8qi (__a, __bu.__o, __c); \
+   })
+
+#define vst4_lane_u16(__a, __b, __c) \
+  ({ \
+     union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4_lanev4hi (__a, __bu.__o, __c); \
+   })
+
+#define vst4_lane_u32(__a, __b, __c) \
+  ({ \
+     union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4_lanev2si (__a, __bu.__o, __c); \
+   })
+
+#define vst4_lane_p8(__a, __b, __c) \
+  ({ \
+     union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4_lanev8qi (__a, __bu.__o, __c); \
+   })
+
+#define vst4_lane_p16(__a, __b, __c) \
+  ({ \
+     union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b }; \
+     __builtin_neon_vst4_lanev4hi (__a, __bu.__o, __c); \
+   })
+
+#define vst4q_lane_s16(__a, __b, __c) \
+  ({ \
+     union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     __builtin_neon_vst4_lanev8hi (__a, __bu.__o, __c); \
+   })
+
+#define vst4q_lane_s32(__a, __b, __c) \
+  ({ \
+     union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     __builtin_neon_vst4_lanev4si (__a, __bu.__o, __c); \
+   })
+
+#define vst4q_lane_f32(__a, __b, __c) \
+  ({ \
+     union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     __builtin_neon_vst4_lanev4sf (__a, __bu.__o, __c); \
+   })
+
+#define vst4q_lane_u16(__a, __b, __c) \
+  ({ \
+     union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     __builtin_neon_vst4_lanev8hi (__a, __bu.__o, __c); \
+   })
+
+#define vst4q_lane_u32(__a, __b, __c) \
+  ({ \
+     union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     __builtin_neon_vst4_lanev4si (__a, __bu.__o, __c); \
+   })
+
+#define vst4q_lane_p16(__a, __b, __c) \
+  ({ \
+     union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b }; \
+     __builtin_neon_vst4_lanev8hi (__a, __bu.__o, __c); \
+   })
+
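The vst4 stores mirror vld4, and the _lane forms write just one element from
each of the four vectors to consecutive memory. A hedged sketch (names are
illustrative):

  #include <arm_neon.h>

  /* Store element 2 of each vector: four contiguous int16 values.  */
  void store_lane2 (int16_t *dst, int16x4x4_t v)
  {
    vst4_lane_s16 (dst, v, 2);  /* dst[i] = v.val[i][2] */
  }
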
+#define vand_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vandv8qi (__a, __b, 1);
+
+#define vand_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vandv4hi (__a, __b, 1);
+
+#define vand_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vandv2si (__a, __b, 1);
+
+#define vand_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vanddi (__a, __b, 1);
+
+#define vand_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vandv8qi (__a, __b, 0);
+
+#define vand_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vandv4hi (__a, __b, 0);
+
+#define vand_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vandv2si (__a, __b, 0);
+
+#define vand_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vanddi (__a, __b, 0);
+
+#define vandq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vandv16qi (__a, __b, 1);
+
+#define vandq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vandv8hi (__a, __b, 1);
+
+#define vandq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vandv4si (__a, __b, 1);
+
+#define vandq_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vandv2di (__a, __b, 1);
+
+#define vandq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vandv16qi (__a, __b, 0);
+
+#define vandq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vandv8hi (__a, __b, 0);
+
+#define vandq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vandv4si (__a, __b, 0);
+
+#define vandq_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vandv2di (__a, __b, 0);
+
+#define vorr_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vorrv8qi (__a, __b, 1);
+
+#define vorr_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vorrv4hi (__a, __b, 1);
+
+#define vorr_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vorrv2si (__a, __b, 1);
+
+#define vorr_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vorrdi (__a, __b, 1);
+
+#define vorr_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vorrv8qi (__a, __b, 0);
+
+#define vorr_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vorrv4hi (__a, __b, 0);
+
+#define vorr_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vorrv2si (__a, __b, 0);
+
+#define vorr_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vorrdi (__a, __b, 0);
+
+#define vorrq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vorrv16qi (__a, __b, 1);
+
+#define vorrq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vorrv8hi (__a, __b, 1);
+
+#define vorrq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vorrv4si (__a, __b, 1);
+
+#define vorrq_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vorrv2di (__a, __b, 1);
+
+#define vorrq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vorrv16qi (__a, __b, 0);
+
+#define vorrq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vorrv8hi (__a, __b, 0);
+
+#define vorrq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vorrv4si (__a, __b, 0);
+
+#define vorrq_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vorrv2di (__a, __b, 0);
+
+#define veor_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_veorv8qi (__a, __b, 1);
+
+#define veor_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_veorv4hi (__a, __b, 1);
+
+#define veor_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_veorv2si (__a, __b, 1);
+
+#define veor_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_veordi (__a, __b, 1);
+
+#define veor_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_veorv8qi (__a, __b, 0);
+
+#define veor_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_veorv4hi (__a, __b, 0);
+
+#define veor_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_veorv2si (__a, __b, 0);
+
+#define veor_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_veordi (__a, __b, 0);
+
+#define veorq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_veorv16qi (__a, __b, 1);
+
+#define veorq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_veorv8hi (__a, __b, 1);
+
+#define veorq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_veorv4si (__a, __b, 1);
+
+#define veorq_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_veorv2di (__a, __b, 1);
+
+#define veorq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_veorv16qi (__a, __b, 0);
+
+#define veorq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_veorv8hi (__a, __b, 0);
+
+#define veorq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_veorv4si (__a, __b, 0);
+
+#define veorq_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_veorv2di (__a, __b, 0);
+
+#define vbic_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vbicv8qi (__a, __b, 1);
+
+#define vbic_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vbicv4hi (__a, __b, 1);
+
+#define vbic_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vbicv2si (__a, __b, 1);
+
+#define vbic_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vbicdi (__a, __b, 1);
+
+#define vbic_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vbicv8qi (__a, __b, 0);
+
+#define vbic_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vbicv4hi (__a, __b, 0);
+
+#define vbic_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vbicv2si (__a, __b, 0);
+
+#define vbic_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vbicdi (__a, __b, 0);
+
+#define vbicq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vbicv16qi (__a, __b, 1);
+
+#define vbicq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vbicv8hi (__a, __b, 1);
+
+#define vbicq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vbicv4si (__a, __b, 1);
+
+#define vbicq_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vbicv2di (__a, __b, 1);
+
+#define vbicq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vbicv16qi (__a, __b, 0);
+
+#define vbicq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vbicv8hi (__a, __b, 0);
+
+#define vbicq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vbicv4si (__a, __b, 0);
+
+#define vbicq_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vbicv2di (__a, __b, 0);
+
+#define vorn_s8(__a, __b) \
+  (int8x8_t)__builtin_neon_vornv8qi (__a, __b, 1);
+
+#define vorn_s16(__a, __b) \
+  (int16x4_t)__builtin_neon_vornv4hi (__a, __b, 1);
+
+#define vorn_s32(__a, __b) \
+  (int32x2_t)__builtin_neon_vornv2si (__a, __b, 1);
+
+#define vorn_s64(__a, __b) \
+  (int64x1_t)__builtin_neon_vorndi (__a, __b, 1);
+
+#define vorn_u8(__a, __b) \
+  (uint8x8_t)__builtin_neon_vornv8qi (__a, __b, 0);
+
+#define vorn_u16(__a, __b) \
+  (uint16x4_t)__builtin_neon_vornv4hi (__a, __b, 0);
+
+#define vorn_u32(__a, __b) \
+  (uint32x2_t)__builtin_neon_vornv2si (__a, __b, 0);
+
+#define vorn_u64(__a, __b) \
+  (uint64x1_t)__builtin_neon_vorndi (__a, __b, 0);
+
+#define vornq_s8(__a, __b) \
+  (int8x16_t)__builtin_neon_vornv16qi (__a, __b, 1);
+
+#define vornq_s16(__a, __b) \
+  (int16x8_t)__builtin_neon_vornv8hi (__a, __b, 1);
+
+#define vornq_s32(__a, __b) \
+  (int32x4_t)__builtin_neon_vornv4si (__a, __b, 1);
+
+#define vornq_s64(__a, __b) \
+  (int64x2_t)__builtin_neon_vornv2di (__a, __b, 1);
+
+#define vornq_u8(__a, __b) \
+  (uint8x16_t)__builtin_neon_vornv16qi (__a, __b, 0);
+
+#define vornq_u16(__a, __b) \
+  (uint16x8_t)__builtin_neon_vornv8hi (__a, __b, 0);
+
+#define vornq_u32(__a, __b) \
+  (uint32x4_t)__builtin_neon_vornv4si (__a, __b, 0);
+
+#define vornq_u64(__a, __b) \
+  (uint64x2_t)__builtin_neon_vornv2di (__a, __b, 0);
+
+
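The bitwise group (vand/vorr/veor/vbic/vorn) needs no union punning: each macro
is a cast around the builtin plus a final flag argument, 1 for the signed
variants and 0 for unsigned. Note that, as committed, every expansion here ends
in a semicolon, so these macros work as standalone statements or initializers
but cannot be nested inside a larger expression; for instance
vand_u8 (a, vorr_u8 (b, c)) would not compile. A usage sketch:

  #include <arm_neon.h>

  uint8x8_t keep_low_nibbles (uint8x8_t v)
  {
    uint8x8_t mask = vdup_n_u8 (0x0f);
    uint8x8_t r = vand_u8 (v, mask);  /* fine as a full statement */
    return r;
  }
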
+#define vreinterpret_p8_s8(__a) \
+  (poly8x8_t)__builtin_neon_vreinterpretv8qiv8qi (__a);
+
+#define vreinterpret_p8_s16(__a) \
+  (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
+
+#define vreinterpret_p8_s32(__a) \
+  (poly8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a);
+
+#define vreinterpret_p8_s64(__a) \
+  (poly8x8_t)__builtin_neon_vreinterpretv8qidi (__a);
+
+#define vreinterpret_p8_f32(__a) \
+  (poly8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
+
+#define vreinterpret_p8_u8(__a) \
+  (poly8x8_t)__builtin_neon_vreinterpretv8qiv8qi (__a);
+
+#define vreinterpret_p8_u16(__a) \
+  (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
+
+#define vreinterpret_p8_u32(__a) \
+  (poly8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a);
+
+#define vreinterpret_p8_u64(__a) \
+  (poly8x8_t)__builtin_neon_vreinterpretv8qidi (__a);
+
+#define vreinterpret_p8_p16(__a) \
+  (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
+
+#define vreinterpretq_p8_s8(__a) \
+  (poly8x16_t)__builtin_neon_vreinterpretv16qiv16qi (__a);
+
+#define vreinterpretq_p8_s16(__a) \
+  (poly8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
+
+#define vreinterpretq_p8_s32(__a) \
+  (poly8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a);
+
+#define vreinterpretq_p8_s64(__a) \
+  (poly8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a);
+
+#define vreinterpretq_p8_f32(__a) \
+  (poly8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
+
+#define vreinterpretq_p8_u8(__a) \
+  (poly8x16_t)__builtin_neon_vreinterpretv16qiv16qi (__a);
+
+#define vreinterpretq_p8_u16(__a) \
+  (poly8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
+
+#define vreinterpretq_p8_u32(__a) \
+  (poly8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a);
+
+#define vreinterpretq_p8_u64(__a) \
+  (poly8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a);
+
+#define vreinterpretq_p8_p16(__a) \
+  (poly8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
+
+#define vreinterpret_p16_s8(__a) \
+  (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
+
+#define vreinterpret_p16_s16(__a) \
+  (poly16x4_t)__builtin_neon_vreinterpretv4hiv4hi (__a);
+
+#define vreinterpret_p16_s32(__a) \
+  (poly16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a);
+
+#define vreinterpret_p16_s64(__a) \
+  (poly16x4_t)__builtin_neon_vreinterpretv4hidi (__a);
+
+#define vreinterpret_p16_f32(__a) \
+  (poly16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
+
+#define vreinterpret_p16_u8(__a) \
+  (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
+
+#define vreinterpret_p16_u16(__a) \
+  (poly16x4_t)__builtin_neon_vreinterpretv4hiv4hi (__a);
+
+#define vreinterpret_p16_u32(__a) \
+  (poly16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a);
+
+#define vreinterpret_p16_u64(__a) \
+  (poly16x4_t)__builtin_neon_vreinterpretv4hidi (__a);
+
+#define vreinterpret_p16_p8(__a) \
+  (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
+
+#define vreinterpretq_p16_s8(__a) \
+  (poly16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
+
+#define vreinterpretq_p16_s16(__a) \
+  (poly16x8_t)__builtin_neon_vreinterpretv8hiv8hi (__a);
+
+#define vreinterpretq_p16_s32(__a) \
+  (poly16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a);
+
+#define vreinterpretq_p16_s64(__a) \
+  (poly16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a);
+
+#define vreinterpretq_p16_f32(__a) \
+  (poly16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
+
+#define vreinterpretq_p16_u8(__a) \
+  (poly16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
+
+#define vreinterpretq_p16_u16(__a) \
+  (poly16x8_t)__builtin_neon_vreinterpretv8hiv8hi (__a);
+
+#define vreinterpretq_p16_u32(__a) \
+  (poly16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a);
+
+#define vreinterpretq_p16_u64(__a) \
+  (poly16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a);
+
+#define vreinterpretq_p16_p8(__a) \
+  (poly16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
+
+#define vreinterpret_f32_s8(__a) \
+  (float32x2_t)__builtin_neon_vreinterpretv2sfv8qi (__a);
+
+#define vreinterpret_f32_s16(__a) \
+  (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi (__a);
+
+#define vreinterpret_f32_s32(__a) \
+  (float32x2_t)__builtin_neon_vreinterpretv2sfv2si (__a);
+
+#define vreinterpret_f32_s64(__a) \
+  (float32x2_t)__builtin_neon_vreinterpretv2sfdi (__a);
+
+#define vreinterpret_f32_u8(__a) \
+  (float32x2_t)__builtin_neon_vreinterpretv2sfv8qi (__a);
+
+#define vreinterpret_f32_u16(__a) \
+  (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi (__a);
+
+#define vreinterpret_f32_u32(__a) \
+  (float32x2_t)__builtin_neon_vreinterpretv2sfv2si (__a);
+
+#define vreinterpret_f32_u64(__a) \
+  (float32x2_t)__builtin_neon_vreinterpretv2sfdi (__a);
+
+#define vreinterpret_f32_p8(__a) \
+  (float32x2_t)__builtin_neon_vreinterpretv2sfv8qi (__a);
+
+#define vreinterpret_f32_p16(__a) \
+  (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi (__a);
+
+#define vreinterpretq_f32_s8(__a) \
+  (float32x4_t)__builtin_neon_vreinterpretv4sfv16qi (__a);
+
+#define vreinterpretq_f32_s16(__a) \
+  (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi (__a);
+
+#define vreinterpretq_f32_s32(__a) \
+  (float32x4_t)__builtin_neon_vreinterpretv4sfv4si (__a);
+
+#define vreinterpretq_f32_s64(__a) \
+  (float32x4_t)__builtin_neon_vreinterpretv4sfv2di (__a);
+
+#define vreinterpretq_f32_u8(__a) \
+  (float32x4_t)__builtin_neon_vreinterpretv4sfv16qi (__a);
+
+#define vreinterpretq_f32_u16(__a) \
+  (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi (__a);
+
+#define vreinterpretq_f32_u32(__a) \
+  (float32x4_t)__builtin_neon_vreinterpretv4sfv4si (__a);
+
+#define vreinterpretq_f32_u64(__a) \
+  (float32x4_t)__builtin_neon_vreinterpretv4sfv2di (__a);
+
+#define vreinterpretq_f32_p8(__a) \
+  (float32x4_t)__builtin_neon_vreinterpretv4sfv16qi (__a);
+
+#define vreinterpretq_f32_p16(__a) \
+  (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi (__a);
+
+#define vreinterpret_s64_s8(__a) \
+  (int64x1_t)__builtin_neon_vreinterpretdiv8qi (__a);
+
+#define vreinterpret_s64_s16(__a) \
+  (int64x1_t)__builtin_neon_vreinterpretdiv4hi (__a);
+
+#define vreinterpret_s64_s32(__a) \
+  (int64x1_t)__builtin_neon_vreinterpretdiv2si (__a);
+
+#define vreinterpret_s64_f32(__a) \
+  (int64x1_t)__builtin_neon_vreinterpretdiv2sf (__a);
+
+#define vreinterpret_s64_u8(__a) \
+  (int64x1_t)__builtin_neon_vreinterpretdiv8qi (__a);
+
+#define vreinterpret_s64_u16(__a) \
+  (int64x1_t)__builtin_neon_vreinterpretdiv4hi (__a);
+
+#define vreinterpret_s64_u32(__a) \
+  (int64x1_t)__builtin_neon_vreinterpretdiv2si (__a);
+
+#define vreinterpret_s64_u64(__a) \
+  (int64x1_t)__builtin_neon_vreinterpretdidi (__a);
+
+#define vreinterpret_s64_p8(__a) \
+  (int64x1_t)__builtin_neon_vreinterpretdiv8qi (__a);
+
+#define vreinterpret_s64_p16(__a) \
+  (int64x1_t)__builtin_neon_vreinterpretdiv4hi (__a);
+
+#define vreinterpretq_s64_s8(__a) \
+  (int64x2_t)__builtin_neon_vreinterpretv2div16qi (__a);
+
+#define vreinterpretq_s64_s16(__a) \
+  (int64x2_t)__builtin_neon_vreinterpretv2div8hi (__a);
+
+#define vreinterpretq_s64_s32(__a) \
+  (int64x2_t)__builtin_neon_vreinterpretv2div4si (__a);
+
+#define vreinterpretq_s64_f32(__a) \
+  (int64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
+
+#define vreinterpretq_s64_u8(__a) \
+  (int64x2_t)__builtin_neon_vreinterpretv2div16qi (__a);
+
+#define vreinterpretq_s64_u16(__a) \
+  (int64x2_t)__builtin_neon_vreinterpretv2div8hi (__a);
+
+#define vreinterpretq_s64_u32(__a) \
+  (int64x2_t)__builtin_neon_vreinterpretv2div4si (__a);
+
+#define vreinterpretq_s64_u64(__a) \
+  (int64x2_t)__builtin_neon_vreinterpretv2div2di (__a);
+
+#define vreinterpretq_s64_p8(__a) \
+  (int64x2_t)__builtin_neon_vreinterpretv2div16qi (__a);
+
+#define vreinterpretq_s64_p16(__a) \
+  (int64x2_t)__builtin_neon_vreinterpretv2div8hi (__a);
+
+#define vreinterpret_u64_s8(__a) \
+  (uint64x1_t)__builtin_neon_vreinterpretdiv8qi (__a);
+
+#define vreinterpret_u64_s16(__a) \
+  (uint64x1_t)__builtin_neon_vreinterpretdiv4hi (__a);
+
+#define vreinterpret_u64_s32(__a) \
+  (uint64x1_t)__builtin_neon_vreinterpretdiv2si (__a);
+
+#define vreinterpret_u64_s64(__a) \
+  (uint64x1_t)__builtin_neon_vreinterpretdidi (__a);
+
+#define vreinterpret_u64_f32(__a) \
+  (uint64x1_t)__builtin_neon_vreinterpretdiv2sf (__a);
+
+#define vreinterpret_u64_u8(__a) \
+  (uint64x1_t)__builtin_neon_vreinterpretdiv8qi (__a);
+
+#define vreinterpret_u64_u16(__a) \
+  (uint64x1_t)__builtin_neon_vreinterpretdiv4hi (__a);
+
+#define vreinterpret_u64_u32(__a) \
+  (uint64x1_t)__builtin_neon_vreinterpretdiv2si (__a);
+
+#define vreinterpret_u64_p8(__a) \
+  (uint64x1_t)__builtin_neon_vreinterpretdiv8qi (__a);
+
+#define vreinterpret_u64_p16(__a) \
+  (uint64x1_t)__builtin_neon_vreinterpretdiv4hi (__a);
+
+#define vreinterpretq_u64_s8(__a) \
+  (uint64x2_t)__builtin_neon_vreinterpretv2div16qi (__a);
+
+#define vreinterpretq_u64_s16(__a) \
+  (uint64x2_t)__builtin_neon_vreinterpretv2div8hi (__a);
+
+#define vreinterpretq_u64_s32(__a) \
+  (uint64x2_t)__builtin_neon_vreinterpretv2div4si (__a);
+
+#define vreinterpretq_u64_s64(__a) \
+  (uint64x2_t)__builtin_neon_vreinterpretv2div2di (__a);
+
+#define vreinterpretq_u64_f32(__a) \
+  (uint64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
+
+#define vreinterpretq_u64_u8(__a) \
+  (uint64x2_t)__builtin_neon_vreinterpretv2div16qi (__a);
+
+#define vreinterpretq_u64_u16(__a) \
+  (uint64x2_t)__builtin_neon_vreinterpretv2div8hi (__a);
+
+#define vreinterpretq_u64_u32(__a) \
+  (uint64x2_t)__builtin_neon_vreinterpretv2div4si (__a);
+
+#define vreinterpretq_u64_p8(__a) \
+  (uint64x2_t)__builtin_neon_vreinterpretv2div16qi (__a);
+
+#define vreinterpretq_u64_p16(__a) \
+  (uint64x2_t)__builtin_neon_vreinterpretv2div8hi (__a);
+
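The vreinterpret block (continuing below) is pure type punning: the bits of the
64- or 128-bit value pass through unchanged and only the element type seen by C
changes, so the compiler should emit no code for it. For example, to read out
the raw IEEE-754 bit patterns of a float vector (sketch):

  #include <arm_neon.h>

  /* Same 64 bits, viewed as two u32 lanes instead of two floats.  */
  uint32x2_t float_bits (float32x2_t f)
  {
    return vreinterpret_u32_f32 (f);
  }
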
+#define vreinterpret_s8_s16(__a) \
+  (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
+
+#define vreinterpret_s8_s32(__a) \
+  (int8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a);
+
+#define vreinterpret_s8_s64(__a) \
+  (int8x8_t)__builtin_neon_vreinterpretv8qidi (__a);
+
+#define vreinterpret_s8_f32(__a) \
+  (int8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
+
+#define vreinterpret_s8_u8(__a) \
+  (int8x8_t)__builtin_neon_vreinterpretv8qiv8qi (__a);
+
+#define vreinterpret_s8_u16(__a) \
+  (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
+
+#define vreinterpret_s8_u32(__a) \
+  (int8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a);
+
+#define vreinterpret_s8_u64(__a) \
+  (int8x8_t)__builtin_neon_vreinterpretv8qidi (__a);
+
+#define vreinterpret_s8_p8(__a) \
+  (int8x8_t)__builtin_neon_vreinterpretv8qiv8qi (__a);
+
+#define vreinterpret_s8_p16(__a) \
+  (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
+
+#define vreinterpretq_s8_s16(__a) \
+  (int8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
+
+#define vreinterpretq_s8_s32(__a) \
+  (int8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a);
+
+#define vreinterpretq_s8_s64(__a) \
+  (int8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a);
+
+#define vreinterpretq_s8_f32(__a) \
+  (int8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
+
+#define vreinterpretq_s8_u8(__a) \
+  (int8x16_t)__builtin_neon_vreinterpretv16qiv16qi (__a);
+
+#define vreinterpretq_s8_u16(__a) \
+  (int8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
+
+#define vreinterpretq_s8_u32(__a) \
+  (int8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a);
+
+#define vreinterpretq_s8_u64(__a) \
+  (int8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a);
+
+#define vreinterpretq_s8_p8(__a) \
+  (int8x16_t)__builtin_neon_vreinterpretv16qiv16qi (__a);
+
+#define vreinterpretq_s8_p16(__a) \
+  (int8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
+
+#define vreinterpret_s16_s8(__a) \
+  (int16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
+
+#define vreinterpret_s16_s32(__a) \
+  (int16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a);
+
+#define vreinterpret_s16_s64(__a) \
+  (int16x4_t)__builtin_neon_vreinterpretv4hidi (__a);
+
+#define vreinterpret_s16_f32(__a) \
+  (int16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
+
+#define vreinterpret_s16_u8(__a) \
+  (int16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
+
+#define vreinterpret_s16_u16(__a) \
+  (int16x4_t)__builtin_neon_vreinterpretv4hiv4hi (__a);
+
+#define vreinterpret_s16_u32(__a) \
+  (int16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a);
+
+#define vreinterpret_s16_u64(__a) \
+  (int16x4_t)__builtin_neon_vreinterpretv4hidi (__a);
+
+#define vreinterpret_s16_p8(__a) \
+  (int16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
+
+#define vreinterpret_s16_p16(__a) \
+  (int16x4_t)__builtin_neon_vreinterpretv4hiv4hi (__a);
+
+#define vreinterpretq_s16_s8(__a) \
+  (int16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
+
+#define vreinterpretq_s16_s32(__a) \
+  (int16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a);
+
+#define vreinterpretq_s16_s64(__a) \
+  (int16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a);
+
+#define vreinterpretq_s16_f32(__a) \
+  (int16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
+
+#define vreinterpretq_s16_u8(__a) \
+  (int16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
+
+#define vreinterpretq_s16_u16(__a) \
+  (int16x8_t)__builtin_neon_vreinterpretv8hiv8hi (__a);
+
+#define vreinterpretq_s16_u32(__a) \
+  (int16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a);
+
+#define vreinterpretq_s16_u64(__a) \
+  (int16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a);
+
+#define vreinterpretq_s16_p8(__a) \
+  (int16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
+
+#define vreinterpretq_s16_p16(__a) \
+  (int16x8_t)__builtin_neon_vreinterpretv8hiv8hi (__a);
+
+#define vreinterpret_s32_s8(__a) \
+  (int32x2_t)__builtin_neon_vreinterpretv2siv8qi (__a);
+
+#define vreinterpret_s32_s16(__a) \
+  (int32x2_t)__builtin_neon_vreinterpretv2siv4hi (__a);
+
+#define vreinterpret_s32_s64(__a) \
+  (int32x2_t)__builtin_neon_vreinterpretv2sidi (__a);
+
+#define vreinterpret_s32_f32(__a) \
+  (int32x2_t)__builtin_neon_vreinterpretv2siv2sf (__a);
+
+#define vreinterpret_s32_u8(__a) \
+  (int32x2_t)__builtin_neon_vreinterpretv2siv8qi (__a);
+
+#define vreinterpret_s32_u16(__a) \
+  (int32x2_t)__builtin_neon_vreinterpretv2siv4hi (__a);
+
+#define vreinterpret_s32_u32(__a) \
+  (int32x2_t)__builtin_neon_vreinterpretv2siv2si (__a);
+
+#define vreinterpret_s32_u64(__a) \
+  (int32x2_t)__builtin_neon_vreinterpretv2sidi (__a);
+
+#define vreinterpret_s32_p8(__a) \
+  (int32x2_t)__builtin_neon_vreinterpretv2siv8qi (__a);
+
+#define vreinterpret_s32_p16(__a) \
+  (int32x2_t)__builtin_neon_vreinterpretv2siv4hi (__a);
+
+#define vreinterpretq_s32_s8(__a) \
+  (int32x4_t)__builtin_neon_vreinterpretv4siv16qi (__a);
+
+#define vreinterpretq_s32_s16(__a) \
+  (int32x4_t)__builtin_neon_vreinterpretv4siv8hi (__a);
+
+#define vreinterpretq_s32_s64(__a) \
+  (int32x4_t)__builtin_neon_vreinterpretv4siv2di (__a);
+
+#define vreinterpretq_s32_f32(__a) \
+  (int32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a);
+
+#define vreinterpretq_s32_u8(__a) \
+  (int32x4_t)__builtin_neon_vreinterpretv4siv16qi (__a);
+
+#define vreinterpretq_s32_u16(__a) \
+  (int32x4_t)__builtin_neon_vreinterpretv4siv8hi (__a);
+
+#define vreinterpretq_s32_u32(__a) \
+  (int32x4_t)__builtin_neon_vreinterpretv4siv4si (__a);
+
+#define vreinterpretq_s32_u64(__a) \
+  (int32x4_t)__builtin_neon_vreinterpretv4siv2di (__a);
+
+#define vreinterpretq_s32_p8(__a) \
+  (int32x4_t)__builtin_neon_vreinterpretv4siv16qi (__a);
+
+#define vreinterpretq_s32_p16(__a) \
+  (int32x4_t)__builtin_neon_vreinterpretv4siv8hi (__a);
+
+#define vreinterpret_u8_s8(__a) \
+  (uint8x8_t)__builtin_neon_vreinterpretv8qiv8qi (__a);
+
+#define vreinterpret_u8_s16(__a) \
+  (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
+
+#define vreinterpret_u8_s32(__a) \
+  (uint8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a);
+
+#define vreinterpret_u8_s64(__a) \
+  (uint8x8_t)__builtin_neon_vreinterpretv8qidi (__a);
+
+#define vreinterpret_u8_f32(__a) \
+  (uint8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
+
+#define vreinterpret_u8_u16(__a) \
+  (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
+
+#define vreinterpret_u8_u32(__a) \
+  (uint8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a);
+
+#define vreinterpret_u8_u64(__a) \
+  (uint8x8_t)__builtin_neon_vreinterpretv8qidi (__a);
+
+#define vreinterpret_u8_p8(__a) \
+  (uint8x8_t)__builtin_neon_vreinterpretv8qiv8qi (__a);
+
+#define vreinterpret_u8_p16(__a) \
+  (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
+
+#define vreinterpretq_u8_s8(__a) \
+  (uint8x16_t)__builtin_neon_vreinterpretv16qiv16qi (__a);
+
+#define vreinterpretq_u8_s16(__a) \
+  (uint8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
+
+#define vreinterpretq_u8_s32(__a) \
+  (uint8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a);
+
+#define vreinterpretq_u8_s64(__a) \
+  (uint8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a);
+
+#define vreinterpretq_u8_f32(__a) \
+  (uint8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
+
+#define vreinterpretq_u8_u16(__a) \
+  (uint8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
+
+#define vreinterpretq_u8_u32(__a) \
+  (uint8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a);
+
+#define vreinterpretq_u8_u64(__a) \
+  (uint8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a);
+
+#define vreinterpretq_u8_p8(__a) \
+  (uint8x16_t)__builtin_neon_vreinterpretv16qiv16qi (__a);
+
+#define vreinterpretq_u8_p16(__a) \
+  (uint8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
+
+#define vreinterpret_u16_s8(__a) \
+  (uint16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
+
+#define vreinterpret_u16_s16(__a) \
+  (uint16x4_t)__builtin_neon_vreinterpretv4hiv4hi (__a);
+
+#define vreinterpret_u16_s32(__a) \
+  (uint16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a);
+
+#define vreinterpret_u16_s64(__a) \
+  (uint16x4_t)__builtin_neon_vreinterpretv4hidi (__a);
+
+#define vreinterpret_u16_f32(__a) \
+  (uint16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
+
+#define vreinterpret_u16_u8(__a) \
+  (uint16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
+
+#define vreinterpret_u16_u32(__a) \
+  (uint16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a);
+
+#define vreinterpret_u16_u64(__a) \
+  (uint16x4_t)__builtin_neon_vreinterpretv4hidi (__a);
+
+#define vreinterpret_u16_p8(__a) \
+  (uint16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
+
+#define vreinterpret_u16_p16(__a) \
+  (uint16x4_t)__builtin_neon_vreinterpretv4hiv4hi (__a);
+
+#define vreinterpretq_u16_s8(__a) \
+  (uint16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
+
+#define vreinterpretq_u16_s16(__a) \
+  (uint16x8_t)__builtin_neon_vreinterpretv8hiv8hi (__a);
+
+#define vreinterpretq_u16_s32(__a) \
+  (uint16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a);
+
+#define vreinterpretq_u16_s64(__a) \
+  (uint16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a);
+
+#define vreinterpretq_u16_f32(__a) \
+  (uint16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
+
+#define vreinterpretq_u16_u8(__a) \
+  (uint16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
+
+#define vreinterpretq_u16_u32(__a) \
+  (uint16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a);
+
+#define vreinterpretq_u16_u64(__a) \
+  (uint16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a);
+
+#define vreinterpretq_u16_p8(__a) \
+  (uint16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
+
+#define vreinterpretq_u16_p16(__a) \
+  (uint16x8_t)__builtin_neon_vreinterpretv8hiv8hi (__a);
+
+#define vreinterpret_u32_s8(__a) \
+  (uint32x2_t)__builtin_neon_vreinterpretv2siv8qi (__a);
+
+#define vreinterpret_u32_s16(__a) \
+  (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi (__a);
+
+#define vreinterpret_u32_s32(__a) \
+  (uint32x2_t)__builtin_neon_vreinterpretv2siv2si (__a);
+
+#define vreinterpret_u32_s64(__a) \
+  (uint32x2_t)__builtin_neon_vreinterpretv2sidi (__a);
+
+#define vreinterpret_u32_f32(__a) \
+  (uint32x2_t)__builtin_neon_vreinterpretv2siv2sf (__a);
+
+#define vreinterpret_u32_u8(__a) \
+  (uint32x2_t)__builtin_neon_vreinterpretv2siv8qi (__a);
+
+#define vreinterpret_u32_u16(__a) \
+  (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi (__a);
+
+#define vreinterpret_u32_u64(__a) \
+  (uint32x2_t)__builtin_neon_vreinterpretv2sidi (__a);
+
+#define vreinterpret_u32_p8(__a) \
+  (uint32x2_t)__builtin_neon_vreinterpretv2siv8qi (__a);
+
+#define vreinterpret_u32_p16(__a) \
+  (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi (__a);
+
+#define vreinterpretq_u32_s8(__a) \
+  (uint32x4_t)__builtin_neon_vreinterpretv4siv16qi (__a);
+
+#define vreinterpretq_u32_s16(__a) \
+  (uint32x4_t)__builtin_neon_vreinterpretv4siv8hi (__a);
+
+#define vreinterpretq_u32_s32(__a) \
+  (uint32x4_t)__builtin_neon_vreinterpretv4siv4si (__a);
+
+#define vreinterpretq_u32_s64(__a) \
+  (uint32x4_t)__builtin_neon_vreinterpretv4siv2di (__a);
+
+#define vreinterpretq_u32_f32(__a) \
+  (uint32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a);
+
+#define vreinterpretq_u32_u8(__a) \
+  (uint32x4_t)__builtin_neon_vreinterpretv4siv16qi (__a);
+
+#define vreinterpretq_u32_u16(__a) \
+  (uint32x4_t)__builtin_neon_vreinterpretv4siv8hi (__a);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_u16 (uint16_t * __a, uint16x4x2_t __b)
-{
-  union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
+#define vreinterpretq_u32_u64(__a) \
+  (uint32x4_t)__builtin_neon_vreinterpretv4siv2di (__a);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_u32 (uint32_t * __a, uint32x2x2_t __b)
-{
-  union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2v2si ((__builtin_neon_si *) __a, __bu.__o);
-}
+#define vreinterpretq_u32_p8(__a) \
+  (uint32x4_t)__builtin_neon_vreinterpretv4siv16qi (__a);
 
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_p8 (poly8_t * __a, poly8x8x2_t __b)
-{
-  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_p16 (poly16_t * __a, poly16x4x2_t __b)
-{
-  union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_s64 (int64_t * __a, int64x1x2_t __b)
-{
-  union { int64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2di ((__builtin_neon_di *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_u64 (uint64_t * __a, uint64x1x2_t __b)
-{
-  union { uint64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2di ((__builtin_neon_di *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_s8 (int8_t * __a, int8x16x2_t __b)
-{
-  union { int8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_s16 (int16_t * __a, int16x8x2_t __b)
-{
-  union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_s32 (int32_t * __a, int32x4x2_t __b)
-{
-  union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst2v4si ((__builtin_neon_si *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_f32 (float32_t * __a, float32x4x2_t __b)
-{
-  union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst2v4sf (__a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_u8 (uint8_t * __a, uint8x16x2_t __b)
-{
-  union { uint8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_u16 (uint16_t * __a, uint16x8x2_t __b)
-{
-  union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_u32 (uint32_t * __a, uint32x4x2_t __b)
-{
-  union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst2v4si ((__builtin_neon_si *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_p8 (poly8_t * __a, poly8x16x2_t __b)
-{
-  union { poly8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_p16 (poly16_t * __a, poly16x8x2_t __b)
-{
-  union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_lane_s8 (int8_t * __a, int8x8x2_t __b, const int __c)
-{
-  union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_lane_s16 (int16_t * __a, int16x4x2_t __b, const int __c)
-{
-  union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_lane_s32 (int32_t * __a, int32x2x2_t __b, const int __c)
-{
-  union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_lane_f32 (float32_t * __a, float32x2x2_t __b, const int __c)
-{
-  union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2_lanev2sf (__a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_lane_u8 (uint8_t * __a, uint8x8x2_t __b, const int __c)
-{
-  union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_lane_u16 (uint16_t * __a, uint16x4x2_t __b, const int __c)
-{
-  union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_lane_u32 (uint32_t * __a, uint32x2x2_t __b, const int __c)
-{
-  union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_lane_p8 (poly8_t * __a, poly8x8x2_t __b, const int __c)
-{
-  union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2_lane_p16 (poly16_t * __a, poly16x4x2_t __b, const int __c)
-{
-  union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
-  __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_lane_s16 (int16_t * __a, int16x8x2_t __b, const int __c)
-{
-  union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_lane_s32 (int32_t * __a, int32x4x2_t __b, const int __c)
-{
-  union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst2_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_lane_f32 (float32_t * __a, float32x4x2_t __b, const int __c)
-{
-  union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst2_lanev4sf (__a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_lane_u16 (uint16_t * __a, uint16x8x2_t __b, const int __c)
-{
-  union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_lane_u32 (uint32_t * __a, uint32x4x2_t __b, const int __c)
-{
-  union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst2_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst2q_lane_p16 (poly16_t * __a, poly16x8x2_t __b, const int __c)
-{
-  union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
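From here the hunk deletes the old __extension__ static __inline wrappers that
the macros replace: vst2 above, vld3 from here down. The bodies are the same
union-punning pattern; the visible differences are that the functions took
typed pointers and cast them to the __builtin_neon_* element types (e.g.
(__builtin_neon_hi *) __a), which the macro forms pass through uncast, and that
with macros the arguments are checked only by the builtin itself. Call sites
should remain source-compatible; a sketch, assuming the matching vst2_u16 macro
is added earlier in this same patch:

  #include <arm_neon.h>

  void store_pair (uint16_t *dst, uint16x4x2_t two)
  {
    vst2_u16 (dst, two);  /* previously an inline function, now presumably a macro */
  }
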
-__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
-vld3_s8 (const int8_t * __a)
-{
-  union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
-vld3_s16 (const int16_t * __a)
-{
-  union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
-vld3_s32 (const int32_t * __a)
-{
-  union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v2si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
-vld3_f32 (const float32_t * __a)
-{
-  union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v2sf (__a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
-vld3_u8 (const uint8_t * __a)
-{
-  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
-vld3_u16 (const uint16_t * __a)
-{
-  union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
-vld3_u32 (const uint32_t * __a)
-{
-  union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v2si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
-vld3_p8 (const poly8_t * __a)
-{
-  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
-vld3_p16 (const poly16_t * __a)
-{
-  union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
-vld3_s64 (const int64_t * __a)
-{
-  union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3di ((const __builtin_neon_di *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
-vld3_u64 (const uint64_t * __a)
-{
-  union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3di ((const __builtin_neon_di *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
-vld3q_s8 (const int8_t * __a)
-{
-  union { int8x16x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v16qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
-vld3q_s16 (const int16_t * __a)
-{
-  union { int16x8x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v8hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
-vld3q_s32 (const int32_t * __a)
-{
-  union { int32x4x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v4si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
-vld3q_f32 (const float32_t * __a)
-{
-  union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v4sf (__a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__))
-vld3q_u8 (const uint8_t * __a)
-{
-  union { uint8x16x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v16qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
-vld3q_u16 (const uint16_t * __a)
-{
-  union { uint16x8x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v8hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
-vld3q_u32 (const uint32_t * __a)
-{
-  union { uint32x4x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v4si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__))
-vld3q_p8 (const poly8_t * __a)
-{
-  union { poly8x16x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v16qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
-vld3q_p16 (const poly16_t * __a)
-{
-  union { poly16x8x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3v8hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
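[Editor's usage sketch, not part of the diff: the vld3 family maps to the VLD3 structure-load instruction, which reads three interleaved elements per slot and de-interleaves them into the three vectors of the returned x3 struct. Helper name invented here; assumes a NEON-capable compile such as -mfpu=neon.]

    #include <arm_neon.h>

    /* Split 24 packed RGB bytes into separate R, G, B vectors.
       vld3_u8 de-interleaves as it loads. */
    void rgb_split (const uint8_t *rgb,
                    uint8x8_t *r, uint8x8_t *g, uint8x8_t *b)
    {
      uint8x8x3_t v = vld3_u8 (rgb);  /* val[0]=R, val[1]=G, val[2]=B */
      *r = v.val[0];
      *g = v.val[1];
      *b = v.val[2];
    }
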
-__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
-vld3_lane_s8 (const int8_t * __a, int8x8x3_t __b, const int __c)
-{
-  union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
-vld3_lane_s16 (const int16_t * __a, int16x4x3_t __b, const int __c)
-{
-  union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
-vld3_lane_s32 (const int32_t * __a, int32x2x3_t __b, const int __c)
-{
-  union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
-vld3_lane_f32 (const float32_t * __a, float32x2x3_t __b, const int __c)
-{
-  union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_lanev2sf (__a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
-vld3_lane_u8 (const uint8_t * __a, uint8x8x3_t __b, const int __c)
-{
-  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
-vld3_lane_u16 (const uint16_t * __a, uint16x4x3_t __b, const int __c)
-{
-  union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
-vld3_lane_u32 (const uint32_t * __a, uint32x2x3_t __b, const int __c)
-{
-  union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
-vld3_lane_p8 (const poly8_t * __a, poly8x8x3_t __b, const int __c)
-{
-  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
-vld3_lane_p16 (const poly16_t * __a, poly16x4x3_t __b, const int __c)
-{
-  union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
-vld3q_lane_s16 (const int16_t * __a, int16x8x3_t __b, const int __c)
-{
-  union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  union { int16x8x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
-vld3q_lane_s32 (const int32_t * __a, int32x4x3_t __b, const int __c)
-{
-  union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  union { int32x4x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
-vld3q_lane_f32 (const float32_t * __a, float32x4x3_t __b, const int __c)
-{
-  union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_lanev4sf (__a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
-vld3q_lane_u16 (const uint16_t * __a, uint16x8x3_t __b, const int __c)
-{
-  union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  union { uint16x8x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
-vld3q_lane_u32 (const uint32_t * __a, uint32x4x3_t __b, const int __c)
-{
-  union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  union { uint32x4x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
-vld3q_lane_p16 (const poly16_t * __a, poly16x8x3_t __b, const int __c)
-{
-  union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  union { poly16x8x3_t __i; __builtin_neon_ci __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
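[Editor's note: the _lane forms load exactly three scalars into one lane of each of the three vectors, passing all other lanes through from the incoming argument; the q-register lane variants above exist only for 16- and 32-bit elements, matching the single-lane VLD3 encoding. A minimal sketch, helper name invented:]

    #include <arm_neon.h>

    /* Replace lane 2 of each of the three vectors in v with the
       three int16_t values at p; other lanes pass through unchanged. */
    int16x4x3_t patch_lane2 (const int16_t *p, int16x4x3_t v)
    {
      return vld3_lane_s16 (p, v, 2);
    }
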
-__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
-vld3_dup_s8 (const int8_t * __a)
-{
-  union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_dupv8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
-vld3_dup_s16 (const int16_t * __a)
-{
-  union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_dupv4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
-vld3_dup_s32 (const int32_t * __a)
-{
-  union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_dupv2si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
-vld3_dup_f32 (const float32_t * __a)
-{
-  union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_dupv2sf (__a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
-vld3_dup_u8 (const uint8_t * __a)
-{
-  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_dupv8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
-vld3_dup_u16 (const uint16_t * __a)
-{
-  union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_dupv4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
-vld3_dup_u32 (const uint32_t * __a)
-{
-  union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_dupv2si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
-vld3_dup_p8 (const poly8_t * __a)
-{
-  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_dupv8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
-vld3_dup_p16 (const poly16_t * __a)
-{
-  union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_dupv4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
-vld3_dup_s64 (const int64_t * __a)
-{
-  union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_dupdi ((const __builtin_neon_di *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
-vld3_dup_u64 (const uint64_t * __a)
-{
-  union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv;
-  __rv.__o = __builtin_neon_vld3_dupdi ((const __builtin_neon_di *) __a);
-  return __rv.__i;
-}
-
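[Editor's note: the _dup forms load three consecutive scalars and replicate each one across every lane of its result vector. A sketch, name invented:]

    #include <arm_neon.h>

    /* val[0] = {p[0], p[0]}, val[1] = {p[1], p[1]}, val[2] = {p[2], p[2]} */
    float32x2x3_t splat3 (const float32_t *p)
    {
      return vld3_dup_f32 (p);
    }
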
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_s8 (int8_t * __a, int8x8x3_t __b)
-{
-  union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_s16 (int16_t * __a, int16x4x3_t __b)
-{
-  union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_s32 (int32_t * __a, int32x2x3_t __b)
-{
-  union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3v2si ((__builtin_neon_si *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_f32 (float32_t * __a, float32x2x3_t __b)
-{
-  union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3v2sf (__a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_u8 (uint8_t * __a, uint8x8x3_t __b)
-{
-  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_u16 (uint16_t * __a, uint16x4x3_t __b)
-{
-  union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_u32 (uint32_t * __a, uint32x2x3_t __b)
-{
-  union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3v2si ((__builtin_neon_si *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_p8 (poly8_t * __a, poly8x8x3_t __b)
-{
-  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_p16 (poly16_t * __a, poly16x4x3_t __b)
-{
-  union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_s64 (int64_t * __a, int64x1x3_t __b)
-{
-  union { int64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3di ((__builtin_neon_di *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_u64 (uint64_t * __a, uint64x1x3_t __b)
-{
-  union { uint64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3di ((__builtin_neon_di *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_s8 (int8_t * __a, int8x16x3_t __b)
-{
-  union { int8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_s16 (int16_t * __a, int16x8x3_t __b)
-{
-  union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_s32 (int32_t * __a, int32x4x3_t __b)
-{
-  union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  __builtin_neon_vst3v4si ((__builtin_neon_si *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_f32 (float32_t * __a, float32x4x3_t __b)
-{
-  union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  __builtin_neon_vst3v4sf (__a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_u8 (uint8_t * __a, uint8x16x3_t __b)
-{
-  union { uint8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_u16 (uint16_t * __a, uint16x8x3_t __b)
-{
-  union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_u32 (uint32_t * __a, uint32x4x3_t __b)
-{
-  union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  __builtin_neon_vst3v4si ((__builtin_neon_si *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_p8 (poly8_t * __a, poly8x16x3_t __b)
-{
-  union { poly8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_p16 (poly16_t * __a, poly16x8x3_t __b)
-{
-  union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
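[Editor's usage sketch: vst3 is the interleaving store, the inverse of vld3. Helper name invented:]

    #include <arm_neon.h>

    /* Interleave three planes back into packed RGB (24 bytes):
       writes r0,g0,b0,r1,g1,b1,...  */
    void rgb_merge (uint8_t *rgb, uint8x8_t r, uint8x8_t g, uint8x8_t b)
    {
      uint8x8x3_t v = { { r, g, b } };
      vst3_u8 (rgb, v);
    }
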
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_lane_s8 (int8_t * __a, int8x8x3_t __b, const int __c)
-{
-  union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_lane_s16 (int16_t * __a, int16x4x3_t __b, const int __c)
-{
-  union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_lane_s32 (int32_t * __a, int32x2x3_t __b, const int __c)
-{
-  union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_lane_f32 (float32_t * __a, float32x2x3_t __b, const int __c)
-{
-  union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3_lanev2sf (__a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_lane_u8 (uint8_t * __a, uint8x8x3_t __b, const int __c)
-{
-  union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_lane_u16 (uint16_t * __a, uint16x4x3_t __b, const int __c)
-{
-  union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_lane_u32 (uint32_t * __a, uint32x2x3_t __b, const int __c)
-{
-  union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_lane_p8 (poly8_t * __a, poly8x8x3_t __b, const int __c)
-{
-  union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3_lane_p16 (poly16_t * __a, poly16x4x3_t __b, const int __c)
-{
-  union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
-  __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_lane_s16 (int16_t * __a, int16x8x3_t __b, const int __c)
-{
-  union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_lane_s32 (int32_t * __a, int32x4x3_t __b, const int __c)
-{
-  union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  __builtin_neon_vst3_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_lane_f32 (float32_t * __a, float32x4x3_t __b, const int __c)
-{
-  union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  __builtin_neon_vst3_lanev4sf (__a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_lane_u16 (uint16_t * __a, uint16x8x3_t __b, const int __c)
-{
-  union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_lane_u32 (uint32_t * __a, uint32x4x3_t __b, const int __c)
-{
-  union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  __builtin_neon_vst3_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst3q_lane_p16 (poly16_t * __a, poly16x8x3_t __b, const int __c)
-{
-  union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
-  __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
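[Editor's note: the vst3 _lane forms store one lane from each of the three vectors, i.e. exactly three elements. A sketch, name invented:]

    #include <arm_neon.h>

    /* Store lane 1 of each vector of v: exactly three int32_t writes. */
    void store_lane1 (int32_t *p, int32x2x3_t v)
    {
      vst3_lane_s32 (p, v, 1);
    }
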
-__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
-vld4_s8 (const int8_t * __a)
-{
-  union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
-vld4_s16 (const int16_t * __a)
-{
-  union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
-vld4_s32 (const int32_t * __a)
-{
-  union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v2si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
-vld4_f32 (const float32_t * __a)
-{
-  union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v2sf (__a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
-vld4_u8 (const uint8_t * __a)
-{
-  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
-vld4_u16 (const uint16_t * __a)
-{
-  union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
-vld4_u32 (const uint32_t * __a)
-{
-  union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v2si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
-vld4_p8 (const poly8_t * __a)
-{
-  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
-vld4_p16 (const poly16_t * __a)
-{
-  union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
-vld4_s64 (const int64_t * __a)
-{
-  union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4di ((const __builtin_neon_di *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
-vld4_u64 (const uint64_t * __a)
-{
-  union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4di ((const __builtin_neon_di *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
-vld4q_s8 (const int8_t * __a)
-{
-  union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v16qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
-vld4q_s16 (const int16_t * __a)
-{
-  union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v8hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
-vld4q_s32 (const int32_t * __a)
-{
-  union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v4si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
-vld4q_f32 (const float32_t * __a)
-{
-  union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v4sf (__a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__))
-vld4q_u8 (const uint8_t * __a)
-{
-  union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v16qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
-vld4q_u16 (const uint16_t * __a)
-{
-  union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v8hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
-vld4q_u32 (const uint32_t * __a)
-{
-  union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v4si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__))
-vld4q_p8 (const poly8_t * __a)
-{
-  union { poly8x16x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v16qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
-vld4q_p16 (const poly16_t * __a)
-{
-  union { poly16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4v8hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
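[Editor's usage sketch: vld4 is the four-way analogue, using the wider oi/xi builtin modes seen above. Helper name invented:]

    #include <arm_neon.h>

    /* De-interleave 32 packed RGBA bytes into four 8-byte planes. */
    uint8x8x4_t rgba_split (const uint8_t *rgba)
    {
      return vld4_u8 (rgba);  /* val[0]=R, val[1]=G, val[2]=B, val[3]=A */
    }
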
-__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
-vld4_lane_s8 (const int8_t * __a, int8x8x4_t __b, const int __c)
-{
-  union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
-vld4_lane_s16 (const int16_t * __a, int16x4x4_t __b, const int __c)
-{
-  union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
-vld4_lane_s32 (const int32_t * __a, int32x2x4_t __b, const int __c)
-{
-  union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
-vld4_lane_f32 (const float32_t * __a, float32x2x4_t __b, const int __c)
-{
-  union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_lanev2sf (__a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
-vld4_lane_u8 (const uint8_t * __a, uint8x8x4_t __b, const int __c)
-{
-  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
-vld4_lane_u16 (const uint16_t * __a, uint16x4x4_t __b, const int __c)
-{
-  union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
-vld4_lane_u32 (const uint32_t * __a, uint32x2x4_t __b, const int __c)
-{
-  union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
-vld4_lane_p8 (const poly8_t * __a, poly8x8x4_t __b, const int __c)
-{
-  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
-vld4_lane_p16 (const poly16_t * __a, poly16x4x4_t __b, const int __c)
-{
-  union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
-vld4q_lane_s16 (const int16_t * __a, int16x8x4_t __b, const int __c)
-{
-  union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
-vld4q_lane_s32 (const int32_t * __a, int32x4x4_t __b, const int __c)
-{
-  union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
-vld4q_lane_f32 (const float32_t * __a, float32x4x4_t __b, const int __c)
-{
-  union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_lanev4sf (__a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
-vld4q_lane_u16 (const uint16_t * __a, uint16x8x4_t __b, const int __c)
-{
-  union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
-vld4q_lane_u32 (const uint32_t * __a, uint32x4x4_t __b, const int __c)
-{
-  union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
-vld4q_lane_p16 (const poly16_t * __a, poly16x8x4_t __b, const int __c)
-{
-  union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  union { poly16x8x4_t __i; __builtin_neon_xi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
-  return __rv.__i;
-}
-
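[Editor's note: as with vld3, the vld4q _lane variants above cover only 16- and 32-bit element types (there is no vld4q_lane_s8), which appears to mirror the limits of the single-lane VLD4 encoding. Sketch, name invented:]

    #include <arm_neon.h>

    /* Load four uint16_t values into lane 5 of each vector of v;
       lanes 0-4 and 6-7 come from the incoming value. */
    uint16x8x4_t patch_q_lane (const uint16_t *p, uint16x8x4_t v)
    {
      return vld4q_lane_u16 (p, v, 5);
    }
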
-__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
-vld4_dup_s8 (const int8_t * __a)
-{
-  union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_dupv8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
-vld4_dup_s16 (const int16_t * __a)
-{
-  union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_dupv4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
-vld4_dup_s32 (const int32_t * __a)
-{
-  union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_dupv2si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
-vld4_dup_f32 (const float32_t * __a)
-{
-  union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_dupv2sf (__a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
-vld4_dup_u8 (const uint8_t * __a)
-{
-  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_dupv8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
-vld4_dup_u16 (const uint16_t * __a)
-{
-  union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_dupv4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
-vld4_dup_u32 (const uint32_t * __a)
-{
-  union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_dupv2si ((const __builtin_neon_si *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
-vld4_dup_p8 (const poly8_t * __a)
-{
-  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_dupv8qi ((const __builtin_neon_qi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
-vld4_dup_p16 (const poly16_t * __a)
-{
-  union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_dupv4hi ((const __builtin_neon_hi *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
-vld4_dup_s64 (const int64_t * __a)
-{
-  union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_dupdi ((const __builtin_neon_di *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
-vld4_dup_u64 (const uint64_t * __a)
-{
-  union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv;
-  __rv.__o = __builtin_neon_vld4_dupdi ((const __builtin_neon_di *) __a);
-  return __rv.__i;
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_s8 (int8_t * __a, int8x8x4_t __b)
-{
-  union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_s16 (int16_t * __a, int16x4x4_t __b)
-{
-  union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_s32 (int32_t * __a, int32x2x4_t __b)
-{
-  union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4v2si ((__builtin_neon_si *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_f32 (float32_t * __a, float32x2x4_t __b)
-{
-  union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4v2sf (__a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_u8 (uint8_t * __a, uint8x8x4_t __b)
-{
-  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_u16 (uint16_t * __a, uint16x4x4_t __b)
-{
-  union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_u32 (uint32_t * __a, uint32x2x4_t __b)
-{
-  union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4v2si ((__builtin_neon_si *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_p8 (poly8_t * __a, poly8x8x4_t __b)
-{
-  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_p16 (poly16_t * __a, poly16x4x4_t __b)
-{
-  union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_s64 (int64_t * __a, int64x1x4_t __b)
-{
-  union { int64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4di ((__builtin_neon_di *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_u64 (uint64_t * __a, uint64x1x4_t __b)
-{
-  union { uint64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4di ((__builtin_neon_di *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_s8 (int8_t * __a, int8x16x4_t __b)
-{
-  union { int8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_s16 (int16_t * __a, int16x8x4_t __b)
-{
-  union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_s32 (int32_t * __a, int32x4x4_t __b)
-{
-  union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  __builtin_neon_vst4v4si ((__builtin_neon_si *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_f32 (float32_t * __a, float32x4x4_t __b)
-{
-  union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  __builtin_neon_vst4v4sf (__a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_u8 (uint8_t * __a, uint8x16x4_t __b)
-{
-  union { uint8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_u16 (uint16_t * __a, uint16x8x4_t __b)
-{
-  union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_u32 (uint32_t * __a, uint32x4x4_t __b)
-{
-  union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  __builtin_neon_vst4v4si ((__builtin_neon_si *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_p8 (poly8_t * __a, poly8x16x4_t __b)
-{
-  union { poly8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_p16 (poly16_t * __a, poly16x8x4_t __b)
-{
-  union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
-}
-
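[Editor's usage sketch: vst4 interleaves four planes on store, the inverse of vld4. Name invented:]

    #include <arm_neon.h>

    /* Pack four planes into interleaved RGBA (32 bytes). */
    void rgba_merge (uint8_t *rgba, uint8x8x4_t v)
    {
      vst4_u8 (rgba, v);
    }
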
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_lane_s8 (int8_t * __a, int8x8x4_t __b, const int __c)
-{
-  union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_lane_s16 (int16_t * __a, int16x4x4_t __b, const int __c)
-{
-  union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_lane_s32 (int32_t * __a, int32x2x4_t __b, const int __c)
-{
-  union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_lane_f32 (float32_t * __a, float32x2x4_t __b, const int __c)
-{
-  union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4_lanev2sf (__a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_lane_u8 (uint8_t * __a, uint8x8x4_t __b, const int __c)
-{
-  union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_lane_u16 (uint16_t * __a, uint16x4x4_t __b, const int __c)
-{
-  union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_lane_u32 (uint32_t * __a, uint32x2x4_t __b, const int __c)
-{
-  union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_lane_p8 (poly8_t * __a, poly8x8x4_t __b, const int __c)
-{
-  union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4_lane_p16 (poly16_t * __a, poly16x4x4_t __b, const int __c)
-{
-  union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
-  __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_lane_s16 (int16_t * __a, int16x8x4_t __b, const int __c)
-{
-  union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_lane_s32 (int32_t * __a, int32x4x4_t __b, const int __c)
-{
-  union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  __builtin_neon_vst4_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_lane_f32 (float32_t * __a, float32x4x4_t __b, const int __c)
-{
-  union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  __builtin_neon_vst4_lanev4sf (__a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_lane_u16 (uint16_t * __a, uint16x8x4_t __b, const int __c)
-{
-  union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_lane_u32 (uint32_t * __a, uint32x4x4_t __b, const int __c)
-{
-  union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  __builtin_neon_vst4_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline void __attribute__ ((__always_inline__))
-vst4q_lane_p16 (poly16_t * __a, poly16x8x4_t __b, const int __c)
-{
-  union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
-  __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vand_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vandv8qi (__a, __b, 1);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vand_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vandv4hi (__a, __b, 1);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vand_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vandv2si (__a, __b, 1);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vand_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x1_t)__builtin_neon_vanddi (__a, __b, 1);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vand_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vandv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vand_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vandv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vand_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vandv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vand_u64 (uint64x1_t __a, uint64x1_t __b)
-{
-  return (uint64x1_t)__builtin_neon_vanddi ((int64x1_t) __a, (int64x1_t) __b, 0);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vandq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vandv16qi (__a, __b, 1);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vandq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vandv8hi (__a, __b, 1);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vandq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vandv4si (__a, __b, 1);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vandq_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vandv2di (__a, __b, 1);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vandq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vandv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vandq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vandv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vandq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vandv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vandq_u64 (uint64x2_t __a, uint64x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vandv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
-}
-
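[Editor's note: in the bitwise families, the trailing constant on the builtin (1 for signed variants, 0 for unsigned, as visible above) only tells the compiler the element signedness; the emitted instruction is the same bitwise op either way. Sketch, name invented:]

    #include <arm_neon.h>

    /* Mask off the high nibble of each byte; compiles to one VAND. */
    uint8x8_t low_nibbles (uint8x8_t v)
    {
      return vand_u8 (v, vdup_n_u8 (0x0f));
    }
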
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vorr_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vorrv8qi (__a, __b, 1);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vorr_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vorrv4hi (__a, __b, 1);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vorr_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vorrv2si (__a, __b, 1);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vorr_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x1_t)__builtin_neon_vorrdi (__a, __b, 1);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vorr_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vorrv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vorr_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vorrv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vorr_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vorrv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vorr_u64 (uint64x1_t __a, uint64x1_t __b)
-{
-  return (uint64x1_t)__builtin_neon_vorrdi ((int64x1_t) __a, (int64x1_t) __b, 0);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vorrq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vorrv16qi (__a, __b, 1);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vorrq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vorrv8hi (__a, __b, 1);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vorrq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vorrv4si (__a, __b, 1);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vorrq_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vorrv2di (__a, __b, 1);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vorrq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vorrv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vorrq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vorrv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vorrq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vorrv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vorrq_u64 (uint64x2_t __a, uint64x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vorrv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-veor_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_veorv8qi (__a, __b, 1);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-veor_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_veorv4hi (__a, __b, 1);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-veor_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_veorv2si (__a, __b, 1);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-veor_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x1_t)__builtin_neon_veordi (__a, __b, 1);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-veor_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_veorv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-veor_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_veorv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-veor_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_veorv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-veor_u64 (uint64x1_t __a, uint64x1_t __b)
-{
-  return (uint64x1_t)__builtin_neon_veordi ((int64x1_t) __a, (int64x1_t) __b, 0);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-veorq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_veorv16qi (__a, __b, 1);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-veorq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_veorv8hi (__a, __b, 1);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-veorq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_veorv4si (__a, __b, 1);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-veorq_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_veorv2di (__a, __b, 1);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-veorq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_veorv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-veorq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_veorv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-veorq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_veorv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-veorq_u64 (uint64x2_t __a, uint64x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_veorv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vbic_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vbicv8qi (__a, __b, 1);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vbic_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vbicv4hi (__a, __b, 1);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vbic_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vbicv2si (__a, __b, 1);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vbic_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x1_t)__builtin_neon_vbicdi (__a, __b, 1);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vbic_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vbicv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vbic_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vbicv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vbic_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vbicv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vbic_u64 (uint64x1_t __a, uint64x1_t __b)
-{
-  return (uint64x1_t)__builtin_neon_vbicdi ((int64x1_t) __a, (int64x1_t) __b, 0);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vbicq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vbicv16qi (__a, __b, 1);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vbicq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vbicv8hi (__a, __b, 1);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vbicq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vbicv4si (__a, __b, 1);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vbicq_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vbicv2di (__a, __b, 1);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vbicq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vbicv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vbicq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vbicv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vbicq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vbicv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vbicq_u64 (uint64x2_t __a, uint64x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vbicv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vorn_s8 (int8x8_t __a, int8x8_t __b)
-{
-  return (int8x8_t)__builtin_neon_vornv8qi (__a, __b, 1);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vorn_s16 (int16x4_t __a, int16x4_t __b)
-{
-  return (int16x4_t)__builtin_neon_vornv4hi (__a, __b, 1);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vorn_s32 (int32x2_t __a, int32x2_t __b)
-{
-  return (int32x2_t)__builtin_neon_vornv2si (__a, __b, 1);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vorn_s64 (int64x1_t __a, int64x1_t __b)
-{
-  return (int64x1_t)__builtin_neon_vorndi (__a, __b, 1);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vorn_u8 (uint8x8_t __a, uint8x8_t __b)
-{
-  return (uint8x8_t)__builtin_neon_vornv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vorn_u16 (uint16x4_t __a, uint16x4_t __b)
-{
-  return (uint16x4_t)__builtin_neon_vornv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vorn_u32 (uint32x2_t __a, uint32x2_t __b)
-{
-  return (uint32x2_t)__builtin_neon_vornv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vorn_u64 (uint64x1_t __a, uint64x1_t __b)
-{
-  return (uint64x1_t)__builtin_neon_vorndi ((int64x1_t) __a, (int64x1_t) __b, 0);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vornq_s8 (int8x16_t __a, int8x16_t __b)
-{
-  return (int8x16_t)__builtin_neon_vornv16qi (__a, __b, 1);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vornq_s16 (int16x8_t __a, int16x8_t __b)
-{
-  return (int16x8_t)__builtin_neon_vornv8hi (__a, __b, 1);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vornq_s32 (int32x4_t __a, int32x4_t __b)
-{
-  return (int32x4_t)__builtin_neon_vornv4si (__a, __b, 1);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vornq_s64 (int64x2_t __a, int64x2_t __b)
-{
-  return (int64x2_t)__builtin_neon_vornv2di (__a, __b, 1);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vornq_u8 (uint8x16_t __a, uint8x16_t __b)
-{
-  return (uint8x16_t)__builtin_neon_vornv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vornq_u16 (uint16x8_t __a, uint16x8_t __b)
-{
-  return (uint16x8_t)__builtin_neon_vornv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vornq_u32 (uint32x4_t __a, uint32x4_t __b)
-{
-  return (uint32x4_t)__builtin_neon_vornv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vornq_u64 (uint64x2_t __a, uint64x2_t __b)
-{
-  return (uint64x2_t)__builtin_neon_vornv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
-}
-
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vreinterpret_p8_s8 (int8x8_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv8qi (__a);
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vreinterpret_p8_s16 (int16x4_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vreinterpret_p8_s32 (int32x2_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a);
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vreinterpret_p8_s64 (int64x1_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vreinterpretv8qidi (__a);
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vreinterpret_p8_f32 (float32x2_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vreinterpret_p8_u8 (uint8x8_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vreinterpret_p8_u16 (uint16x4_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vreinterpret_p8_u32 (uint32x2_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv2si ((int32x2_t) __a);
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vreinterpret_p8_u64 (uint64x1_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vreinterpretv8qidi ((int64x1_t) __a);
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vreinterpret_p8_p16 (poly16x4_t __a)
-{
-  return (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_p8_s8 (int8x16_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv16qi (__a);
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_p8_s16 (int16x8_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_p8_s32 (int32x4_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a);
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_p8_s64 (int64x2_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a);
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_p8_f32 (float32x4_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_p8_u8 (uint8x16_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_p8_u16 (uint16x8_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_p8_u32 (uint32x4_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4si ((int32x4_t) __a);
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_p8_u64 (uint64x2_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv2di ((int64x2_t) __a);
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_p8_p16 (poly16x8_t __a)
-{
-  return (poly8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vreinterpret_p16_s8 (int8x8_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vreinterpret_p16_s16 (int16x4_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv4hi (__a);
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vreinterpret_p16_s32 (int32x2_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a);
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vreinterpret_p16_s64 (int64x1_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vreinterpretv4hidi (__a);
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vreinterpret_p16_f32 (float32x2_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vreinterpret_p16_u8 (uint8x8_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vreinterpret_p16_u16 (uint16x4_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vreinterpret_p16_u32 (uint32x2_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2si ((int32x2_t) __a);
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vreinterpret_p16_u64 (uint64x1_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vreinterpretv4hidi ((int64x1_t) __a);
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vreinterpret_p16_p8 (poly8x8_t __a)
-{
-  return (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_p16_s8 (int8x16_t __a)
-{
-  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_p16_s16 (int16x8_t __a)
-{
-  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv8hi (__a);
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_p16_s32 (int32x4_t __a)
-{
-  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a);
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_p16_s64 (int64x2_t __a)
-{
-  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a);
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_p16_f32 (float32x4_t __a)
-{
-  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_p16_u8 (uint8x16_t __a)
-{
-  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_p16_u16 (uint16x8_t __a)
-{
-  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_p16_u32 (uint32x4_t __a)
-{
-  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a);
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_p16_u64 (uint64x2_t __a)
-{
-  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv2di ((int64x2_t) __a);
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_p16_p8 (poly8x16_t __a)
-{
-  return (poly16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vreinterpret_f32_s8 (int8x8_t __a)
-{
-  return (float32x2_t)__builtin_neon_vreinterpretv2sfv8qi (__a);
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vreinterpret_f32_s16 (int16x4_t __a)
-{
-  return (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi (__a);
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vreinterpret_f32_s32 (int32x2_t __a)
-{
-  return (float32x2_t)__builtin_neon_vreinterpretv2sfv2si (__a);
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vreinterpret_f32_s64 (int64x1_t __a)
-{
-  return (float32x2_t)__builtin_neon_vreinterpretv2sfdi (__a);
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vreinterpret_f32_u8 (uint8x8_t __a)
-{
-  return (float32x2_t)__builtin_neon_vreinterpretv2sfv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vreinterpret_f32_u16 (uint16x4_t __a)
-{
-  return (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vreinterpret_f32_u32 (uint32x2_t __a)
-{
-  return (float32x2_t)__builtin_neon_vreinterpretv2sfv2si ((int32x2_t) __a);
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vreinterpret_f32_u64 (uint64x1_t __a)
-{
-  return (float32x2_t)__builtin_neon_vreinterpretv2sfdi ((int64x1_t) __a);
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vreinterpret_f32_p8 (poly8x8_t __a)
-{
-  return (float32x2_t)__builtin_neon_vreinterpretv2sfv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vreinterpret_f32_p16 (poly16x4_t __a)
-{
-  return (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_f32_s8 (int8x16_t __a)
-{
-  return (float32x4_t)__builtin_neon_vreinterpretv4sfv16qi (__a);
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_f32_s16 (int16x8_t __a)
-{
-  return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi (__a);
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_f32_s32 (int32x4_t __a)
-{
-  return (float32x4_t)__builtin_neon_vreinterpretv4sfv4si (__a);
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_f32_s64 (int64x2_t __a)
-{
-  return (float32x4_t)__builtin_neon_vreinterpretv4sfv2di (__a);
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_f32_u8 (uint8x16_t __a)
-{
-  return (float32x4_t)__builtin_neon_vreinterpretv4sfv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_f32_u16 (uint16x8_t __a)
-{
-  return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_f32_u32 (uint32x4_t __a)
-{
-  return (float32x4_t)__builtin_neon_vreinterpretv4sfv4si ((int32x4_t) __a);
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_f32_u64 (uint64x2_t __a)
-{
-  return (float32x4_t)__builtin_neon_vreinterpretv4sfv2di ((int64x2_t) __a);
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_f32_p8 (poly8x16_t __a)
-{
-  return (float32x4_t)__builtin_neon_vreinterpretv4sfv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_f32_p16 (poly16x8_t __a)
-{
-  return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vreinterpret_s64_s8 (int8x8_t __a)
-{
-  return (int64x1_t)__builtin_neon_vreinterpretdiv8qi (__a);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vreinterpret_s64_s16 (int16x4_t __a)
-{
-  return (int64x1_t)__builtin_neon_vreinterpretdiv4hi (__a);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vreinterpret_s64_s32 (int32x2_t __a)
-{
-  return (int64x1_t)__builtin_neon_vreinterpretdiv2si (__a);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vreinterpret_s64_f32 (float32x2_t __a)
-{
-  return (int64x1_t)__builtin_neon_vreinterpretdiv2sf (__a);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vreinterpret_s64_u8 (uint8x8_t __a)
-{
-  return (int64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vreinterpret_s64_u16 (uint16x4_t __a)
-{
-  return (int64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vreinterpret_s64_u32 (uint32x2_t __a)
-{
-  return (int64x1_t)__builtin_neon_vreinterpretdiv2si ((int32x2_t) __a);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vreinterpret_s64_u64 (uint64x1_t __a)
-{
-  return (int64x1_t)__builtin_neon_vreinterpretdidi ((int64x1_t) __a);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vreinterpret_s64_p8 (poly8x8_t __a)
-{
-  return (int64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
-vreinterpret_s64_p16 (poly16x4_t __a)
-{
-  return (int64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_s64_s8 (int8x16_t __a)
-{
-  return (int64x2_t)__builtin_neon_vreinterpretv2div16qi (__a);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_s64_s16 (int16x8_t __a)
-{
-  return (int64x2_t)__builtin_neon_vreinterpretv2div8hi (__a);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_s64_s32 (int32x4_t __a)
-{
-  return (int64x2_t)__builtin_neon_vreinterpretv2div4si (__a);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_s64_f32 (float32x4_t __a)
-{
-  return (int64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_s64_u8 (uint8x16_t __a)
-{
-  return (int64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_s64_u16 (uint16x8_t __a)
-{
-  return (int64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_s64_u32 (uint32x4_t __a)
-{
-  return (int64x2_t)__builtin_neon_vreinterpretv2div4si ((int32x4_t) __a);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_s64_u64 (uint64x2_t __a)
-{
-  return (int64x2_t)__builtin_neon_vreinterpretv2div2di ((int64x2_t) __a);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_s64_p8 (poly8x16_t __a)
-{
-  return (int64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_s64_p16 (poly16x8_t __a)
-{
-  return (int64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vreinterpret_u64_s8 (int8x8_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vreinterpretdiv8qi (__a);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vreinterpret_u64_s16 (int16x4_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vreinterpretdiv4hi (__a);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vreinterpret_u64_s32 (int32x2_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vreinterpretdiv2si (__a);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vreinterpret_u64_s64 (int64x1_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vreinterpretdidi (__a);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vreinterpret_u64_f32 (float32x2_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vreinterpretdiv2sf (__a);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vreinterpret_u64_u8 (uint8x8_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vreinterpret_u64_u16 (uint16x4_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vreinterpret_u64_u32 (uint32x2_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vreinterpretdiv2si ((int32x2_t) __a);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vreinterpret_u64_p8 (poly8x8_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
-vreinterpret_u64_p16 (poly16x4_t __a)
-{
-  return (uint64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_u64_s8 (int8x16_t __a)
-{
-  return (uint64x2_t)__builtin_neon_vreinterpretv2div16qi (__a);
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_u64_s16 (int16x8_t __a)
-{
-  return (uint64x2_t)__builtin_neon_vreinterpretv2div8hi (__a);
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_u64_s32 (int32x4_t __a)
-{
-  return (uint64x2_t)__builtin_neon_vreinterpretv2div4si (__a);
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_u64_s64 (int64x2_t __a)
-{
-  return (uint64x2_t)__builtin_neon_vreinterpretv2div2di (__a);
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_u64_f32 (float32x4_t __a)
-{
-  return (uint64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_u64_u8 (uint8x16_t __a)
-{
-  return (uint64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_u64_u16 (uint16x8_t __a)
-{
-  return (uint64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_u64_u32 (uint32x4_t __a)
-{
-  return (uint64x2_t)__builtin_neon_vreinterpretv2div4si ((int32x4_t) __a);
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_u64_p8 (poly8x16_t __a)
-{
-  return (uint64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vreinterpretq_u64_p16 (poly16x8_t __a)
-{
-  return (uint64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vreinterpret_s8_s16 (int16x4_t __a)
-{
-  return (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vreinterpret_s8_s32 (int32x2_t __a)
-{
-  return (int8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vreinterpret_s8_s64 (int64x1_t __a)
-{
-  return (int8x8_t)__builtin_neon_vreinterpretv8qidi (__a);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vreinterpret_s8_f32 (float32x2_t __a)
-{
-  return (int8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vreinterpret_s8_u8 (uint8x8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vreinterpret_s8_u16 (uint16x4_t __a)
-{
-  return (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vreinterpret_s8_u32 (uint32x2_t __a)
-{
-  return (int8x8_t)__builtin_neon_vreinterpretv8qiv2si ((int32x2_t) __a);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vreinterpret_s8_u64 (uint64x1_t __a)
-{
-  return (int8x8_t)__builtin_neon_vreinterpretv8qidi ((int64x1_t) __a);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vreinterpret_s8_p8 (poly8x8_t __a)
-{
-  return (int8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vreinterpret_s8_p16 (poly16x4_t __a)
-{
-  return (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_s8_s16 (int16x8_t __a)
-{
-  return (int8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_s8_s32 (int32x4_t __a)
-{
-  return (int8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_s8_s64 (int64x2_t __a)
-{
-  return (int8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_s8_f32 (float32x4_t __a)
-{
-  return (int8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_s8_u8 (uint8x16_t __a)
-{
-  return (int8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_s8_u16 (uint16x8_t __a)
-{
-  return (int8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_s8_u32 (uint32x4_t __a)
-{
-  return (int8x16_t)__builtin_neon_vreinterpretv16qiv4si ((int32x4_t) __a);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_s8_u64 (uint64x2_t __a)
-{
-  return (int8x16_t)__builtin_neon_vreinterpretv16qiv2di ((int64x2_t) __a);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_s8_p8 (poly8x16_t __a)
-{
-  return (int8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_s8_p16 (poly16x8_t __a)
-{
-  return (int8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vreinterpret_s16_s8 (int8x8_t __a)
-{
-  return (int16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vreinterpret_s16_s32 (int32x2_t __a)
-{
-  return (int16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vreinterpret_s16_s64 (int64x1_t __a)
-{
-  return (int16x4_t)__builtin_neon_vreinterpretv4hidi (__a);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vreinterpret_s16_f32 (float32x2_t __a)
-{
-  return (int16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vreinterpret_s16_u8 (uint8x8_t __a)
-{
-  return (int16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vreinterpret_s16_u16 (uint16x4_t __a)
-{
-  return (int16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vreinterpret_s16_u32 (uint32x2_t __a)
-{
-  return (int16x4_t)__builtin_neon_vreinterpretv4hiv2si ((int32x2_t) __a);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vreinterpret_s16_u64 (uint64x1_t __a)
-{
-  return (int16x4_t)__builtin_neon_vreinterpretv4hidi ((int64x1_t) __a);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vreinterpret_s16_p8 (poly8x8_t __a)
-{
-  return (int16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vreinterpret_s16_p16 (poly16x4_t __a)
-{
-  return (int16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_s16_s8 (int8x16_t __a)
-{
-  return (int16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_s16_s32 (int32x4_t __a)
-{
-  return (int16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_s16_s64 (int64x2_t __a)
-{
-  return (int16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_s16_f32 (float32x4_t __a)
-{
-  return (int16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_s16_u8 (uint8x16_t __a)
-{
-  return (int16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_s16_u16 (uint16x8_t __a)
-{
-  return (int16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_s16_u32 (uint32x4_t __a)
-{
-  return (int16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_s16_u64 (uint64x2_t __a)
-{
-  return (int16x8_t)__builtin_neon_vreinterpretv8hiv2di ((int64x2_t) __a);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_s16_p8 (poly8x16_t __a)
-{
-  return (int16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_s16_p16 (poly16x8_t __a)
-{
-  return (int16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vreinterpret_s32_s8 (int8x8_t __a)
-{
-  return (int32x2_t)__builtin_neon_vreinterpretv2siv8qi (__a);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vreinterpret_s32_s16 (int16x4_t __a)
-{
-  return (int32x2_t)__builtin_neon_vreinterpretv2siv4hi (__a);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vreinterpret_s32_s64 (int64x1_t __a)
-{
-  return (int32x2_t)__builtin_neon_vreinterpretv2sidi (__a);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vreinterpret_s32_f32 (float32x2_t __a)
-{
-  return (int32x2_t)__builtin_neon_vreinterpretv2siv2sf (__a);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vreinterpret_s32_u8 (uint8x8_t __a)
-{
-  return (int32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vreinterpret_s32_u16 (uint16x4_t __a)
-{
-  return (int32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vreinterpret_s32_u32 (uint32x2_t __a)
-{
-  return (int32x2_t)__builtin_neon_vreinterpretv2siv2si ((int32x2_t) __a);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vreinterpret_s32_u64 (uint64x1_t __a)
-{
-  return (int32x2_t)__builtin_neon_vreinterpretv2sidi ((int64x1_t) __a);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vreinterpret_s32_p8 (poly8x8_t __a)
-{
-  return (int32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vreinterpret_s32_p16 (poly16x4_t __a)
-{
-  return (int32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_s32_s8 (int8x16_t __a)
-{
-  return (int32x4_t)__builtin_neon_vreinterpretv4siv16qi (__a);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_s32_s16 (int16x8_t __a)
-{
-  return (int32x4_t)__builtin_neon_vreinterpretv4siv8hi (__a);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_s32_s64 (int64x2_t __a)
-{
-  return (int32x4_t)__builtin_neon_vreinterpretv4siv2di (__a);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_s32_f32 (float32x4_t __a)
-{
-  return (int32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_s32_u8 (uint8x16_t __a)
-{
-  return (int32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_s32_u16 (uint16x8_t __a)
-{
-  return (int32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_s32_u32 (uint32x4_t __a)
-{
-  return (int32x4_t)__builtin_neon_vreinterpretv4siv4si ((int32x4_t) __a);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_s32_u64 (uint64x2_t __a)
-{
-  return (int32x4_t)__builtin_neon_vreinterpretv4siv2di ((int64x2_t) __a);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_s32_p8 (poly8x16_t __a)
-{
-  return (int32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_s32_p16 (poly16x8_t __a)
-{
-  return (int32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vreinterpret_u8_s8 (int8x8_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv8qi (__a);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vreinterpret_u8_s16 (int16x4_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vreinterpret_u8_s32 (int32x2_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vreinterpret_u8_s64 (int64x1_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vreinterpretv8qidi (__a);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vreinterpret_u8_f32 (float32x2_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vreinterpret_u8_u16 (uint16x4_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vreinterpret_u8_u32 (uint32x2_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv2si ((int32x2_t) __a);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vreinterpret_u8_u64 (uint64x1_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vreinterpretv8qidi ((int64x1_t) __a);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vreinterpret_u8_p8 (poly8x8_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vreinterpret_u8_p16 (poly16x4_t __a)
-{
-  return (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_u8_s8 (int8x16_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv16qi (__a);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_u8_s16 (int16x8_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_u8_s32 (int32x4_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_u8_s64 (int64x2_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_u8_f32 (float32x4_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_u8_u16 (uint16x8_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_u8_u32 (uint32x4_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4si ((int32x4_t) __a);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_u8_u64 (uint64x2_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv2di ((int64x2_t) __a);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_u8_p8 (poly8x16_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vreinterpretq_u8_p16 (poly16x8_t __a)
-{
-  return (uint8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vreinterpret_u16_s8 (int8x8_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vreinterpret_u16_s16 (int16x4_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv4hi (__a);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vreinterpret_u16_s32 (int32x2_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vreinterpret_u16_s64 (int64x1_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vreinterpretv4hidi (__a);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vreinterpret_u16_f32 (float32x2_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vreinterpret_u16_u8 (uint8x8_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vreinterpret_u16_u32 (uint32x2_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv2si ((int32x2_t) __a);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vreinterpret_u16_u64 (uint64x1_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vreinterpretv4hidi ((int64x1_t) __a);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vreinterpret_u16_p8 (poly8x8_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vreinterpret_u16_p16 (poly16x4_t __a)
-{
-  return (uint16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_u16_s8 (int8x16_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_u16_s16 (int16x8_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv8hi (__a);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_u16_s32 (int32x4_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_u16_s64 (int64x2_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_u16_f32 (float32x4_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_u16_u8 (uint8x16_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_u16_u32 (uint32x4_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_u16_u64 (uint64x2_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv2di ((int64x2_t) __a);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_u16_p8 (poly8x16_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vreinterpretq_u16_p16 (poly16x8_t __a)
-{
-  return (uint16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vreinterpret_u32_s8 (int8x8_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vreinterpretv2siv8qi (__a);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vreinterpret_u32_s16 (int16x4_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi (__a);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vreinterpret_u32_s32 (int32x2_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vreinterpretv2siv2si (__a);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vreinterpret_u32_s64 (int64x1_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vreinterpretv2sidi (__a);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vreinterpret_u32_f32 (float32x2_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vreinterpretv2siv2sf (__a);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vreinterpret_u32_u8 (uint8x8_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vreinterpret_u32_u16 (uint16x4_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vreinterpret_u32_u64 (uint64x1_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vreinterpretv2sidi ((int64x1_t) __a);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vreinterpret_u32_p8 (poly8x8_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a);
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vreinterpret_u32_p16 (poly16x4_t __a)
-{
-  return (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_u32_s8 (int8x16_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vreinterpretv4siv16qi (__a);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_u32_s16 (int16x8_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vreinterpretv4siv8hi (__a);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_u32_s32 (int32x4_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vreinterpretv4siv4si (__a);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_u32_s64 (int64x2_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vreinterpretv4siv2di (__a);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_u32_f32 (float32x4_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_u32_u8 (uint8x16_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_u32_u16 (uint16x8_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_u32_u64 (uint64x2_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vreinterpretv4siv2di ((int64x2_t) __a);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_u32_p8 (poly8x16_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a);
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vreinterpretq_u32_p16 (poly16x8_t __a)
-{
-  return (uint32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a);
-}
+#define vreinterpretq_u32_p16(__a) \
+  ((uint32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) (__a)))
 
 #ifdef __cplusplus
 }
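
The deleted vreinterpret definitions above all follow one mechanical pattern:
cast the argument to the signed vector type the builtin expects, call the
corresponding __builtin_neon_vreinterpret* builtin, and cast the result back.
No bits change; only the element type does.  A rough host-side analogue of
that behavior (a sketch for illustration only -- the function name is
invented, and real code on an ARM target would use the arm_neon.h
intrinsics):

    #include <cstdint>
    #include <cstring>

    // Reinterpret two packed floats as one 64-bit integer, the way
    // vreinterpret_u64_f32 turns a float32x2_t into a uint64x1_t:
    // the same 64 bits, viewed with a different element type.
    static uint64_t reinterpret_u64_f32x2(const float in[2]) {
      uint64_t out;
      std::memcpy(&out, in, sizeof out);  // bit copy, no value conversion
      return out;
    }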

Added: llvm-gcc-4.2/trunk/gcc/config/arm/llvm-arm.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/llvm-arm.cpp?rev=76819&view=auto

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/llvm-arm.cpp (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/llvm-arm.cpp Wed Jul 22 18:35:22 2009
@@ -0,0 +1,2160 @@
+/* LLVM LOCAL begin (ENTIRE FILE!)  */
+/* High-level LLVM backend interface 
+Copyright (C) 2008, 2009 Apple Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 2, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received a copy of the GNU General Public License
+along with GCC; see the file COPYING.  If not, write to the Free
+Software Foundation, 59 Temple Place - Suite 330, Boston, MA
+02111-1307, USA.  */
+
+//===----------------------------------------------------------------------===//
+// This is a C++ source file that implements the ARM-specific LLVM ABI.
+//===----------------------------------------------------------------------===//
+
+#include "llvm-abi.h"
+#include "llvm-internal.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+
+extern "C" {
+#include "insn-codes.h"
+#include "toplev.h"
+#include "rtl.h"
+#include "insn-config.h"
+#include "recog.h"
+
+/* Placeholder definition so the prototype below can name neon_itype
+   without pulling in the full definition from the ARM backend.  */
+enum neon_itype { neon_itype_dummy };
+extern enum insn_code locate_neon_builtin_icode
+  (int fcode, neon_itype *itype, enum neon_builtins *neon_code);
+}
+
+/// UnexpectedError - Report errors about unexpected uses of builtins.  The
+/// msg argument should begin with a "%H" so that the location of the
+/// expression is printed in the error message.
+static bool UnexpectedError(const char *msg, tree exp, Value *&Result) {
+  error(msg, &EXPR_LOCATION(exp));
+
+  // Set the Result to an undefined value.
+  const Type *ResTy = ConvertType(TREE_TYPE(exp));
+  if (ResTy->isSingleValueType())
+    Result = getGlobalContext().getUndef(ResTy);
+
+  // Return true, which can be propagated as the return value of
+  // TargetIntrinsicLower, to indicate that no further error message
+  // is needed.
+  return true;
+}
+
+static bool NonImmediateError(tree exp, Value *&Result) {
+  return UnexpectedError("%Hlast builtin argument must be an immediate",
+                         exp, Result);
+}
+
+static bool BadImmediateError(tree exp, Value *&Result) {
+  return UnexpectedError("%Hunexpected immediate argument for builtin",
+                         exp, Result);
+}
+
+static bool BadModeError(tree exp, Value *&Result) {
+  return UnexpectedError("%Hunexpected mode for builtin argument",
+                         exp, Result);
+}
+
+enum neon_datatype {
+  neon_datatype_unspecified,
+  neon_datatype_signed,
+  neon_datatype_unsigned,
+  neon_datatype_float,
+  neon_datatype_polynomial
+};
+
+/// GetBuiltinExtraInfo - Decipher the extra integer immediate argument
+/// used with many of GCC's builtins for NEON to distinguish variants of an
+/// operation.  The following values for that argument are used:
+///   - bit0: For integer types (i.e., bit2 == 0), 0 = unsigned, 1 = signed;
+///           otherwise, 0 = polynomial, 1 = float.
+///   - bit1: The operation rounds its results.
+///   - bit2: 0 = integer datatypes, 1 = floating-point or polynomial.
+///   .
+/// Returns false if the extra argument is not an integer immediate.
+static bool GetBuiltinExtraInfo(const Value *extra_arg,
+                                neon_datatype &datatype, bool &isRounded) {
+  const ConstantInt *arg = dyn_cast<ConstantInt>(extra_arg);
+  if (!arg)
+    return false;
+
+  int argval = arg->getZExtValue();
+  isRounded = ((argval & 2) != 0);
+  if ((argval & 4) == 0) {
+    if ((argval & 1) == 0)
+      datatype = neon_datatype_unsigned;
+    else
+      datatype = neon_datatype_signed;
+  } else {
+    if ((argval & 1) == 0)
+      datatype = neon_datatype_polynomial;
+    else
+      datatype = neon_datatype_float;
+  }
+  return true;
+}
+
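To make the bit encoding above concrete, here is a standalone mirror of the
decoding logic with a few spot checks (illustrative only; the decode/datatype
names are invented and not part of the patch):

    #include <cassert>

    enum datatype { dt_unsigned, dt_signed, dt_float, dt_polynomial };

    static datatype decode(int argval, bool &rounded) {
      rounded = (argval & 2) != 0;           // bit1: rounding variant
      if ((argval & 4) == 0)                 // bit2 clear: integer types
        return (argval & 1) ? dt_signed : dt_unsigned;
      return (argval & 1) ? dt_float : dt_polynomial;  // bit2 set
    }

    int main() {
      bool r;
      assert(decode(0, r) == dt_unsigned && !r);  // unsigned integer
      assert(decode(1, r) == dt_signed   && !r);  // signed integer
      assert(decode(3, r) == dt_signed   &&  r);  // signed, rounding variant
      assert(decode(5, r) == dt_float    && !r);  // float
      return 0;
    }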
+/// BuildConstantSplatVector - Create a ConstantVector with the same value
+/// replicated in each element.
+static Value *BuildConstantSplatVector(unsigned NumElements, ConstantInt *Val) {
+  std::vector<Constant*> CstOps;
+  for (unsigned i = 0; i != NumElements; ++i)
+    CstOps.push_back(Val);
+  return getGlobalContext().getConstantVector(CstOps);
+}
+
+/// BuildDup - Build a splat operation to duplicate a value into every
+/// element of a vector.
+static Value *BuildDup(const Type *ResultType, Value *Val,
+                       LLVMBuilder &Builder) {
+  LLVMContext &Context = getGlobalContext();
+
+  // GCC may promote the scalar argument; cast it back.
+  const VectorType *VTy = dyn_cast<const VectorType>(ResultType);
+  assert(VTy && "expected a vector type");
+  const Type *ElTy = VTy->getElementType();
+  if (Val->getType() != ElTy) {
+    assert(!ElTy->isFloatingPoint() &&
+           "only integer types expected to be promoted");
+    Val = Builder.CreateTrunc(Val, ElTy);
+  }
+
+  // Insert the value into lane 0 of an undef vector.
+  Value *Undef = Context.getUndef(ResultType);
+  Value *Result =
+    Builder.CreateInsertElement(Undef, Val,
+                                Context.getConstantInt(Type::Int32Ty, 0));
+
+  // Use a shuffle to move the value into the other lanes.
+  unsigned NUnits = VTy->getNumElements();
+  if (NUnits > 1) {
+    std::vector<Constant*> Idxs;
+    for (unsigned i = 0; i != NUnits; ++i)
+      Idxs.push_back(Context.getConstantInt(Type::Int32Ty, 0));
+    Result = Builder.CreateShuffleVector(Result, Undef,
+                                         Context.getConstantVector(Idxs));
+  }
+  return Result;
+}
+
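For a <4 x i16> splat, the insert-plus-shuffle sequence above should come out
as IR of roughly this shape (a sketch; the value names are invented):

    // %t     = insertelement <4 x i16> undef, i16 %val, i32 0
    // %splat = shufflevector <4 x i16> %t, <4 x i16> undef,
    //                        <4 x i32> <i32 0, i32 0, i32 0, i32 0>

The all-zero mask selects lane 0 of %t for every result lane, which is how
the one inserted value reaches all the elements.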
+/// BuildDupLane - Build a splat operation to take a value from one element
+/// of a vector and splat it into another vector.
+static Value *BuildDupLane(Value *Vec, unsigned LaneVal, unsigned NUnits,
+                           LLVMBuilder &Builder) {
+  // Translate this to a vector shuffle.
+  std::vector<Constant*> Idxs;
+  LLVMContext &Context = getGlobalContext();
+  for (unsigned i = 0; i != NUnits; ++i)
+    Idxs.push_back(Context.getConstantInt(Type::Int32Ty, LaneVal));
+  return Builder.CreateShuffleVector(Vec, Context.getUndef(Vec->getType()),
+                                     Context.getConstantVector(Idxs));
+}
+
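BuildDupLane is the same shuffle trick with a different mask: every mask
element is the requested lane instead of zero.  Splatting lane 2 of a
four-element vector, for example, should give (sketch only):

    // %dup = shufflevector <4 x i16> %vec, <4 x i16> undef,
    //                      <4 x i32> <i32 2, i32 2, i32 2, i32 2>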
+// NEON vector shift counts must be in the range 0..ElemBits-1 for left shifts
+// or 1..ElemBits for right shifts.  For narrowing shifts, compare against the
+// destination element size.  For widening shifts, the upper bound can be
+// equal to the element size.  Define separate functions to check these
+// constraints, so that the rest of the code for handling vector shift counts
+// can be shared.
+
+typedef bool (*ShiftCountChecker)(int Cnt, int ElemBits);
+
+static bool CheckLeftShiftCount(int Cnt, int ElemBits) {
+  return (Cnt >= 0 && Cnt < ElemBits);
+}
+
+static bool CheckLongLeftShiftCount(int Cnt, int ElemBits) {
+  return (Cnt >= 0 && Cnt <= ElemBits);
+}
+
+static bool CheckRightShiftCount(int Cnt, int ElemBits) {
+  return (Cnt >= 1 && Cnt <= ElemBits);
+}
+
+static bool CheckNarrowRightShiftCount(int Cnt, int ElemBits) {
+  return (Cnt >= 1 && Cnt <= ElemBits / 2);
+}
+
+/// BuildShiftCountVector - Check that the shift count argument to a constant
+/// shift builtin is a constant in the appropriate range for the shift
+/// operation.  It expands the shift count into a vector, optionally with the
+/// count negated for right shifts.  Returns true on success.
+static bool BuildShiftCountVector(Value *&Op, enum machine_mode Mode,
+                                  ShiftCountChecker CheckCount,
+                                  bool NegateRightShift) {
+  ConstantInt *Cnt = dyn_cast<ConstantInt>(Op);
+  if (!Cnt)
+    return false;
+  int CntVal = Cnt->getSExtValue();
+
+  assert (VECTOR_MODE_P (Mode) && "expected vector mode for shift");
+  unsigned ElemBits = GET_MODE_BITSIZE (GET_MODE_INNER (Mode));
+  if (!CheckCount(CntVal, ElemBits))
+    return false;
+
+  // Right shifts are represented in NEON intrinsics by a negative shift count.
+  LLVMContext &Context = getGlobalContext();
+  Cnt = Context.getConstantInt(Context.getIntegerType(ElemBits),
+                               NegateRightShift ? -CntVal : CntVal);
+  Op = BuildConstantSplatVector(GET_MODE_NUNITS(Mode), Cnt);
+  return true;
+}
+
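A few spot checks of the range rules and the negation convention (values
chosen for illustration):

    // CheckLeftShiftCount(15, 16)       -> true   (left shifts: 0..15)
    // CheckLeftShiftCount(16, 16)       -> false
    // CheckRightShiftCount(16, 16)      -> true   (right shifts: 1..16)
    // CheckNarrowRightShiftCount(9, 16) -> false  (narrowing: 1..8)

With NegateRightShift set, a count of 3 for a right shift of 16-bit elements
becomes the splat vector <-3, -3, -3, -3>, matching the NEON convention that
a negative per-element count encodes a right shift.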
+/// isValidLane - Check if the lane operand for a vector intrinsic is a
+/// ConstantInt in the range 0..NUnits-1.  If pLaneVal is not null, store
+/// the lane value to it.
+static bool isValidLane(Value *LnOp, int NUnits, unsigned *pLaneVal = 0) {
+  ConstantInt *Lane = dyn_cast<ConstantInt>(LnOp);
+  if (!Lane)
+    return false;
+
+  int LaneVal = Lane->getSExtValue();
+  if (LaneVal < 0 || LaneVal >= NUnits)
+    return false;
+
+  if (pLaneVal)
+    *pLaneVal = LaneVal;
+  return true;
+}
+
+/// GetVldstType - Get the vector type of a NEON vector load/store instruction.
+/// For the NEON vector structs used in vldN/vstN instructions (2 <= N <= 4),
+/// GCC treats each struct as a scalar, but LLVM uses wide vector types
+/// that combine all the vectors in a struct.  For example, int8x8x4 (4 int8x8
+/// vectors) is treated as a single vector of 32 i8 elements.  Since none of
+/// the instruction operands identify the vector mode, get the element type
+/// from the pointer type of the first argument and the total size from the
+/// result mode.
+static const VectorType *
+GetVldstType(tree exp, enum machine_mode ResultMode) {
+  tree FnDecl = get_callee_fndecl(exp);
+  tree ArgTy = TREE_VALUE(TYPE_ARG_TYPES(TREE_TYPE(FnDecl)));
+  assert(ArgTy && POINTER_TYPE_P (ArgTy) && "Expected a pointer type!");
+  enum machine_mode ElemMode = TYPE_MODE(TREE_TYPE(ArgTy));
+  // Note: Because of a field size limitation in GCC, the NEON XI mode is
+  // defined as 511 bits instead of 512.  Add one below to adjust for this.
+  unsigned NumElems =
+    (GET_MODE_BITSIZE(ResultMode) + 1) / GET_MODE_BITSIZE(ElemMode);
+  const Type *ElemType = ConvertType(TREE_TYPE(ArgTy));
+  return getGlobalContext().getVectorType(ElemType, NumElems);
+}
+
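A worked example of that computation, with the builtin chosen for
illustration: for a vld4 of 128-bit vectors (int8x16x4), the first argument
has pointer-to-int8 type, so ElemMode is the 8-bit QI mode, and the result
uses the XI mode, nominally 512 bits but defined as 511:

    // NumElems = (511 + 1) / 8 = 64   ->  <64 x i8> for int8x16x4
    // with the 256-bit OI mode instead:
    // NumElems = (256 + 1) / 8 = 32   ->  <32 x i8> for int8x8x4

The "+ 1" only matters for XI; for the other struct modes the truncating
integer division already yields the exact element count.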
+/// TargetIntrinsicLower - To handle builtins, we want to expand the
+/// invocation into normal LLVM code.  If the target can handle the builtin,
+/// this function should emit the expanded code and return true.
+bool TreeToLLVM::TargetIntrinsicLower(tree exp,
+                                      unsigned FnCode,
+                                      const MemRef *DestLoc,
+                                      Value *&Result,
+                                      const Type *ResultType,
+                                      std::vector<Value*> &Ops) {
+  neon_datatype datatype = neon_datatype_unspecified;
+  bool isRounded = false;
+  Intrinsic::ID intID;
+  Function *intFn;
+  const Type* intOpTypes[2];
+
+  if (FnCode < ARM_BUILTIN_NEON_BASE)
+    return false;
+
+  LLVMContext &Context = getGlobalContext();
+
+  neon_builtins neon_code;
+  enum insn_code icode = locate_neon_builtin_icode (FnCode, 0, &neon_code);
+
+  // Read the extra immediate argument to the builtin.
+  switch (neon_code) {
+  default:
+    return false;
+  case NEON_BUILTIN_vpaddl:
+  case NEON_BUILTIN_vneg:
+  case NEON_BUILTIN_vqneg:
+  case NEON_BUILTIN_vabs:
+  case NEON_BUILTIN_vqabs:
+  case NEON_BUILTIN_vcls:
+  case NEON_BUILTIN_vclz:
+  case NEON_BUILTIN_vcnt:
+  case NEON_BUILTIN_vrecpe:
+  case NEON_BUILTIN_vrsqrte:
+  case NEON_BUILTIN_vmvn:
+  case NEON_BUILTIN_vcvt:
+  case NEON_BUILTIN_vmovn:
+  case NEON_BUILTIN_vqmovn:
+  case NEON_BUILTIN_vqmovun:
+  case NEON_BUILTIN_vmovl:
+  case NEON_BUILTIN_vrev64:
+  case NEON_BUILTIN_vrev32:
+  case NEON_BUILTIN_vrev16:
+    if (!GetBuiltinExtraInfo(Ops[1], datatype, isRounded))
+      return NonImmediateError(exp, Result);
+    break;
+  case NEON_BUILTIN_vadd:
+  case NEON_BUILTIN_vaddl:
+  case NEON_BUILTIN_vaddw:
+  case NEON_BUILTIN_vhadd:
+  case NEON_BUILTIN_vqadd:
+  case NEON_BUILTIN_vaddhn:
+  case NEON_BUILTIN_vmul:
+  case NEON_BUILTIN_vqdmulh:
+  case NEON_BUILTIN_vmull:
+  case NEON_BUILTIN_vqdmull:
+  case NEON_BUILTIN_vsub:
+  case NEON_BUILTIN_vsubl:
+  case NEON_BUILTIN_vsubw:
+  case NEON_BUILTIN_vqsub:
+  case NEON_BUILTIN_vhsub:
+  case NEON_BUILTIN_vsubhn:
+  case NEON_BUILTIN_vceq:
+  case NEON_BUILTIN_vcge:
+  case NEON_BUILTIN_vcgt:
+  case NEON_BUILTIN_vcage:
+  case NEON_BUILTIN_vcagt:
+  case NEON_BUILTIN_vtst:
+  case NEON_BUILTIN_vabd:
+  case NEON_BUILTIN_vabdl:
+  case NEON_BUILTIN_vmax:
+  case NEON_BUILTIN_vmin:
+  case NEON_BUILTIN_vpadd:
+  case NEON_BUILTIN_vpadal:
+  case NEON_BUILTIN_vpmax:
+  case NEON_BUILTIN_vpmin:
+  case NEON_BUILTIN_vrecps:
+  case NEON_BUILTIN_vrsqrts:
+  case NEON_BUILTIN_vshl:
+  case NEON_BUILTIN_vqshl:
+  case NEON_BUILTIN_vshr_n:
+  case NEON_BUILTIN_vshrn_n:
+  case NEON_BUILTIN_vqshrn_n:
+  case NEON_BUILTIN_vqshrun_n:
+  case NEON_BUILTIN_vshl_n:
+  case NEON_BUILTIN_vqshl_n:
+  case NEON_BUILTIN_vqshlu_n:
+  case NEON_BUILTIN_vshll_n:
+  case NEON_BUILTIN_vget_lane:
+  case NEON_BUILTIN_vcvt_n:
+  case NEON_BUILTIN_vmul_n:
+  case NEON_BUILTIN_vmull_n:
+  case NEON_BUILTIN_vqdmull_n:
+  case NEON_BUILTIN_vqdmulh_n:
+  case NEON_BUILTIN_vand:
+  case NEON_BUILTIN_vorr:
+  case NEON_BUILTIN_veor:
+  case NEON_BUILTIN_vbic:
+  case NEON_BUILTIN_vorn:
+    if (!GetBuiltinExtraInfo(Ops[2], datatype, isRounded))
+      return NonImmediateError(exp, Result);
+    break;
+  case NEON_BUILTIN_vmla:
+  case NEON_BUILTIN_vmls:
+  case NEON_BUILTIN_vmlal:
+  case NEON_BUILTIN_vmlsl:
+  case NEON_BUILTIN_vqdmlal:
+  case NEON_BUILTIN_vqdmlsl:
+  case NEON_BUILTIN_vaba:
+  case NEON_BUILTIN_vabal:
+  case NEON_BUILTIN_vsra_n:
+  case NEON_BUILTIN_vmul_lane:
+  case NEON_BUILTIN_vmull_lane:
+  case NEON_BUILTIN_vqdmull_lane:
+  case NEON_BUILTIN_vqdmulh_lane:
+  case NEON_BUILTIN_vmla_n:
+  case NEON_BUILTIN_vmlal_n:
+  case NEON_BUILTIN_vqdmlal_n:
+  case NEON_BUILTIN_vmls_n:
+  case NEON_BUILTIN_vmlsl_n:
+  case NEON_BUILTIN_vqdmlsl_n:
+    if (!GetBuiltinExtraInfo(Ops[3], datatype, isRounded))
+      return NonImmediateError(exp, Result);
+    break;
+  case NEON_BUILTIN_vmla_lane:
+  case NEON_BUILTIN_vmlal_lane:
+  case NEON_BUILTIN_vqdmlal_lane:
+  case NEON_BUILTIN_vmls_lane:
+  case NEON_BUILTIN_vmlsl_lane:
+  case NEON_BUILTIN_vqdmlsl_lane:
+    if (!GetBuiltinExtraInfo(Ops[4], datatype, isRounded))
+      return NonImmediateError(exp, Result);
+    break;
+  case NEON_BUILTIN_vsri_n:
+  case NEON_BUILTIN_vsli_n:
+  case NEON_BUILTIN_vset_lane:
+  case NEON_BUILTIN_vcreate:
+  case NEON_BUILTIN_vdup_n:
+  case NEON_BUILTIN_vdup_lane:
+  case NEON_BUILTIN_vcombine:
+  case NEON_BUILTIN_vget_high:
+  case NEON_BUILTIN_vget_low:
+  case NEON_BUILTIN_vtbl1:
+  case NEON_BUILTIN_vtbl2:
+  case NEON_BUILTIN_vtbl3:
+  case NEON_BUILTIN_vtbl4:
+  case NEON_BUILTIN_vtbx1:
+  case NEON_BUILTIN_vtbx2:
+  case NEON_BUILTIN_vtbx3:
+  case NEON_BUILTIN_vtbx4:
+  case NEON_BUILTIN_vext:
+  case NEON_BUILTIN_vbsl:
+  case NEON_BUILTIN_vtrn:
+  case NEON_BUILTIN_vzip:
+  case NEON_BUILTIN_vuzp:
+  case NEON_BUILTIN_vld1:
+  case NEON_BUILTIN_vld2:
+  case NEON_BUILTIN_vld3:
+  case NEON_BUILTIN_vld4:
+  case NEON_BUILTIN_vld1_lane:
+  case NEON_BUILTIN_vld2_lane:
+  case NEON_BUILTIN_vld3_lane:
+  case NEON_BUILTIN_vld4_lane:
+  case NEON_BUILTIN_vld1_dup:
+  case NEON_BUILTIN_vld2_dup:
+  case NEON_BUILTIN_vld3_dup:
+  case NEON_BUILTIN_vld4_dup:
+  case NEON_BUILTIN_vst1:
+  case NEON_BUILTIN_vst2:
+  case NEON_BUILTIN_vst3:
+  case NEON_BUILTIN_vst4:
+  case NEON_BUILTIN_vst1_lane:
+  case NEON_BUILTIN_vst2_lane:
+  case NEON_BUILTIN_vst3_lane:
+  case NEON_BUILTIN_vst4_lane:
+  case NEON_BUILTIN_vreinterpretv8qi:
+  case NEON_BUILTIN_vreinterpretv4hi:
+  case NEON_BUILTIN_vreinterpretv2si:
+  case NEON_BUILTIN_vreinterpretv2sf:
+  case NEON_BUILTIN_vreinterpretdi:
+  case NEON_BUILTIN_vreinterpretv16qi:
+  case NEON_BUILTIN_vreinterpretv8hi:
+  case NEON_BUILTIN_vreinterpretv4si:
+  case NEON_BUILTIN_vreinterpretv4sf:
+  case NEON_BUILTIN_vreinterpretv2di:
+    // No extra argument used here.
+    break;
+  }
+
+  // Check that the isRounded flag is only set when it is supported.
+  if (isRounded) {
+    switch (neon_code) {
+    case NEON_BUILTIN_vhadd:
+    case NEON_BUILTIN_vaddhn:
+    case NEON_BUILTIN_vqdmulh:
+    case NEON_BUILTIN_vsubhn:
+    case NEON_BUILTIN_vshl:
+    case NEON_BUILTIN_vqshl:
+    case NEON_BUILTIN_vshr_n:
+    case NEON_BUILTIN_vshrn_n:
+    case NEON_BUILTIN_vqshrn_n:
+    case NEON_BUILTIN_vqshrun_n:
+    case NEON_BUILTIN_vsra_n:
+    case NEON_BUILTIN_vqdmulh_lane:
+    case NEON_BUILTIN_vqdmulh_n:
+      // These all support a rounded variant.
+      break;
+    default:
+      return BadImmediateError(exp, Result);
+    }
+  }
+
+  // Check for supported vector modes.
+
+  // Set defaults for mode checking.
+  int modeCheckOpnd = 1;
+  bool allow_64bit_modes = true;
+  bool allow_128bit_modes = true;
+  bool allow_8bit_elements = true;
+  bool allow_16bit_elements = true;
+  bool allow_32bit_elements = true;
+  bool allow_64bit_elements = false;
+  bool allow_16bit_polynomials = false;
+
+  switch (neon_code) {
+  default:
+    assert(0 && "unexpected builtin");
+    break;
+
+  case NEON_BUILTIN_vadd:
+  case NEON_BUILTIN_vsub:
+  case NEON_BUILTIN_vqadd:
+  case NEON_BUILTIN_vqsub:
+  case NEON_BUILTIN_vshl:
+  case NEON_BUILTIN_vqshl:
+  case NEON_BUILTIN_vshr_n:
+  case NEON_BUILTIN_vshl_n:
+  case NEON_BUILTIN_vqshl_n:
+  case NEON_BUILTIN_vqshlu_n:
+  case NEON_BUILTIN_vsra_n:
+  case NEON_BUILTIN_vsri_n:
+  case NEON_BUILTIN_vsli_n:
+  case NEON_BUILTIN_vmvn:
+  case NEON_BUILTIN_vext:
+  case NEON_BUILTIN_vbsl:
+  case NEON_BUILTIN_vand:
+  case NEON_BUILTIN_vorr:
+  case NEON_BUILTIN_veor:
+  case NEON_BUILTIN_vbic:
+  case NEON_BUILTIN_vorn:
+  case NEON_BUILTIN_vdup_lane:
+    allow_64bit_elements = true;
+    break;
+
+  case NEON_BUILTIN_vhadd:
+  case NEON_BUILTIN_vhsub:
+  case NEON_BUILTIN_vmul:
+  case NEON_BUILTIN_vceq:
+  case NEON_BUILTIN_vcge:
+  case NEON_BUILTIN_vcgt:
+  case NEON_BUILTIN_vcage:
+  case NEON_BUILTIN_vcagt:
+  case NEON_BUILTIN_vtst:
+  case NEON_BUILTIN_vabd:
+  case NEON_BUILTIN_vabdl:
+  case NEON_BUILTIN_vaba:
+  case NEON_BUILTIN_vabal:
+  case NEON_BUILTIN_vmax:
+  case NEON_BUILTIN_vmin:
+  case NEON_BUILTIN_vpaddl:
+  case NEON_BUILTIN_vrecps:
+  case NEON_BUILTIN_vrsqrts:
+  case NEON_BUILTIN_vneg:
+  case NEON_BUILTIN_vqneg:
+  case NEON_BUILTIN_vabs:
+  case NEON_BUILTIN_vqabs:
+  case NEON_BUILTIN_vcls:
+  case NEON_BUILTIN_vclz:
+  case NEON_BUILTIN_vtrn:
+  case NEON_BUILTIN_vzip:
+  case NEON_BUILTIN_vuzp:
+    break;
+
+  case NEON_BUILTIN_vmla:
+  case NEON_BUILTIN_vmls:
+  case NEON_BUILTIN_vpadal:
+    modeCheckOpnd = 2;
+    break;
+
+  case NEON_BUILTIN_vaddhn:
+  case NEON_BUILTIN_vsubhn:
+  case NEON_BUILTIN_vshrn_n:
+  case NEON_BUILTIN_vqshrn_n:
+  case NEON_BUILTIN_vqshrun_n:
+  case NEON_BUILTIN_vmovn:
+  case NEON_BUILTIN_vqmovn:
+  case NEON_BUILTIN_vqmovun:
+    allow_64bit_modes = false;
+    allow_8bit_elements = false;
+    allow_64bit_elements = true;
+    break;
+
+  case NEON_BUILTIN_vqdmulh:
+  case NEON_BUILTIN_vqdmulh_lane:
+  case NEON_BUILTIN_vqdmulh_n:
+  case NEON_BUILTIN_vmul_lane:
+  case NEON_BUILTIN_vmul_n:
+  case NEON_BUILTIN_vmla_lane:
+  case NEON_BUILTIN_vmla_n:
+  case NEON_BUILTIN_vmls_lane:
+  case NEON_BUILTIN_vmls_n:
+    allow_8bit_elements = false;
+    break;
+
+  case NEON_BUILTIN_vqdmull:
+  case NEON_BUILTIN_vqdmull_lane:
+  case NEON_BUILTIN_vqdmull_n:
+  case NEON_BUILTIN_vmull_lane:
+  case NEON_BUILTIN_vmull_n:
+    allow_128bit_modes = false;
+    allow_8bit_elements = false;
+    break;
+
+  case NEON_BUILTIN_vqdmlal:
+  case NEON_BUILTIN_vqdmlal_lane:
+  case NEON_BUILTIN_vqdmlal_n:
+  case NEON_BUILTIN_vqdmlsl:
+  case NEON_BUILTIN_vmlal_lane:
+  case NEON_BUILTIN_vmlal_n:
+  case NEON_BUILTIN_vmlsl_lane:
+  case NEON_BUILTIN_vmlsl_n:
+  case NEON_BUILTIN_vqdmlsl_lane:
+  case NEON_BUILTIN_vqdmlsl_n:
+    modeCheckOpnd = 2;
+    allow_128bit_modes = false;
+    allow_8bit_elements = false;
+    break;
+
+  case NEON_BUILTIN_vaddw:
+  case NEON_BUILTIN_vmlal:
+  case NEON_BUILTIN_vmlsl:
+  case NEON_BUILTIN_vsubw:
+    modeCheckOpnd = 2;
+    allow_128bit_modes = false;
+    break;
+
+  case NEON_BUILTIN_vaddl:
+  case NEON_BUILTIN_vmull:
+  case NEON_BUILTIN_vsubl:
+  case NEON_BUILTIN_vpadd:
+  case NEON_BUILTIN_vpmax:
+  case NEON_BUILTIN_vpmin:
+  case NEON_BUILTIN_vshll_n:
+  case NEON_BUILTIN_vmovl:
+    allow_128bit_modes = false;
+    break;
+
+  case NEON_BUILTIN_vcnt:
+    allow_16bit_elements = false;
+    allow_32bit_elements = false;
+    break;
+
+  case NEON_BUILTIN_vtbl1:
+  case NEON_BUILTIN_vtbl2:
+  case NEON_BUILTIN_vtbl3:
+  case NEON_BUILTIN_vtbl4:
+  case NEON_BUILTIN_vtbx1:
+  case NEON_BUILTIN_vtbx2:
+  case NEON_BUILTIN_vtbx3:
+  case NEON_BUILTIN_vtbx4:
+    allow_16bit_elements = false;
+    allow_32bit_elements = false;
+    allow_128bit_modes = false;
+    modeCheckOpnd = 0;
+    break;
+
+  case NEON_BUILTIN_vrecpe:
+  case NEON_BUILTIN_vrsqrte:
+  case NEON_BUILTIN_vcvt:
+  case NEON_BUILTIN_vcvt_n:
+    allow_8bit_elements = false;
+    allow_16bit_elements = false;
+    break;
+
+  case NEON_BUILTIN_vget_lane:
+    allow_64bit_elements = true;
+    allow_16bit_polynomials = true;
+    break;
+
+  case NEON_BUILTIN_vset_lane:
+    allow_64bit_elements = true;
+    allow_16bit_polynomials = true;
+    modeCheckOpnd = 2;
+    break;
+
+  case NEON_BUILTIN_vrev64:
+    allow_16bit_polynomials = true;
+    break;
+
+  case NEON_BUILTIN_vrev32:
+    allow_16bit_polynomials = true;
+    allow_32bit_elements = false;
+    break;
+
+  case NEON_BUILTIN_vrev16:
+    allow_16bit_elements = false;
+    allow_32bit_elements = false;
+    break;
+
+  case NEON_BUILTIN_vcreate:
+    modeCheckOpnd = 0;
+    allow_128bit_modes = false;
+    allow_64bit_elements = true;
+    break;
+
+  case NEON_BUILTIN_vdup_n:
+    modeCheckOpnd = 0;
+    allow_64bit_elements = true;
+    break;
+
+  case NEON_BUILTIN_vcombine:
+  case NEON_BUILTIN_vreinterpretv8qi:
+  case NEON_BUILTIN_vreinterpretv4hi:
+  case NEON_BUILTIN_vreinterpretv2si:
+  case NEON_BUILTIN_vreinterpretv2sf:
+  case NEON_BUILTIN_vreinterpretdi:
+    allow_128bit_modes = false;
+    allow_64bit_elements = true;
+    break;
+
+  case NEON_BUILTIN_vget_high:
+  case NEON_BUILTIN_vget_low:
+  case NEON_BUILTIN_vreinterpretv16qi:
+  case NEON_BUILTIN_vreinterpretv8hi:
+  case NEON_BUILTIN_vreinterpretv4si:
+  case NEON_BUILTIN_vreinterpretv4sf:
+  case NEON_BUILTIN_vreinterpretv2di:
+    allow_64bit_modes = false;
+    allow_64bit_elements = true;
+    break;
+
+  case NEON_BUILTIN_vld1:
+  case NEON_BUILTIN_vld2:
+  case NEON_BUILTIN_vld3:
+  case NEON_BUILTIN_vld4:
+  case NEON_BUILTIN_vld1_lane:
+  case NEON_BUILTIN_vld2_lane:
+  case NEON_BUILTIN_vld3_lane:
+  case NEON_BUILTIN_vld4_lane:
+  case NEON_BUILTIN_vld1_dup:
+  case NEON_BUILTIN_vld2_dup:
+  case NEON_BUILTIN_vld3_dup:
+  case NEON_BUILTIN_vld4_dup:
+  case NEON_BUILTIN_vst1:
+  case NEON_BUILTIN_vst2:
+  case NEON_BUILTIN_vst3:
+  case NEON_BUILTIN_vst4:
+  case NEON_BUILTIN_vst1_lane:
+  case NEON_BUILTIN_vst2_lane:
+  case NEON_BUILTIN_vst3_lane:
+  case NEON_BUILTIN_vst4_lane:
+    // Most of the load/store builtins have no operand with the mode of the
+    // operation, so there is nothing to check against the mode; skip the
+    // mode check for all of them.
+    modeCheckOpnd = -1;
+    break;
+  }
+
+  if (modeCheckOpnd >= 0) {
+
+    switch (insn_data[icode].operand[modeCheckOpnd].mode) {
+    case V8QImode: case V4HImode: case V2SImode: case DImode: case V2SFmode:
+      if (!allow_64bit_modes)
+        return BadModeError(exp, Result);
+      break;
+    case V16QImode: case V8HImode: case V4SImode: case V2DImode: case V4SFmode:
+      if (!allow_128bit_modes)
+        return BadModeError(exp, Result);
+      break;
+    default:
+      return BadModeError(exp, Result);
+    }
+
+    if (datatype == neon_datatype_polynomial) {
+
+      switch (insn_data[icode].operand[modeCheckOpnd].mode) {
+      case V8QImode: case V16QImode:
+        break;
+      case V4HImode: case V8HImode:
+        if (!allow_16bit_polynomials)
+          return BadModeError(exp, Result);
+        break;
+      default:
+        return BadModeError(exp, Result);
+      }
+
+    } else if (datatype == neon_datatype_float) {
+
+      switch (insn_data[icode].operand[modeCheckOpnd].mode) {
+      case V2SFmode: case V4SFmode:
+        break;
+      default:
+        return BadModeError(exp, Result);
+      }
+
+    } else {
+
+      switch (insn_data[icode].operand[modeCheckOpnd].mode) {
+      case V8QImode: case V16QImode:
+        if (!allow_8bit_elements)
+          return BadModeError(exp, Result);
+        break;
+      case V4HImode: case V8HImode:
+        if (!allow_16bit_elements)
+          return BadModeError(exp, Result);
+        break;
+      case V2SImode: case V4SImode:
+      case V2SFmode: case V4SFmode:
+        if (!allow_32bit_elements)
+          return BadModeError(exp, Result);
+        break;
+      case DImode: case V2DImode:
+        if (!allow_64bit_elements)
+          return BadModeError(exp, Result);
+        break;
+      default:
+        return BadModeError(exp, Result);
+      }
+    }
+  }
+
+  // Now translate the builtin to LLVM.
+
+  switch (neon_code) {
+  default:
+    assert(0 && "unimplemented builtin");
+    break;
+
+  case NEON_BUILTIN_vadd:
+    if (datatype == neon_datatype_polynomial)
+      return BadImmediateError(exp, Result);
+    Result = Builder.CreateAdd(Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vaddl:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vaddls;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vaddlu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vaddw:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vaddws;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vaddwu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vhadd:
+    if (datatype == neon_datatype_signed)
+      intID = (isRounded ?
+               Intrinsic::arm_neon_vrhadds :
+               Intrinsic::arm_neon_vhadds);
+    else if (datatype == neon_datatype_unsigned)
+      intID = (isRounded ?
+               Intrinsic::arm_neon_vrhaddu :
+               Intrinsic::arm_neon_vhaddu);
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vqadd:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vqadds;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vqaddu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vaddhn:
+    if (datatype == neon_datatype_signed ||
+        datatype == neon_datatype_unsigned)
+      intID = (isRounded ?
+               Intrinsic::arm_neon_vraddhn :
+               Intrinsic::arm_neon_vaddhn);
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vmul_lane:
+  case NEON_BUILTIN_vmul_n:
+    if (datatype == neon_datatype_polynomial)
+      return BadImmediateError(exp, Result);
+    // fall through....
+  case NEON_BUILTIN_vmul:
+    if (neon_code == NEON_BUILTIN_vmul_n) {
+      Ops[1] = BuildDup(Ops[0]->getType(), Ops[1], Builder);
+    } else if (neon_code == NEON_BUILTIN_vmul_lane) {
+      unsigned LaneVal;
+      unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[1].mode);
+      if (!isValidLane(Ops[2], NUnits, &LaneVal))
+        return UnexpectedError("%Hinvalid lane number", exp, Result);
+      Ops[1] = BuildDupLane(Ops[1], LaneVal, NUnits, Builder);
+    }
+    if (datatype == neon_datatype_polynomial) {
+      intID = Intrinsic::arm_neon_vmulp;
+      intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+      Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    } else
+      Result = Builder.CreateMul(Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vmla_lane:
+  case NEON_BUILTIN_vmla_n:
+  case NEON_BUILTIN_vmla:
+    if (neon_code == NEON_BUILTIN_vmla_n) {
+      Ops[2] = BuildDup(Ops[1]->getType(), Ops[2], Builder);
+    } else if (neon_code == NEON_BUILTIN_vmla_lane) {
+      unsigned LaneVal;
+      unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[2].mode);
+      if (!isValidLane(Ops[3], NUnits, &LaneVal))
+        return UnexpectedError("%Hinvalid lane number", exp, Result);
+      Ops[2] = BuildDupLane(Ops[2], LaneVal, NUnits, Builder);
+    }
+    if (datatype == neon_datatype_polynomial)
+      return BadImmediateError(exp, Result);
+    Result = Builder.CreateAdd(Ops[0], Builder.CreateMul(Ops[1], Ops[2]));
+    break;
+
+  case NEON_BUILTIN_vmls_lane:
+  case NEON_BUILTIN_vmls_n:
+  case NEON_BUILTIN_vmls:
+    if (neon_code == NEON_BUILTIN_vmls_n) {
+      Ops[2] = BuildDup(Ops[1]->getType(), Ops[2], Builder);
+    } else if (neon_code == NEON_BUILTIN_vmls_lane) {
+      unsigned LaneVal;
+      unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[2].mode);
+      if (!isValidLane(Ops[3], NUnits, &LaneVal))
+        return UnexpectedError("%Hinvalid lane number", exp, Result);
+      Ops[2] = BuildDupLane(Ops[2], LaneVal, NUnits, Builder);
+    }
+    if (datatype == neon_datatype_polynomial)
+      return BadImmediateError(exp, Result);
+    Result = Builder.CreateSub(Ops[0], Builder.CreateMul(Ops[1], Ops[2]));
+    break;
+
+  case NEON_BUILTIN_vmlal_lane:
+  case NEON_BUILTIN_vmlal_n:
+  case NEON_BUILTIN_vmlal:
+    if (neon_code == NEON_BUILTIN_vmlal_n) {
+      Ops[2] = BuildDup(Ops[1]->getType(), Ops[2], Builder);
+    } else if (neon_code == NEON_BUILTIN_vmlal_lane) {
+      unsigned LaneVal;
+      unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[2].mode);
+      if (!isValidLane(Ops[3], NUnits, &LaneVal))
+        return UnexpectedError("%Hinvalid lane number", exp, Result);
+      Ops[2] = BuildDupLane(Ops[2], LaneVal, NUnits, Builder);
+    }
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vmlals;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vmlalu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall3(intFn, Ops[0], Ops[1], Ops[2]);
+    break;
+
+  case NEON_BUILTIN_vmlsl_lane:
+  case NEON_BUILTIN_vmlsl_n:
+  case NEON_BUILTIN_vmlsl:
+    if (neon_code == NEON_BUILTIN_vmlsl_n) {
+      Ops[2] = BuildDup(Ops[1]->getType(), Ops[2], Builder);
+    } else if (neon_code == NEON_BUILTIN_vmlsl_lane) {
+      unsigned LaneVal;
+      unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[2].mode);
+      if (!isValidLane(Ops[3], NUnits, &LaneVal))
+        return UnexpectedError("%Hinvalid lane number", exp, Result);
+      Ops[2] = BuildDupLane(Ops[2], LaneVal, NUnits, Builder);
+    }
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vmlsls;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vmlslu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall3(intFn, Ops[0], Ops[1], Ops[2]);
+    break;
+
+  case NEON_BUILTIN_vqdmulh_lane:
+  case NEON_BUILTIN_vqdmulh_n:
+  case NEON_BUILTIN_vqdmulh:
+    if (neon_code == NEON_BUILTIN_vqdmulh_n) {
+      Ops[1] = BuildDup(Ops[0]->getType(), Ops[1], Builder);
+    } else if (neon_code == NEON_BUILTIN_vqdmulh_lane) {
+      unsigned LaneVal;
+      unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[1].mode);
+      if (!isValidLane(Ops[2], NUnits, &LaneVal))
+        return UnexpectedError("%Hinvalid lane number", exp, Result);
+      Ops[1] = BuildDupLane(Ops[1], LaneVal, NUnits, Builder);
+    }
+    if (datatype == neon_datatype_signed)
+      intID = (isRounded ?
+               Intrinsic::arm_neon_vqrdmulh :
+               Intrinsic::arm_neon_vqdmulh);
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vqdmlal_lane:
+  case NEON_BUILTIN_vqdmlal_n:
+  case NEON_BUILTIN_vqdmlal:
+    if (neon_code == NEON_BUILTIN_vqdmlal_n) {
+      Ops[2] = BuildDup(Ops[1]->getType(), Ops[2], Builder);
+    } else if (neon_code == NEON_BUILTIN_vqdmlal_lane) {
+      unsigned LaneVal;
+      unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[2].mode);
+      if (!isValidLane(Ops[3], NUnits, &LaneVal))
+        return UnexpectedError("%Hinvalid lane number", exp, Result);
+      Ops[2] = BuildDupLane(Ops[2], LaneVal, NUnits, Builder);
+    }
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vqdmlal;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall3(intFn, Ops[0], Ops[1], Ops[2]);
+    break;
+
+  case NEON_BUILTIN_vqdmlsl_lane:
+  case NEON_BUILTIN_vqdmlsl_n:
+  case NEON_BUILTIN_vqdmlsl:
+    if (neon_code == NEON_BUILTIN_vqdmlsl_n) {
+      Ops[2] = BuildDup(Ops[1]->getType(), Ops[2], Builder);
+    } else if (neon_code == NEON_BUILTIN_vqdmlsl_lane) {
+      unsigned LaneVal;
+      unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[2].mode);
+      if (!isValidLane(Ops[3], NUnits, &LaneVal))
+        return UnexpectedError("%Hinvalid lane number", exp, Result);
+      Ops[2] = BuildDupLane(Ops[2], LaneVal, NUnits, Builder);
+    }
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vqdmlsl;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall3(intFn, Ops[0], Ops[1], Ops[2]);
+    break;
+
+  case NEON_BUILTIN_vmull_lane:
+  case NEON_BUILTIN_vmull_n:
+  case NEON_BUILTIN_vmull:
+    if (neon_code == NEON_BUILTIN_vmull_n) {
+      Ops[1] = BuildDup(Ops[0]->getType(), Ops[1], Builder);
+    } else if (neon_code == NEON_BUILTIN_vmull_lane) {
+      unsigned LaneVal;
+      unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[1].mode);
+      if (!isValidLane(Ops[2], NUnits, &LaneVal))
+        return UnexpectedError("%Hinvalid lane number", exp, Result);
+      Ops[1] = BuildDupLane(Ops[1], LaneVal, NUnits, Builder);
+    }
+    if (datatype == neon_datatype_polynomial)
+      intID = Intrinsic::arm_neon_vmullp;
+    else if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vmulls;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vmullu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vqdmull_n:
+  case NEON_BUILTIN_vqdmull_lane:
+  case NEON_BUILTIN_vqdmull:
+    if (neon_code == NEON_BUILTIN_vqdmull_n) {
+      Ops[1] = BuildDup(Ops[0]->getType(), Ops[1], Builder);
+    } else if (neon_code == NEON_BUILTIN_vqdmull_lane) {
+      unsigned LaneVal;
+      unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[1].mode);
+      if (!isValidLane(Ops[2], NUnits, &LaneVal))
+        return UnexpectedError("%Hinvalid lane number", exp, Result);
+      Ops[1] = BuildDupLane(Ops[1], LaneVal, NUnits, Builder);
+    }
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vqdmull;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vshl_n:
+    if (!BuildShiftCountVector(Ops[1], insn_data[icode].operand[1].mode,
+                               CheckLeftShiftCount, false))
+      return UnexpectedError("%Hinvalid shift count", exp, Result);
+    if (datatype == neon_datatype_signed ||
+        datatype == neon_datatype_unsigned)
+      Result = Builder.CreateShl(Ops[0], Ops[1]);
+    else
+      return BadImmediateError(exp, Result);
+    break;
+
+  case NEON_BUILTIN_vshr_n:
+    if (!BuildShiftCountVector(Ops[1], insn_data[icode].operand[1].mode,
+                               CheckRightShiftCount, isRounded))
+      return UnexpectedError("%Hinvalid shift count", exp, Result);
+    if (!isRounded) {
+      if (datatype == neon_datatype_signed)
+        Result = Builder.CreateAShr(Ops[0], Ops[1]);
+      else if (datatype == neon_datatype_unsigned)
+        Result = Builder.CreateLShr(Ops[0], Ops[1]);
+      else
+        return BadImmediateError(exp, Result);
+      break;
+    }
+    // fall through....
+  case NEON_BUILTIN_vshl:
+    if (datatype == neon_datatype_signed)
+      intID = (isRounded ?
+               Intrinsic::arm_neon_vrshifts :
+               Intrinsic::arm_neon_vshifts);
+    else if (datatype == neon_datatype_unsigned)
+      intID = (isRounded ?
+               Intrinsic::arm_neon_vrshiftu :
+               Intrinsic::arm_neon_vshiftu);
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vshrn_n:
+    if (!BuildShiftCountVector(Ops[1], insn_data[icode].operand[1].mode,
+                               CheckNarrowRightShiftCount, true))
+      return UnexpectedError("%Hinvalid shift count", exp, Result);
+    if (datatype == neon_datatype_signed ||
+        datatype == neon_datatype_unsigned)
+      intID = (isRounded ?
+               Intrinsic::arm_neon_vrshiftn :
+               Intrinsic::arm_neon_vshiftn);
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vqshl_n:
+    if (!BuildShiftCountVector(Ops[1], insn_data[icode].operand[1].mode,
+                               CheckLeftShiftCount, false))
+      return UnexpectedError("%Hinvalid shift count", exp, Result);
+    // fall through....
+  case NEON_BUILTIN_vqshl:
+    if (datatype == neon_datatype_signed)
+      intID = (isRounded ?
+               Intrinsic::arm_neon_vqrshifts :
+               Intrinsic::arm_neon_vqshifts);
+    else if (datatype == neon_datatype_unsigned)
+      intID = (isRounded ?
+               Intrinsic::arm_neon_vqrshiftu :
+               Intrinsic::arm_neon_vqshiftu);
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vqshlu_n:
+    if (!BuildShiftCountVector(Ops[1], insn_data[icode].operand[1].mode,
+                               CheckLeftShiftCount, false))
+      return UnexpectedError("%Hinvalid shift count", exp, Result);
+    if (datatype != neon_datatype_signed)
+      return BadImmediateError(exp, Result);
+    intID = Intrinsic::arm_neon_vqshiftsu;
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vqshrn_n:
+    if (!BuildShiftCountVector(Ops[1], insn_data[icode].operand[1].mode,
+                               CheckNarrowRightShiftCount, true))
+      return UnexpectedError("%Hinvalid shift count", exp, Result);
+    if (datatype == neon_datatype_signed)
+      intID = (isRounded ?
+               Intrinsic::arm_neon_vqrshiftns :
+               Intrinsic::arm_neon_vqshiftns);
+    else if (datatype == neon_datatype_unsigned)
+      intID = (isRounded ?
+               Intrinsic::arm_neon_vqrshiftnu :
+               Intrinsic::arm_neon_vqshiftnu);
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vqshrun_n:
+    if (!BuildShiftCountVector(Ops[1], insn_data[icode].operand[1].mode,
+                               CheckNarrowRightShiftCount, true))
+      return UnexpectedError("%Hinvalid shift count", exp, Result);
+    if (datatype == neon_datatype_signed)
+      intID = (isRounded ?
+               Intrinsic::arm_neon_vqrshiftnsu :
+               Intrinsic::arm_neon_vqshiftnsu);
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vshll_n:
+    if (!BuildShiftCountVector(Ops[1], insn_data[icode].operand[1].mode,
+                               CheckLongLeftShiftCount, false))
+      return UnexpectedError("%Hinvalid shift count", exp, Result);
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vshiftls;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vshiftlu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vsra_n:
+    if (!BuildShiftCountVector(Ops[2], insn_data[icode].operand[1].mode,
+                               CheckRightShiftCount, isRounded))
+      return UnexpectedError("%Hinvalid shift count", exp, Result);
+    if (!isRounded) {
+      if (datatype == neon_datatype_signed)
+        Result = Builder.CreateAShr(Ops[1], Ops[2]);
+      else if (datatype == neon_datatype_unsigned)
+        Result = Builder.CreateLShr(Ops[1], Ops[2]);
+      else
+        return BadImmediateError(exp, Result);
+    } else {
+      // isRounded is known to be true here, so use the rounding variants.
+      if (datatype == neon_datatype_signed)
+        intID = Intrinsic::arm_neon_vrshifts;
+      else if (datatype == neon_datatype_unsigned)
+        intID = Intrinsic::arm_neon_vrshiftu;
+      else
+        return BadImmediateError(exp, Result);
+
+      intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+      Result = Builder.CreateCall2(intFn, Ops[1], Ops[2]);
+    }
+    Result = Builder.CreateAdd(Ops[0], Result);
+    break;
+
+  case NEON_BUILTIN_vsub:
+    if (datatype == neon_datatype_polynomial)
+      return BadImmediateError(exp, Result);
+    Result = Builder.CreateSub(Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vsubl:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vsubls;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vsublu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vsubw:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vsubws;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vsubwu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vqsub:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vqsubs;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vqsubu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vhsub:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vhsubs;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vhsubu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vsubhn:
+    if (datatype == neon_datatype_signed ||
+        datatype == neon_datatype_unsigned)
+      intID = (isRounded ?
+               Intrinsic::arm_neon_vrsubhn :
+               Intrinsic::arm_neon_vsubhn);
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vceq:
+    if (datatype == neon_datatype_float)
+      Result = Builder.CreateFCmp(FCmpInst::FCMP_OEQ, Ops[0], Ops[1]);
+    else
+      Result = Builder.CreateICmp(ICmpInst::ICMP_EQ, Ops[0], Ops[1]);
+    Result = Builder.CreateSExt(Result, ResultType);
+    break;
+
+  case NEON_BUILTIN_vcge:
+    if (datatype == neon_datatype_float)
+      Result = Builder.CreateFCmp(FCmpInst::FCMP_OGE, Ops[0], Ops[1]);
+    else if (datatype == neon_datatype_signed)
+      Result = Builder.CreateICmp(ICmpInst::ICMP_SGE, Ops[0], Ops[1]);
+    else if (datatype == neon_datatype_unsigned)
+      Result = Builder.CreateICmp(ICmpInst::ICMP_UGE, Ops[0], Ops[1]);
+    else
+      return BadImmediateError(exp, Result);
+    Result = Builder.CreateSExt(Result, ResultType);
+    break;
+
+  case NEON_BUILTIN_vcgt:
+    if (datatype == neon_datatype_float)
+      Result = Builder.CreateFCmp(FCmpInst::FCMP_OGT, Ops[0], Ops[1]);
+    else if (datatype == neon_datatype_signed)
+      Result = Builder.CreateICmp(ICmpInst::ICMP_SGT, Ops[0], Ops[1]);
+    else if (datatype == neon_datatype_unsigned)
+      Result = Builder.CreateICmp(ICmpInst::ICMP_UGT, Ops[0], Ops[1]);
+    else
+      return BadImmediateError(exp, Result);
+    Result = Builder.CreateSExt(Result, ResultType);
+    break;
+
+  case NEON_BUILTIN_vcage:
+    if (datatype != neon_datatype_float)
+      return BadImmediateError(exp, Result);
+
+    switch (insn_data[icode].operand[1].mode) {
+    case V2SFmode:
+      intID = Intrinsic::arm_neon_vacged;
+      break;
+    case V4SFmode:
+      intID = Intrinsic::arm_neon_vacgeq;
+      break;
+    default:
+      return BadModeError(exp, Result);
+    }
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vcagt:
+    if (datatype != neon_datatype_float)
+      return BadImmediateError(exp, Result);
+
+    switch (insn_data[icode].operand[1].mode) {
+    case V2SFmode:
+      intID = Intrinsic::arm_neon_vacgtd;
+      break;
+    case V4SFmode:
+      intID = Intrinsic::arm_neon_vacgtq;
+      break;
+    default:
+      return BadModeError(exp, Result);
+    }
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vtst:
+    if (datatype == neon_datatype_float)
+      return BadImmediateError(exp, Result);
+    Result = Builder.CreateICmp(ICmpInst::ICMP_NE,
+                                Builder.CreateAnd(Ops[0], Ops[1]),
+                                Context.getConstantAggregateZero(ResultType));
+    Result = Builder.CreateSExt(Result, ResultType);
+    break;
+
+  case NEON_BUILTIN_vabd:
+    if (datatype == neon_datatype_float)
+      intID = Intrinsic::arm_neon_vabdf;
+    else if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vabds;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vabdu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vabdl:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vabdls;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vabdlu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vaba:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vabas;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vabau;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall3(intFn, Ops[0], Ops[1], Ops[2]);
+    break;
+
+  case NEON_BUILTIN_vabal:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vabals;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vabalu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall3(intFn, Ops[0], Ops[1], Ops[2]);
+    break;
+
+  case NEON_BUILTIN_vmax:
+    if (datatype == neon_datatype_float)
+      intID = Intrinsic::arm_neon_vmaxf;
+    else if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vmaxs;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vmaxu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vmin:
+    if (datatype == neon_datatype_float)
+      intID = Intrinsic::arm_neon_vminf;
+    else if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vmins;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vminu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vpadd:
+    if (datatype == neon_datatype_float)
+      intID = Intrinsic::arm_neon_vpaddf;
+    else if (datatype == neon_datatype_signed ||
+             datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vpaddi;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vpaddl:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vpaddls;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vpaddlu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intOpTypes[0] = ResultType;
+    intOpTypes[1] = Ops[0]->getType();
+    intFn = Intrinsic::getDeclaration(TheModule, intID, intOpTypes, 2);
+    Result = Builder.CreateCall(intFn, Ops[0]);
+    break;
+
+  case NEON_BUILTIN_vpadal:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vpadals;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vpadalu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intOpTypes[0] = ResultType;
+    intOpTypes[1] = Ops[1]->getType();
+    intFn = Intrinsic::getDeclaration(TheModule, intID, intOpTypes, 2);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vpmax:
+    if (datatype == neon_datatype_float)
+      intID = Intrinsic::arm_neon_vpmaxf;
+    else if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vpmaxs;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vpmaxu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vpmin:
+    if (datatype == neon_datatype_float)
+      intID = Intrinsic::arm_neon_vpminf;
+    else if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vpmins;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vpminu;
+    else
+      return BadImmediateError(exp, Result);
+
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vrecps:
+    if (datatype != neon_datatype_float)
+      return BadImmediateError(exp, Result);
+    intID = Intrinsic::arm_neon_vrecps;
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vrsqrts:
+    if (datatype != neon_datatype_float)
+      return BadImmediateError(exp, Result);
+    intID = Intrinsic::arm_neon_vrsqrts;
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vsri_n:
+    if (!BuildShiftCountVector(Ops[2], insn_data[icode].operand[1].mode,
+                               CheckRightShiftCount, true))
+      return UnexpectedError("%Hinvalid shift count", exp, Result);
+    intID = Intrinsic::arm_neon_vshiftins;
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall3(intFn, Ops[0], Ops[1], Ops[2]);
+    break;
+
+  case NEON_BUILTIN_vsli_n:
+    if (!BuildShiftCountVector(Ops[2], insn_data[icode].operand[1].mode,
+                               CheckLeftShiftCount, false))
+      return UnexpectedError("%Hinvalid shift count", exp, Result);
+    intID = Intrinsic::arm_neon_vshiftins;
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall3(intFn, Ops[0], Ops[1], Ops[2]);
+    break;
+
+  case NEON_BUILTIN_vabs:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vabs;
+    else if (datatype == neon_datatype_float)
+      intID = Intrinsic::arm_neon_vabsf;
+    else
+      return BadImmediateError(exp, Result);
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall(intFn, Ops[0]);
+    break;
+
+  case NEON_BUILTIN_vqabs:
+    if (datatype != neon_datatype_signed)
+      return BadImmediateError(exp, Result);
+    intID = Intrinsic::arm_neon_vqabs;
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall(intFn, Ops[0]);
+    break;
+
+  case NEON_BUILTIN_vneg:
+    if (datatype != neon_datatype_signed &&
+        datatype != neon_datatype_float)
+      return BadImmediateError(exp, Result);
+    Result = Builder.CreateNeg(Ops[0]);
+    break;
+
+  case NEON_BUILTIN_vqneg:
+    if (datatype != neon_datatype_signed)
+      return BadImmediateError(exp, Result);
+    intID = Intrinsic::arm_neon_vqneg;
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall(intFn, Ops[0]);
+    break;
+
+  case NEON_BUILTIN_vcls:
+    if (datatype != neon_datatype_signed)
+      return BadImmediateError(exp, Result);
+    intID = Intrinsic::arm_neon_vcls;
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall(intFn, Ops[0]);
+    break;
+
+  case NEON_BUILTIN_vclz:
+    if (datatype != neon_datatype_signed &&
+        datatype != neon_datatype_unsigned)
+      return BadImmediateError(exp, Result);
+    intID = Intrinsic::arm_neon_vclz;
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall(intFn, Ops[0]);
+    break;
+
+  case NEON_BUILTIN_vcnt:
+    if (datatype == neon_datatype_float)
+      return BadImmediateError(exp, Result);
+    intID = Intrinsic::arm_neon_vcnt;
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall(intFn, Ops[0]);
+    break;
+
+  case NEON_BUILTIN_vrecpe:
+    if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vrecpe;
+    else if (datatype == neon_datatype_float)
+      intID = Intrinsic::arm_neon_vrecpef;
+    else
+      return BadImmediateError(exp, Result);
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall(intFn, Ops[0]);
+    break;
+
+  case NEON_BUILTIN_vrsqrte:
+    if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vrsqrte;
+    else if (datatype == neon_datatype_float)
+      intID = Intrinsic::arm_neon_vrsqrtef;
+    else
+      return BadImmediateError(exp, Result);
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall(intFn, Ops[0]);
+    break;
+
+  case NEON_BUILTIN_vmvn:
+    if (datatype == neon_datatype_float)
+      return BadImmediateError(exp, Result);
+    Result = Builder.CreateNot(Ops[0]);
+    break;
+
+  case NEON_BUILTIN_vget_lane: {
+    unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[1].mode);
+    if (!isValidLane(Ops[1], NUnits))
+      return UnexpectedError("%Hinvalid lane number", exp, Result);
+    Result = Builder.CreateExtractElement(Ops[0], Ops[1]);
+    break;
+  }
+
+  case NEON_BUILTIN_vset_lane: {
+    unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[2].mode);
+    if (!isValidLane(Ops[2], NUnits))
+      return UnexpectedError("%Hinvalid lane number", exp, Result);
+    // GCC may promote the scalar argument; cast it back.
+    const VectorType *VTy = dyn_cast<const VectorType>(Ops[1]->getType());
+    assert(VTy && "expected a vector type for vset_lane vector operand");
+    const Type *ElTy = VTy->getElementType();
+    if (Ops[0]->getType() != ElTy) {
+      assert(!ElTy->isFloatingPoint() &&
+             "only integer types expected to be promoted");
+      Ops[0] = Builder.CreateTrunc(Ops[0], ElTy);
+    }
+    Result = Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2]);
+    break;
+  }
+
+  case NEON_BUILTIN_vcreate:
+    Result = Builder.CreateBitCast(Ops[0], ResultType);
+    break;
+
+  case NEON_BUILTIN_vdup_n:
+    Result = BuildDup(ResultType, Ops[0], Builder);
+    break;
+
+  case NEON_BUILTIN_vdup_lane: {
+    unsigned LaneVal;
+    unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[1].mode);
+    if (!isValidLane(Ops[1], NUnits, &LaneVal))
+      return UnexpectedError("%Hinvalid lane number", exp, Result);
+    unsigned DstUnits = GET_MODE_NUNITS(insn_data[icode].operand[0].mode);
+    Result = BuildDupLane(Ops[0], LaneVal, DstUnits, Builder);
+    break;
+  }
+
+  case NEON_BUILTIN_vcombine: {
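+    // Concatenate two 64-bit halves into one 128-bit vector: the shuffle
+    // mask <0, 1, ..., NUnits-1> selects all elements of both inputs in order.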
+    unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[0].mode);
+    std::vector<Constant*> Idxs;
+    for (unsigned i = 0; i != NUnits; ++i)
+      Idxs.push_back(Context.getConstantInt(Type::Int32Ty, i));
+    Result = Builder.CreateShuffleVector(Ops[0], Ops[1],
+                                         Context.getConstantVector(Idxs));
+    break;
+  }
+
+  case NEON_BUILTIN_vget_high:
+  case NEON_BUILTIN_vget_low: {
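+    // Extract one half of a 128-bit vector: the mask selects elements
+    // [0, NUnits) for vget_low or [NUnits, 2*NUnits) for vget_high.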
+    unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[0].mode);
+    std::vector<Constant*> Idxs;
+    unsigned Idx = (neon_code == NEON_BUILTIN_vget_low ? 0 : NUnits);
+    for (unsigned i = 0; i != NUnits; ++i)
+      Idxs.push_back(Context.getConstantInt(Type::Int32Ty, Idx++));
+    Result = Builder.CreateShuffleVector(Ops[0],
+                                         Context.getUndef(Ops[0]->getType()),
+                                         Context.getConstantVector(Idxs));
+    break;
+  }
+
+  case NEON_BUILTIN_vmovn:
+    if (datatype == neon_datatype_signed ||
+        datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vmovn;
+    else
+      return BadImmediateError(exp, Result);
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall(intFn, Ops[0]);
+    break;
+
+  case NEON_BUILTIN_vqmovn:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vqmovns;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vqmovnu;
+    else
+      return BadImmediateError(exp, Result);
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall(intFn, Ops[0]);
+    break;
+
+  case NEON_BUILTIN_vqmovun:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vqmovnsu;
+    else
+      return BadImmediateError(exp, Result);
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall(intFn, Ops[0]);
+    break;
+
+  case NEON_BUILTIN_vmovl:
+    if (datatype == neon_datatype_signed)
+      intID = Intrinsic::arm_neon_vmovls;
+    else if (datatype == neon_datatype_unsigned)
+      intID = Intrinsic::arm_neon_vmovlu;
+    else
+      return BadImmediateError(exp, Result);
+    intFn = Intrinsic::getDeclaration(TheModule, intID, &ResultType, 1);
+    Result = Builder.CreateCall(intFn, Ops[0]);
+    break;
+
+  case NEON_BUILTIN_vext: {
+    // Check if immediate operand is valid.
+    unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[1].mode);
+    ConstantInt *Imm = dyn_cast<ConstantInt>(Ops[2]);
+    if (!Imm)
+      return UnexpectedError("%Hinvalid immediate for vext", exp, Result);
+    int ImmVal = Imm->getSExtValue();
+    if (ImmVal < 0 || ImmVal >= (int)NUnits)
+      return UnexpectedError("%Hout of range immediate for vext", exp, Result);
+    if (ImmVal == 0) {
+      Result = Ops[0];
+      break;
+    }
+    // Translate to a vector shuffle.
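+    // For example, with NUnits == 8 and ImmVal == 3 the mask is
+    // <3,4,5,6,7,8,9,10>: the tail of Ops[0] followed by the head of Ops[1].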
+    std::vector<Constant*> Idxs;
+    for (unsigned i = 0; i != NUnits; ++i)
+      Idxs.push_back(Context.getConstantInt(Type::Int32Ty, i + ImmVal));
+    Result = Builder.CreateShuffleVector(Ops[0], Ops[1],
+                                         Context.getConstantVector(Idxs));
+    break;
+  }
+
+  case NEON_BUILTIN_vrev64:
+  case NEON_BUILTIN_vrev32:
+  case NEON_BUILTIN_vrev16: {
+    unsigned ChunkBits = 0;
+    switch (neon_code) {
+    case NEON_BUILTIN_vrev64: ChunkBits = 64; break;
+    case NEON_BUILTIN_vrev32: ChunkBits = 32; break;
+    case NEON_BUILTIN_vrev16: ChunkBits = 16; break;
+    default: assert(false);
+    }
+    const VectorType *VTy = dyn_cast<const VectorType>(ResultType);
+    assert(VTy && "expected a vector type");
+    const Type *ElTy = VTy->getElementType();
+    unsigned ChunkElts = ChunkBits / ElTy->getPrimitiveSizeInBits();
+
+    // Translate to a vector shuffle.
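+    // For example, vrev32 on <8 x i8> (ChunkElts == 4) uses the mask
+    // <3,2,1,0,7,6,5,4>.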
+    std::vector<Constant*> Idxs;
+    unsigned NUnits = VTy->getNumElements();
+    for (unsigned c = ChunkElts; c <= NUnits; c += ChunkElts) {
+      for (unsigned i = 0; i != ChunkElts; ++i) {
+        Idxs.push_back(Context.getConstantInt(Type::Int32Ty, c - i - 1));
+      }
+    }
+    Result = Builder.CreateShuffleVector(Ops[0], Context.getUndef(ResultType),
+                                         Context.getConstantVector(Idxs));
+    break;
+  }
+
+  case NEON_BUILTIN_vcvt:
+    if (FLOAT_MODE_P(insn_data[icode].operand[1].mode)) {
+      if (datatype == neon_datatype_unsigned)
+        Result = Builder.CreateFPToUI(Ops[0], ResultType);
+      else if (datatype == neon_datatype_signed)
+        Result = Builder.CreateFPToSI(Ops[0], ResultType);
+      else
+        return BadImmediateError(exp, Result);
+    } else {
+      if (datatype == neon_datatype_unsigned)
+        Result = Builder.CreateUIToFP(Ops[0], ResultType);
+      else if (datatype == neon_datatype_signed)
+        Result = Builder.CreateSIToFP(Ops[0], ResultType);
+      else
+        return BadImmediateError(exp, Result);
+    }
+    break;
+
+  case NEON_BUILTIN_vcvt_n: {
+    // Check if the fractional bits argument is between 1 and 32.
+    ConstantInt *FBits = dyn_cast<ConstantInt>(Ops[1]);
+    if (!FBits)
+      return UnexpectedError("%Hinvalid fractional bit count", exp, Result);
+    int FBitsVal = FBits->getSExtValue();
+    if (FBitsVal < 1 || FBitsVal > 32)
+      return UnexpectedError("%Hinvalid fractional bit count", exp, Result);
+    if (FLOAT_MODE_P(insn_data[icode].operand[1].mode)) {
+      if (datatype == neon_datatype_unsigned)
+        intID = Intrinsic::arm_neon_vcvtfp2fxu;
+      else if (datatype == neon_datatype_signed)
+        intID = Intrinsic::arm_neon_vcvtfp2fxs;
+      else
+        return BadImmediateError(exp, Result);
+    } else {
+      if (datatype == neon_datatype_unsigned)
+        intID = Intrinsic::arm_neon_vcvtfxu2fp;
+      else if (datatype == neon_datatype_signed)
+        intID = Intrinsic::arm_neon_vcvtfxs2fp;
+      else
+        return BadImmediateError(exp, Result);
+    }
+    intOpTypes[0] = ResultType;
+    intOpTypes[1] = Ops[0]->getType();
+    intFn = Intrinsic::getDeclaration(TheModule, intID, intOpTypes, 2);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+  }
+
+  case NEON_BUILTIN_vbsl:
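+    // Bitwise select: each result bit comes from Ops[1] where the
+    // corresponding bit of Ops[0] is set, and from Ops[2] where it is clear.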
+    Result = Builder.CreateOr(Builder.CreateAnd(Ops[1], Ops[0]),
+                              Builder.CreateAnd(Ops[2],
+                                                Builder.CreateNot(Ops[0])));
+    break;
+
+  case NEON_BUILTIN_vtbl1:
+  case NEON_BUILTIN_vtbl2:
+  case NEON_BUILTIN_vtbl3:
+  case NEON_BUILTIN_vtbl4: {
+    unsigned NUnits = Ops[0]->getType()->getPrimitiveSizeInBits() / 8;
+    intOpTypes[0] = Context.getVectorType(Context.getIntegerType(8), NUnits);
+    intID = Intrinsic::arm_neon_vtbl;
+    intFn = Intrinsic::getDeclaration(TheModule, intID, intOpTypes, 1);
+    Result = Builder.CreateCall2(intFn, Ops[0], Ops[1]);
+    break;
+  }
+
+  case NEON_BUILTIN_vtbx1:
+  case NEON_BUILTIN_vtbx2:
+  case NEON_BUILTIN_vtbx3:
+  case NEON_BUILTIN_vtbx4: {
+    unsigned NUnits = Ops[1]->getType()->getPrimitiveSizeInBits() / 8;
+    intOpTypes[0] = Context.getVectorType(Context.getIntegerType(8), NUnits);
+    intID = Intrinsic::arm_neon_vtbx;
+    intFn = Intrinsic::getDeclaration(TheModule, intID, intOpTypes, 1);
+    Result = Builder.CreateCall3(intFn, Ops[0], Ops[1], Ops[2]);
+    break;
+  }
+
+  case NEON_BUILTIN_vtrn: {
+    // Translate this to a vector shuffle.
+    std::vector<Constant*> Idxs;
+    unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[1].mode);
+    for (unsigned EvenOdd = 0; EvenOdd != 2; ++EvenOdd) {
+      for (unsigned i = 0; i < NUnits; i += 2) {
+        Idxs.push_back(Context.getConstantInt(Type::Int32Ty, i + EvenOdd));
+        Idxs.push_back(Context.getConstantInt(Type::Int32Ty,
+                                              i + NUnits + EvenOdd));
+      }
+    }
+    Result = Builder.CreateShuffleVector(Ops[1], Ops[2],
+                                         Context.getConstantVector(Idxs));
+    Type *PtrTy = Result->getType()->getPointerTo();
+    Builder.CreateStore(Result, BitCastToType(Ops[0], PtrTy));
+    Result = 0;
+    break;
+  }
+
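(The transpose mask printed by the same loop, for an assumed 4-element
vector; both transpose halves land in one double-length vector, which is
then stored through Ops[0]:)

  #include <cstdio>

  int main() {
    const unsigned NUnits = 4;  // assumed example size
    for (unsigned EvenOdd = 0; EvenOdd != 2; ++EvenOdd)
      for (unsigned i = 0; i < NUnits; i += 2)
        printf("%u %u ", i + EvenOdd, i + NUnits + EvenOdd);
    printf("\n");  // prints: 0 4 2 6 1 5 3 7, i.e. {a0,b0,a2,b2, a1,b1,a3,b3}
    return 0;
  }
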
+  case NEON_BUILTIN_vzip: {
+    // Translate this to a vector shuffle.
+    std::vector<Constant*> Idxs;
+    unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[1].mode);
+    for (unsigned i = 0; i != NUnits; ++i) {
+      Idxs.push_back(Context.getConstantInt(Type::Int32Ty, i));
+      Idxs.push_back(Context.getConstantInt(Type::Int32Ty, i + NUnits));
+    }
+    Result = Builder.CreateShuffleVector(Ops[1], Ops[2],
+                                         Context.getConstantVector(Idxs));
+    Type *PtrTy = Result->getType()->getPointerTo();
+    Builder.CreateStore(Result, BitCastToType(Ops[0], PtrTy));
+    Result = 0;
+    break;
+  }
+
+  case NEON_BUILTIN_vuzp: {
+    // Translate this to a vector shuffle.
+    std::vector<Constant*> Idxs;
+    unsigned NUnits = GET_MODE_NUNITS(insn_data[icode].operand[1].mode);
+    for (unsigned EvenOdd = 0; EvenOdd != 2; ++EvenOdd) {
+      for (unsigned i = 0; i != NUnits; ++i)
+        Idxs.push_back(Context.getConstantInt(Type::Int32Ty, 2 * i + EvenOdd));
+    }
+    Result = Builder.CreateShuffleVector(Ops[1], Ops[2],
+                                         Context.getConstantVector(Idxs));
+    Type *PtrTy = Result->getType()->getPointerTo();
+    Builder.CreateStore(Result, BitCastToType(Ops[0], PtrTy));
+    Result = 0;
+    break;
+  }
+
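(vzip and vuzp follow the same store-through-pointer pattern as vtrn, only
with different masks; a sketch printing both for an assumed 4-element
vector:)

  #include <cstdio>

  int main() {
    const unsigned NUnits = 4;  // assumed example size
    // vzip: interleave a and b.
    for (unsigned i = 0; i != NUnits; ++i)
      printf("%u %u ", i, i + NUnits);
    printf("\n");  // prints: 0 4 1 5 2 6 3 7
    // vuzp: de-interleave even elements, then odd.
    for (unsigned EvenOdd = 0; EvenOdd != 2; ++EvenOdd)
      for (unsigned i = 0; i != NUnits; ++i)
        printf("%u ", 2 * i + EvenOdd);
    printf("\n");  // prints: 0 2 4 6 1 3 5 7
    return 0;
  }
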
+  case NEON_BUILTIN_vreinterpretv8qi:
+  case NEON_BUILTIN_vreinterpretv4hi:
+  case NEON_BUILTIN_vreinterpretv2si:
+  case NEON_BUILTIN_vreinterpretv2sf:
+  case NEON_BUILTIN_vreinterpretdi:
+  case NEON_BUILTIN_vreinterpretv16qi:
+  case NEON_BUILTIN_vreinterpretv8hi:
+  case NEON_BUILTIN_vreinterpretv4si:
+  case NEON_BUILTIN_vreinterpretv4sf:
+  case NEON_BUILTIN_vreinterpretv2di:
+    Result = Builder.CreateBitCast(Ops[0], ResultType);
+    break;
+
+  case NEON_BUILTIN_vld1:
+  case NEON_BUILTIN_vld2:
+  case NEON_BUILTIN_vld3:
+  case NEON_BUILTIN_vld4: {
+    const VectorType *VTy =
+      GetVldstType(exp, insn_data[icode].operand[0].mode);
+    if (VTy->getElementType()->isFloatingPoint())
+      intID = Intrinsic::arm_neon_vldf;
+    else
+      intID = Intrinsic::arm_neon_vldi;
+    intFn = Intrinsic::getDeclaration(TheModule, intID, (const Type **)&VTy, 1);
+
+    unsigned N = 0;
+    switch (neon_code) {
+    case NEON_BUILTIN_vld1: N = 1; break;
+    case NEON_BUILTIN_vld2: N = 2; break;
+    case NEON_BUILTIN_vld3: N = 3; break;
+    case NEON_BUILTIN_vld4: N = 4; break;
+    default: assert(false);
+    }
+
+    Type *VPTy = Context.getPointerTypeUnqual(Type::Int8Ty);
+    Result = Builder.CreateCall2(intFn, BitCastToType(Ops[0], VPTy),
+                                 Context.getConstantInt(Type::Int32Ty, N));
+    Result = BitCastToType(Result, ResultType);
+    break;
+  }
+
+  case NEON_BUILTIN_vld1_lane:
+  case NEON_BUILTIN_vld2_lane:
+  case NEON_BUILTIN_vld3_lane:
+  case NEON_BUILTIN_vld4_lane: {
+    const VectorType *VTy =
+      GetVldstType(exp, insn_data[icode].operand[0].mode);
+    unsigned LaneVal, NumVecs;
+    switch (neon_code) {
+    case NEON_BUILTIN_vld1_lane: NumVecs = 1; break;
+    case NEON_BUILTIN_vld2_lane: NumVecs = 2; break;
+    case NEON_BUILTIN_vld3_lane: NumVecs = 3; break;
+    case NEON_BUILTIN_vld4_lane: NumVecs = 4; break;
+    default: assert(false);
+    }
+    unsigned NUnits = VTy->getNumElements() / NumVecs;
+    if (!isValidLane(Ops[2], NUnits, &LaneVal))
+      return UnexpectedError("%Hinvalid lane number", exp, Result);
+    Result = BitCastToType(Ops[1], VTy);
+    for (unsigned n = 0; n != NumVecs; ++n) {
+      Value *Addr = (n == 0) ? Ops[0] :
+        Builder.CreateGEP(Ops[0], Context.getConstantInt(Type::Int32Ty, n));
+      Value *Elt = Builder.CreateLoad(Addr);
+      Value *Ndx = Context.getConstantInt(Type::Int32Ty,
+                                          LaneVal + (n * NUnits));
+      Result = Builder.CreateInsertElement(Result, Elt, Ndx);
+    }
+    Result = BitCastToType(Result, ResultType);
+    break;
+  }
+
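(How the lane arithmetic above places one loaded element into each
subvector of the flattened value; a sketch with assumed sizes for a
vld2_lane of 4-element vectors, lane 3:)

  #include <cstdio>

  int main() {
    const unsigned NumVecs = 2, NUnits = 4, LaneVal = 3;  // assumed example
    for (unsigned n = 0; n != NumVecs; ++n)
      printf("mem[%u] -> flattened lane %u\n", n, LaneVal + n * NUnits);
    // prints: mem[0] -> flattened lane 3
    //         mem[1] -> flattened lane 7
    return 0;
  }
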
+  case NEON_BUILTIN_vld1_dup:
+  case NEON_BUILTIN_vld2_dup:
+  case NEON_BUILTIN_vld3_dup:
+  case NEON_BUILTIN_vld4_dup: {
+    const VectorType *VTy =
+      GetVldstType(exp, insn_data[icode].operand[0].mode);
+    unsigned NumVecs;
+    switch (neon_code) {
+    case NEON_BUILTIN_vld1_dup: NumVecs = 1; break;
+    case NEON_BUILTIN_vld2_dup: NumVecs = 2; break;
+    case NEON_BUILTIN_vld3_dup: NumVecs = 3; break;
+    case NEON_BUILTIN_vld4_dup: NumVecs = 4; break;
+    default: assert(false);
+    }
+    unsigned NUnits = VTy->getNumElements() / NumVecs;
+    Result = Context.getUndef(VTy);
+    for (unsigned n = 0; n != NumVecs; ++n) {
+      Value *Addr = (n == 0) ? Ops[0] :
+        Builder.CreateGEP(Ops[0], Context.getConstantInt(Type::Int32Ty, n));
+      Value *Elt = Builder.CreateLoad(Addr);
+      // Insert the value into one lane of the result.
+      Value *Ndx = Context.getConstantInt(Type::Int32Ty, n * NUnits);
+      Result = Builder.CreateInsertElement(Result, Elt, Ndx);
+    }
+    // Use a shuffle to move the value into the other lanes of the vector.
+    if (NUnits > 1) {
+      std::vector<Constant*> Idxs;
+      for (unsigned n = 0; n != NumVecs; ++n) {
+        for (unsigned i = 0; i != NUnits; ++i)
+          Idxs.push_back(Context.getConstantInt(Type::Int32Ty, n * NUnits));
+      }
+      Result = Builder.CreateShuffleVector(Result, Context.getUndef(VTy),
+                                           Context.getConstantVector(Idxs));
+    }
+    Result = BitCastToType(Result, ResultType);
+    break;
+  }
+
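(The splat mask the dup case builds: every lane of subvector n reads the
single inserted element at index n * NUnits. A sketch with assumed sizes
for a vld2_dup of 4-element vectors:)

  #include <cstdio>

  int main() {
    const unsigned NumVecs = 2, NUnits = 4;  // assumed example sizes
    for (unsigned n = 0; n != NumVecs; ++n)
      for (unsigned i = 0; i != NUnits; ++i)
        printf("%u ", n * NUnits);
    printf("\n");  // prints: 0 0 0 0 4 4 4 4
    return 0;
  }
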
+  case NEON_BUILTIN_vst1:
+  case NEON_BUILTIN_vst2:
+  case NEON_BUILTIN_vst3:
+  case NEON_BUILTIN_vst4: {
+    const VectorType *VTy =
+      GetVldstType(exp, insn_data[icode].operand[1].mode);
+    if (VTy->getElementType()->isFloatingPoint())
+      intID = Intrinsic::arm_neon_vstf;
+    else
+      intID = Intrinsic::arm_neon_vsti;
+    intFn = Intrinsic::getDeclaration(TheModule, intID, (const Type **)&VTy, 1);
+
+    unsigned N = 0;
+    switch (neon_code) {
+    case NEON_BUILTIN_vst1: N = 1; break;
+    case NEON_BUILTIN_vst2: N = 2; break;
+    case NEON_BUILTIN_vst3: N = 3; break;
+    case NEON_BUILTIN_vst4: N = 4; break;
+    default: assert(false);
+    }
+
+    Type *VPTy = Context.getPointerTypeUnqual(Type::Int8Ty);
+    Value *Vec = BitCastToType(Ops[1], VTy);
+    Builder.CreateCall3(intFn, BitCastToType(Ops[0], VPTy), Vec,
+                        Context.getConstantInt(Type::Int32Ty, N));
+    Result = 0;
+    break;
+  }
+
+  case NEON_BUILTIN_vst1_lane:
+  case NEON_BUILTIN_vst2_lane:
+  case NEON_BUILTIN_vst3_lane:
+  case NEON_BUILTIN_vst4_lane: {
+    const VectorType *VTy =
+      GetVldstType(exp, insn_data[icode].operand[1].mode);
+    unsigned LaneVal, NumVecs;
+    switch (neon_code) {
+    case NEON_BUILTIN_vst1_lane: NumVecs = 1; break;
+    case NEON_BUILTIN_vst2_lane: NumVecs = 2; break;
+    case NEON_BUILTIN_vst3_lane: NumVecs = 3; break;
+    case NEON_BUILTIN_vst4_lane: NumVecs = 4; break;
+    default: assert(false);
+    }
+    unsigned NUnits = VTy->getNumElements() / NumVecs;
+    if (!isValidLane(Ops[2], NUnits, &LaneVal))
+      return UnexpectedError("%Hinvalid lane number", exp, Result);
+    Value *Vec = BitCastToType(Ops[1], VTy);
+    for (unsigned n = 0; n != NumVecs; ++n) {
+      Value *Addr = (n == 0) ? Ops[0] :
+        Builder.CreateGEP(Ops[0], Context.getConstantInt(Type::Int32Ty, n));
+      Value *Ndx = Context.getConstantInt(Type::Int32Ty,
+                                          LaneVal + (n * NUnits));
+      Builder.CreateStore(Builder.CreateExtractElement(Vec, Ndx), Addr);
+    }
+    Result = 0;
+    break;
+  }
+
+  case NEON_BUILTIN_vand:
+    if (datatype != neon_datatype_signed &&
+        datatype != neon_datatype_unsigned)
+      return BadImmediateError(exp, Result);
+    Result = Builder.CreateAnd(Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vorr:
+    if (datatype != neon_datatype_signed &&
+        datatype != neon_datatype_unsigned)
+      return BadImmediateError(exp, Result);
+    Result = Builder.CreateOr(Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_veor:
+    if (datatype != neon_datatype_signed &&
+        datatype != neon_datatype_unsigned)
+      return BadImmediateError(exp, Result);
+    Result = Builder.CreateXor(Ops[0], Ops[1]);
+    break;
+
+  case NEON_BUILTIN_vbic:
+    if (datatype != neon_datatype_signed &&
+        datatype != neon_datatype_unsigned)
+      return BadImmediateError(exp, Result);
+    Result = Builder.CreateAnd(Ops[0], Builder.CreateNot(Ops[1]));
+    break;
+
+  case NEON_BUILTIN_vorn:
+    if (datatype != neon_datatype_signed &&
+        datatype != neon_datatype_unsigned)
+      return BadImmediateError(exp, Result);
+    Result = Builder.CreateOr(Ops[0], Builder.CreateNot(Ops[1]));
+    break;
+  }
+
+  return true;
+}
+
+/* LLVM LOCAL end (ENTIRE FILE!)  */

Modified: llvm-gcc-4.2/trunk/gcc/config/arm/neon-gen.ml
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/neon-gen.ml?rev=76819&r1=76818&r2=76819&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/neon-gen.ml (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/neon-gen.ml Wed Jul 22 18:35:22 2009
@@ -67,38 +67,46 @@
     0 -> Format.printf "@]@,@<0>}"
   | _ -> Format.printf "@]@,@<0>}@]"
 
+(* LLVM LOCAL begin Print macros instead of inline functions.
+   This is needed so that immediate arguments (e.g., lane numbers, shift
+   amounts, etc.) can be checked for validity.  GCC can check them after
+   inlining, but LLVM does inlining separately.  This is not ideal for
+   error messages.  In the simple cases, llvm-gcc will use the GCC builtin
+   names instead of the user-visible ARM intrinsic names.  In cases where
+   the macros convert arguments to/from scalars (where the GCC builtins
+   expect that for some reason), the error messages may not show any context
+   information at all.  This could all be avoided if the compiler recognized
+   the intrinsics directly. *)
 let print_function arity fnname body =
   let ffmt = start_function () in
-  Format.printf "__extension__ static __inline ";
-  let inl = "__attribute__ ((__always_inline__))" in
+  Format.printf "@[<v 2>#define ";
   begin match arity with
     Arity0 ret ->
-      Format.printf "%s %s@,%s (void)" (string_of_vectype ret) inl fnname
+      Format.printf "%s()" fnname
   | Arity1 (ret, arg0) ->
-      Format.printf "%s %s@,%s (%s __a)" (string_of_vectype ret) inl fnname
-                                        (string_of_vectype arg0)
+      Format.printf "%s(__a)" fnname
   | Arity2 (ret, arg0, arg1) ->
-      Format.printf "%s %s@,%s (%s __a, %s __b)"
-        (string_of_vectype ret) inl fnname (string_of_vectype arg0)
-	(string_of_vectype arg1)
+      Format.printf "%s(__a, __b)" fnname
   | Arity3 (ret, arg0, arg1, arg2) ->
-      Format.printf "%s %s@,%s (%s __a, %s __b, %s __c)"
-        (string_of_vectype ret) inl fnname (string_of_vectype arg0)
-	(string_of_vectype arg1) (string_of_vectype arg2)
+      Format.printf "%s(__a, __b, __c)" fnname
   | Arity4 (ret, arg0, arg1, arg2, arg3) ->
-      Format.printf "%s %s@,%s (%s __a, %s __b, %s __c, %s __d)"
-        (string_of_vectype ret) inl fnname (string_of_vectype arg0)
-	(string_of_vectype arg1) (string_of_vectype arg2)
-        (string_of_vectype arg3)
+      Format.printf "%s(__a, __b, __c, __d)" fnname
   end;
-  open_braceblock ffmt;
+  Format.printf " \\@,";
   let rec print_lines = function
     [] -> ()
+  | [line] -> Format.printf "%s \\" line
+  | line::lines -> Format.printf "%s \\@," line; print_lines lines in
+  let print_macro_body = function
+    [] -> ()
   | [line] -> Format.printf "%s" line
-  | line::lines -> Format.printf "%s@," line; print_lines lines in
-  print_lines body;
-  close_braceblock ffmt;
+  | line::lines -> Format.printf "@[<v 3>({ \\@,%s \\@," line;
+                   print_lines lines;
+                   Format.printf "@]@, })" in
+  print_macro_body body;
+  Format.printf "@]";
   end_function ffmt
+(* LLVM LOCAL end Print macros instead of inline functions.  *)
 
 let return_by_ptr features = List.mem ReturnPtr features
 
@@ -142,6 +150,7 @@
 
 (* Return a tuple of a list of declarations to go at the start of the function,
    and a list of statements needed to return THING.  *)
+(* LLVM LOCAL begin Remove "return" keywords since these are now macros.  *)
 let return arity return_by_ptr thing =
   match arity with
     Arity0 (ret) | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
@@ -151,13 +160,14 @@
         if return_by_ptr then
           let sname = string_of_vectype ret in
           [Printf.sprintf "%s __rv;" sname],
-          [thing ^ ";"; "return __rv;"]
+          [thing ^ ";"; "__rv;"]
         else
           let uname = union_string num vec "__rv" in
-          [uname ^ ";"], ["__rv.__o = " ^ thing ^ ";"; "return __rv.__i;"]
+          [uname ^ ";"], ["__rv.__o = " ^ thing ^ ";"; "__rv.__i;"]
     | T_void -> [], [thing ^ ";"]
     | _ ->
-        [], ["return " ^ (cast_for_return ret) ^ thing ^ ";"]
+        [], [(cast_for_return ret) ^ thing ^ ";"]
+(* LLVM LOCAL end Remove "return" keywords since these are now macros.  *)
 
 let rec element_type ctype =
   match ctype with
@@ -173,7 +183,8 @@
         let decl = Printf.sprintf "%s = { %s };" uname p in
         pdecls := decl :: !pdecls;
         p ^ "u.__o"
-    | _ -> add_cast t p in
+    (* LLVM LOCAL Omit casts so we get better error messages.  *)
+    | _ -> (* add_cast t *) p in
   let plist = match ps with
     Arity0 _ -> []
   | Arity1 (_, t1) -> [ptype t1 "__a"]
@@ -367,6 +378,8 @@
 
 let _ =
   print_lines [
+"/* LLVM LOCAL file Changed to use preprocessor macros.  */";
+"/* APPLE LOCAL file v7 support. Merge from Codesourcery */";
 "/* ARM NEON intrinsics include file. This file is generated automatically";
 "   using neon-gen.ml.  Please do not edit manually.";
 "";

Modified: llvm-gcc-4.2/trunk/gcc/config/darwin.h
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/darwin.h?rev=76819&r1=76818&r2=76819&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/darwin.h (original)
+++ llvm-gcc-4.2/trunk/gcc/config/darwin.h Wed Jul 22 18:35:22 2009
@@ -1472,13 +1472,29 @@
     argvec.push_back ("--relocation-model=pic");     \
   else if (!MACHO_DYNAMIC_NO_PIC_P)                  \
     argvec.push_back ("--relocation-model=static")
-#else /* defined (TARGET_386) */
+#elif defined (TARGET_ARM)
+#define LLVM_SET_TARGET_OPTIONS(argvec)              \
+  if (flag_pic)                                      \
+    argvec.push_back ("--relocation-model=pic");     \
+  else if (!MACHO_DYNAMIC_NO_PIC_P)                  \
+    argvec.push_back ("--relocation-model=static");  \
+  if (darwin_iphoneos_version_min)                   \
+    {                                                \
+      const char *p = darwin_iphoneos_version_min;   \
+      if (ISDIGIT (*p) && (*p == '1' || *p == '2'))  \
+        {                                            \
+          ++p;                                       \
+          if (*p == '\0' || *p == '.')               \
+            argvec.push_back("--arm-reserve-r9");    \
+        }                                            \
+    }
+#else /* !TARGET_386 && !TARGET_ARM */
 #define LLVM_SET_TARGET_OPTIONS(argvec)              \
   if (flag_pic)                                      \
     argvec.push_back ("--relocation-model=pic");     \
   else if (!MACHO_DYNAMIC_NO_PIC_P)                  \
     argvec.push_back ("--relocation-model=static")
-#endif /* defined (TARGET_386) */
+#endif /* !TARGET_386 && !TARGET_ARM */
 
 /* On Darwin _Unwind_Resume is sensitive to the dynamic stack layout; we
    use _Unwind_Resume_or_Rethrow instead.  */
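
(The intent of the version-min check above, after the fix: reserve r9 only
for iPhone OS major version 1 or 2, i.e. the string starts with '1' or '2'
followed by end-of-string or a dot. A standalone sketch of the same parse:)

  #include <cstdio>
  #include <cctype>

  // Mirrors the LLVM_SET_TARGET_OPTIONS parse above (sketch only).
  static bool reserve_r9(const char *p) {
    if (!p || !isdigit((unsigned char)*p)) return false;
    if (*p != '1' && *p != '2') return false;
    ++p;
    return *p == '\0' || *p == '.';
  }

  int main() {
    // prints: 1 1 0 0 ("12" must not match as major version 1)
    printf("%d %d %d %d\n", reserve_r9("2.0"), reserve_r9("1"),
           reserve_r9("3.0"), reserve_r9("12"));
    return 0;
  }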

Modified: llvm-gcc-4.2/trunk/gcc/llvm-types.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/llvm-types.cpp?rev=76819&r1=76818&r2=76819&view=diff

==============================================================================
--- llvm-gcc-4.2/trunk/gcc/llvm-types.cpp (original)
+++ llvm-gcc-4.2/trunk/gcc/llvm-types.cpp Wed Jul 22 18:35:22 2009
@@ -726,9 +726,13 @@
     }
     // FALL THROUGH.
     type = orig_type;
-  case INTEGER_TYPE:
+  case INTEGER_TYPE: {
     if (const Type *Ty = GET_TYPE_LLVM(type)) return Ty;
-    return SET_TYPE_LLVM(type, Context.getIntegerType(TYPE_PRECISION(type)));
+    // The ARM port defines __builtin_neon_xi as a 511-bit type because GCC's
+    // type precision field has only 9 bits.  Treat this as a special case.
+    int precision = TYPE_PRECISION(type) == 511 ? 512 : TYPE_PRECISION(type);
+    return SET_TYPE_LLVM(type, Context.getIntegerType(precision));
+  }
   case REAL_TYPE:
     if (const Type *Ty = GET_TYPE_LLVM(type)) return Ty;
     switch (TYPE_PRECISION(type)) {
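
(The mapping the special case implements: GCC's 9-bit precision field
saturates at 511, so 511 is read back as the intended 512-bit width of
__builtin_neon_xi. A one-line sketch:)

  #include <cassert>

  // 511 is the saturated 9-bit encoding of the 512-bit NEON type.
  static unsigned llvmIntWidth(unsigned gccPrecision) {
    return gccPrecision == 511 ? 512 : gccPrecision;
  }

  int main() {
    assert(llvmIntWidth(32) == 32);
    assert(llvmIntWidth(511) == 512);  // __builtin_neon_xi
    return 0;
  }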
