[llvm-commits] [llvm-gcc-4.2] r76781 [2/5] - in /llvm-gcc-4.2/trunk: ./ gcc/ gcc/config/ gcc/config/arm/ gcc/config/rs6000/ gcc/cp/ gcc/doc/ gcc/testsuite/g++.apple/ gcc/testsuite/g++.dg/abi/ gcc/testsuite/gcc.apple/ gcc/testsuite/gcc.target/arm/ gcc/testsuite/gcc.target/arm/neon/ gcc/testsuite/obj-c++.dg/ gcc/testsuite/objc.dg/
Bob Wilson
bob.wilson at apple.com
Wed Jul 22 13:36:46 PDT 2009
Added: llvm-gcc-4.2/trunk/gcc/config/arm/arm_neon.h
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/arm_neon.h?rev=76781&view=auto
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/arm_neon.h (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/arm_neon.h Wed Jul 22 15:36:27 2009
@@ -0,0 +1,12180 @@
+/* APPLE LOCAL file v7 support. Merge from Codesourcery */
+/* ARM NEON intrinsics include file. This file is generated automatically
+ using neon-gen.ml. Please do not edit manually.
+
+ Copyright (C) 2006, 2007 Free Software Foundation, Inc.
+ Contributed by CodeSourcery.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published
+ by the Free Software Foundation; either version 2, or (at your
+ option) any later version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT
+ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+ License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING. If not, write to the
+ Free Software Foundation, 51 Franklin Street, Fifth Floor, Boston,
+ MA 02110-1301, USA. */
+
+/* As a special exception, if you include this header file into source
+ files compiled by GCC, this header file does not by itself cause
+ the resulting executable to be covered by the GNU General Public
+ License. This exception does not however invalidate any other
+ reasons why the executable file might be covered by the GNU General
+ Public License. */
+
+#ifndef _GCC_ARM_NEON_H
+#define _GCC_ARM_NEON_H 1
+
+#ifndef __ARM_NEON__
+#error You must enable NEON instructions (e.g. -mfloat-abi=softfp -mfpu=neon) to use arm_neon.h
+#else
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+
+typedef __builtin_neon_qi int8x8_t __attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_hi int16x4_t __attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_si int32x2_t __attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_di int64x1_t;
+typedef __builtin_neon_sf float32x2_t __attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_poly8 poly8x8_t __attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_poly16 poly16x4_t __attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_uqi uint8x8_t __attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_uhi uint16x4_t __attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_usi uint32x2_t __attribute__ ((__vector_size__ (8)));
+typedef __builtin_neon_udi uint64x1_t;
+typedef __builtin_neon_qi int8x16_t __attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_hi int16x8_t __attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_si int32x4_t __attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_di int64x2_t __attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_sf float32x4_t __attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_poly8 poly8x16_t __attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_poly16 poly16x8_t __attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_uqi uint8x16_t __attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_uhi uint16x8_t __attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_usi uint32x4_t __attribute__ ((__vector_size__ (16)));
+typedef __builtin_neon_udi uint64x2_t __attribute__ ((__vector_size__ (16)));
+
+typedef __builtin_neon_sf float32_t;
+typedef __builtin_neon_poly8 poly8_t;
+typedef __builtin_neon_poly16 poly16_t;
+
+typedef struct int8x8x2_t
+{
+ int8x8_t val[2];
+} int8x8x2_t;
+
+typedef struct int8x16x2_t
+{
+ int8x16_t val[2];
+} int8x16x2_t;
+
+typedef struct int16x4x2_t
+{
+ int16x4_t val[2];
+} int16x4x2_t;
+
+typedef struct int16x8x2_t
+{
+ int16x8_t val[2];
+} int16x8x2_t;
+
+typedef struct int32x2x2_t
+{
+ int32x2_t val[2];
+} int32x2x2_t;
+
+typedef struct int32x4x2_t
+{
+ int32x4_t val[2];
+} int32x4x2_t;
+
+typedef struct int64x1x2_t
+{
+ int64x1_t val[2];
+} int64x1x2_t;
+
+typedef struct int64x2x2_t
+{
+ int64x2_t val[2];
+} int64x2x2_t;
+
+typedef struct uint8x8x2_t
+{
+ uint8x8_t val[2];
+} uint8x8x2_t;
+
+typedef struct uint8x16x2_t
+{
+ uint8x16_t val[2];
+} uint8x16x2_t;
+
+typedef struct uint16x4x2_t
+{
+ uint16x4_t val[2];
+} uint16x4x2_t;
+
+typedef struct uint16x8x2_t
+{
+ uint16x8_t val[2];
+} uint16x8x2_t;
+
+typedef struct uint32x2x2_t
+{
+ uint32x2_t val[2];
+} uint32x2x2_t;
+
+typedef struct uint32x4x2_t
+{
+ uint32x4_t val[2];
+} uint32x4x2_t;
+
+typedef struct uint64x1x2_t
+{
+ uint64x1_t val[2];
+} uint64x1x2_t;
+
+typedef struct uint64x2x2_t
+{
+ uint64x2_t val[2];
+} uint64x2x2_t;
+
+typedef struct float32x2x2_t
+{
+ float32x2_t val[2];
+} float32x2x2_t;
+
+typedef struct float32x4x2_t
+{
+ float32x4_t val[2];
+} float32x4x2_t;
+
+typedef struct poly8x8x2_t
+{
+ poly8x8_t val[2];
+} poly8x8x2_t;
+
+typedef struct poly8x16x2_t
+{
+ poly8x16_t val[2];
+} poly8x16x2_t;
+
+typedef struct poly16x4x2_t
+{
+ poly16x4_t val[2];
+} poly16x4x2_t;
+
+typedef struct poly16x8x2_t
+{
+ poly16x8_t val[2];
+} poly16x8x2_t;
+
+typedef struct int8x8x3_t
+{
+ int8x8_t val[3];
+} int8x8x3_t;
+
+typedef struct int8x16x3_t
+{
+ int8x16_t val[3];
+} int8x16x3_t;
+
+typedef struct int16x4x3_t
+{
+ int16x4_t val[3];
+} int16x4x3_t;
+
+typedef struct int16x8x3_t
+{
+ int16x8_t val[3];
+} int16x8x3_t;
+
+typedef struct int32x2x3_t
+{
+ int32x2_t val[3];
+} int32x2x3_t;
+
+typedef struct int32x4x3_t
+{
+ int32x4_t val[3];
+} int32x4x3_t;
+
+typedef struct int64x1x3_t
+{
+ int64x1_t val[3];
+} int64x1x3_t;
+
+typedef struct int64x2x3_t
+{
+ int64x2_t val[3];
+} int64x2x3_t;
+
+typedef struct uint8x8x3_t
+{
+ uint8x8_t val[3];
+} uint8x8x3_t;
+
+typedef struct uint8x16x3_t
+{
+ uint8x16_t val[3];
+} uint8x16x3_t;
+
+typedef struct uint16x4x3_t
+{
+ uint16x4_t val[3];
+} uint16x4x3_t;
+
+typedef struct uint16x8x3_t
+{
+ uint16x8_t val[3];
+} uint16x8x3_t;
+
+typedef struct uint32x2x3_t
+{
+ uint32x2_t val[3];
+} uint32x2x3_t;
+
+typedef struct uint32x4x3_t
+{
+ uint32x4_t val[3];
+} uint32x4x3_t;
+
+typedef struct uint64x1x3_t
+{
+ uint64x1_t val[3];
+} uint64x1x3_t;
+
+typedef struct uint64x2x3_t
+{
+ uint64x2_t val[3];
+} uint64x2x3_t;
+
+typedef struct float32x2x3_t
+{
+ float32x2_t val[3];
+} float32x2x3_t;
+
+typedef struct float32x4x3_t
+{
+ float32x4_t val[3];
+} float32x4x3_t;
+
+typedef struct poly8x8x3_t
+{
+ poly8x8_t val[3];
+} poly8x8x3_t;
+
+typedef struct poly8x16x3_t
+{
+ poly8x16_t val[3];
+} poly8x16x3_t;
+
+typedef struct poly16x4x3_t
+{
+ poly16x4_t val[3];
+} poly16x4x3_t;
+
+typedef struct poly16x8x3_t
+{
+ poly16x8_t val[3];
+} poly16x8x3_t;
+
+typedef struct int8x8x4_t
+{
+ int8x8_t val[4];
+} int8x8x4_t;
+
+typedef struct int8x16x4_t
+{
+ int8x16_t val[4];
+} int8x16x4_t;
+
+typedef struct int16x4x4_t
+{
+ int16x4_t val[4];
+} int16x4x4_t;
+
+typedef struct int16x8x4_t
+{
+ int16x8_t val[4];
+} int16x8x4_t;
+
+typedef struct int32x2x4_t
+{
+ int32x2_t val[4];
+} int32x2x4_t;
+
+typedef struct int32x4x4_t
+{
+ int32x4_t val[4];
+} int32x4x4_t;
+
+typedef struct int64x1x4_t
+{
+ int64x1_t val[4];
+} int64x1x4_t;
+
+typedef struct int64x2x4_t
+{
+ int64x2_t val[4];
+} int64x2x4_t;
+
+typedef struct uint8x8x4_t
+{
+ uint8x8_t val[4];
+} uint8x8x4_t;
+
+typedef struct uint8x16x4_t
+{
+ uint8x16_t val[4];
+} uint8x16x4_t;
+
+typedef struct uint16x4x4_t
+{
+ uint16x4_t val[4];
+} uint16x4x4_t;
+
+typedef struct uint16x8x4_t
+{
+ uint16x8_t val[4];
+} uint16x8x4_t;
+
+typedef struct uint32x2x4_t
+{
+ uint32x2_t val[4];
+} uint32x2x4_t;
+
+typedef struct uint32x4x4_t
+{
+ uint32x4_t val[4];
+} uint32x4x4_t;
+
+typedef struct uint64x1x4_t
+{
+ uint64x1_t val[4];
+} uint64x1x4_t;
+
+typedef struct uint64x2x4_t
+{
+ uint64x2_t val[4];
+} uint64x2x4_t;
+
+typedef struct float32x2x4_t
+{
+ float32x2_t val[4];
+} float32x2x4_t;
+
+typedef struct float32x4x4_t
+{
+ float32x4_t val[4];
+} float32x4x4_t;
+
+typedef struct poly8x8x4_t
+{
+ poly8x8_t val[4];
+} poly8x8x4_t;
+
+typedef struct poly8x16x4_t
+{
+ poly8x16_t val[4];
+} poly8x16x4_t;
+
+typedef struct poly16x4x4_t
+{
+ poly16x4_t val[4];
+} poly16x4x4_t;
+
+typedef struct poly16x8x4_t
+{
+ poly16x8_t val[4];
+} poly16x8x4_t;
+
+
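A minimal usage sketch, not part of the patch, showing how the NxM struct types above are meant to be used: val[] holds the individual vectors that the structure load/store intrinsics defined later in this header move as a unit. The function name make_pair and the brace initializers for vector values are illustrative assumptions (GCC's vector-initializer syntax), not something the header itself defines; compile with the NEON flags from the #error message above.

#include <arm_neon.h>

float32x2x2_t make_pair (void)
{
  /* Assumes GCC's brace-initializer syntax for vector types.  */
  float32x2_t lo = { 1.0f, 2.0f };
  float32x2_t hi = { 3.0f, 4.0f };
  float32x2x2_t pair;
  pair.val[0] = lo;   /* val[] holds the individual 64-bit D vectors */
  pair.val[1] = hi;
  return pair;
}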
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vadd_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vaddv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vadd_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vaddv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vadd_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vaddv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vadd_s64 (int64x1_t __a, int64x1_t __b)
+{
+ return (int64x1_t)__builtin_neon_vadddi (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vadd_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (float32x2_t)__builtin_neon_vaddv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vadd_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vadd_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vadd_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vadd_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+ return (uint64x1_t)__builtin_neon_vadddi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vaddq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vaddv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vaddq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vaddv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vaddq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vaddv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vaddq_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vaddv2di (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vaddq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (float32x4_t)__builtin_neon_vaddv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vaddq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vaddq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vaddq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vaddv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vaddq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vaddv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
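A minimal sketch, not part of the patch, of the plain vadd/vaddq intrinsics above: lane-wise adds on 64-bit (D) and 128-bit (Q) vectors. The brace initializers and the memcpy used to read lanes back out are illustrative assumptions; they avoid relying on intrinsics that only appear later in the header.

#include <arm_neon.h>
#include <stdio.h>
#include <string.h>

void demo_vaddq (void)
{
  int32x4_t a = { 1, 2, 3, 4 };          /* assumes GCC vector initializers */
  int32x4_t b = { 10, 20, 30, 40 };
  int32x4_t c = vaddq_s32 (a, b);        /* lane-wise 32-bit add on a Q vector */

  int32_t out[4];
  memcpy (out, &c, sizeof out);          /* copy lanes out without using vst1q */
  printf ("%d %d %d %d\n",               /* prints: 11 22 33 44 */
          (int) out[0], (int) out[1], (int) out[2], (int) out[3]);
}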
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vaddl_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vaddlv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vaddl_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vaddlv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vaddl_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vaddlv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vaddl_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vaddlv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vaddl_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vaddlv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vaddl_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vaddlv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vaddw_s8 (int16x8_t __a, int8x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vaddwv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vaddw_s16 (int32x4_t __a, int16x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vaddwv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vaddw_s32 (int64x2_t __a, int32x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vaddwv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vaddw_u8 (uint16x8_t __a, uint8x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vaddwv8qi ((int16x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vaddw_u16 (uint32x4_t __a, uint16x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vaddwv4hi ((int32x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vaddw_u32 (uint64x2_t __a, uint32x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vaddwv2si ((int64x2_t) __a, (int32x2_t) __b, 0);
+}
+
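The vaddl ("add long") and vaddw ("add wide") intrinsics above widen their operands, so narrow inputs can be summed without overflow. A hedged sketch, not part of the patch (widen_sum is an illustrative name):

#include <arm_neon.h>

/* Sum three uint8x8_t inputs into 16-bit lanes: widen-add the first pair,
   then widen-accumulate the third vector onto the 16-bit running sum.  */
uint16x8_t widen_sum (uint8x8_t a, uint8x8_t b, uint8x8_t c)
{
  uint16x8_t s = vaddl_u8 (a, b);   /* 8-bit + 8-bit -> 16-bit lanes */
  return vaddw_u8 (s, c);           /* 16-bit + widened 8-bit lanes */
}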
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vhadd_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vhaddv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vhadd_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vhaddv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vhadd_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vhaddv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vhadd_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vhaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vhadd_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vhaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vhadd_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vhaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vhaddq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vhaddv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vhaddq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vhaddv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vhaddq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vhaddv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vhaddq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vhaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vhaddq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vhaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vhaddq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vhaddv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrhadd_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vhaddv8qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrhadd_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vhaddv4hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrhadd_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vhaddv2si (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrhadd_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vhaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrhadd_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vhaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrhadd_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vhaddv2si ((int32x2_t) __a, (int32x2_t) __b, 2);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrhaddq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vhaddv16qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrhaddq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vhaddv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vrhaddq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vhaddv4si (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrhaddq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vhaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 2);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrhaddq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vhaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 2);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrhaddq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vhaddv4si ((int32x4_t) __a, (int32x4_t) __b, 2);
+}
+
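vhadd/vrhadd compute a halving add, (a + b) >> 1, on the full-width intermediate sum, so averaging two lanes can never overflow; the vr variants round instead of truncating. A sketch, not part of the patch (average_pixels is an illustrative name):

#include <arm_neon.h>

uint8x16_t average_pixels (uint8x16_t a, uint8x16_t b)
{
  return vrhaddq_u8 (a, b);   /* roughly (a + b + 1) >> 1 per lane, no overflow */
}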
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqadd_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vqaddv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqadd_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vqaddv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqadd_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vqaddv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vqadd_s64 (int64x1_t __a, int64x1_t __b)
+{
+ return (int64x1_t)__builtin_neon_vqadddi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqadd_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vqaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqadd_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vqaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqadd_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vqaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vqadd_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+ return (uint64x1_t)__builtin_neon_vqadddi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqaddq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vqaddv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqaddq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vqaddv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqaddq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vqaddv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqaddq_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vqaddv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqaddq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vqaddv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vqaddq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vqaddv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vqaddq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vqaddv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vqaddq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vqaddv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
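vqadd is the saturating add: results clamp at the lane type's limits rather than wrapping, so 200 + 100 in a uint8 lane yields 255, not 44. A sketch, not part of the patch (brighten is an illustrative name):

#include <arm_neon.h>

uint8x8_t brighten (uint8x8_t pixels, uint8x8_t delta)
{
  return vqadd_u8 (pixels, delta);   /* per-lane add, clamped to 0..255 */
}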
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vaddhn_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vaddhnv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vaddhn_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vaddhnv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vaddhn_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vaddhnv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vaddhn_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vaddhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vaddhn_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vaddhnv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vaddhn_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vaddhnv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vraddhn_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vaddhnv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vraddhn_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vaddhnv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vraddhn_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vaddhnv2di (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vraddhn_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vaddhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vraddhn_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vaddhnv4si ((int32x4_t) __a, (int32x4_t) __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vraddhn_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vaddhnv2di ((int64x2_t) __a, (int64x2_t) __b, 2);
+}
+
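vaddhn/vraddhn add two wide vectors and narrow the result by keeping only the most significant half of each lane sum (the addition is performed modulo the lane width); vraddhn rounds before narrowing. A sketch, not part of the patch:

#include <arm_neon.h>

uint8x8_t add_high_halves (uint16x8_t a, uint16x8_t b)
{
  /* Roughly (uint8_t)(((uint16_t)(a + b)) >> 8) in each lane.  */
  return vaddhn_u16 (a, b);
}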
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmul_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vmulv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vmulv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vmulv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (float32x2_t)__builtin_neon_vmulv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmul_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vmulv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vmulv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vmulv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vmul_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+ return (poly8x8_t)__builtin_neon_vmulv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vmulq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vmulv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vmulv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vmulv4si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (float32x4_t)__builtin_neon_vmulv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vmulq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vmulv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vmulv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vmulv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vmulq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+ return (poly8x16_t)__builtin_neon_vmulv16qi ((int8x16_t) __a, (int8x16_t) __b, 4);
+}
+
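vmul covers ordinary lane-wise multiplies plus a poly8 variant that multiplies lanes as polynomials over GF(2), a carry-less multiply truncated to the low byte of each product. A sketch, not part of the patch, with illustrative function names:

#include <arm_neon.h>

float32x2_t scale2 (float32x2_t v, float32x2_t s)
{
  return vmul_f32 (v, s);      /* ordinary float multiply per lane */
}

poly8x8_t clmul_low8 (poly8x8_t a, poly8x8_t b)
{
  return vmul_p8 (a, b);       /* carry-less multiply, low 8 bits of each product */
}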
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqdmulh_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vqdmulhv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqdmulh_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vqdmulhv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqdmulhq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vqdmulhv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmulhq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vqdmulhv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqrdmulh_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vqdmulhv4hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqrdmulh_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vqdmulhv2si (__a, __b, 3);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqrdmulhq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vqdmulhv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqrdmulhq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vqdmulhv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmull_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vmullv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vmullv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vmullv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmull_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vmullv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vmullv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vmullv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmull_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+ return (poly16x8_t)__builtin_neon_vmullv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmull_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vqdmullv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmull_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vqdmullv2si (__a, __b, 1);
+}
+
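vqdmulh/vqrdmulh return the saturated high half of the doubled product, which is the usual one-instruction Q15/Q31 fixed-point multiply (the vqr form rounds); vmull/vqdmull instead produce full-width products with no precision loss. A sketch, not part of the patch, with illustrative names:

#include <arm_neon.h>

int16x4_t q15_mul (int16x4_t a, int16x4_t b)
{
  /* Saturated, rounded high half of 2*a*b, i.e. a Q15 multiply.  */
  return vqrdmulh_s16 (a, b);
}

int32x4_t full_products (int16x4_t a, int16x4_t b)
{
  return vmull_s16 (a, b);   /* full 32-bit product of each 16-bit lane pair */
}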
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmla_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+ return (int8x8_t)__builtin_neon_vmlav8qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmla_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+ return (int16x4_t)__builtin_neon_vmlav4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmla_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+ return (int32x2_t)__builtin_neon_vmlav2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmla_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+ return (float32x2_t)__builtin_neon_vmlav2sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmla_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+ return (uint8x8_t)__builtin_neon_vmlav8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmla_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
+{
+ return (uint16x4_t)__builtin_neon_vmlav4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmla_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
+{
+ return (uint32x2_t)__builtin_neon_vmlav2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vmlaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
+{
+ return (int8x16_t)__builtin_neon_vmlav16qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+ return (int16x8_t)__builtin_neon_vmlav8hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+ return (int32x4_t)__builtin_neon_vmlav4si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlaq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+ return (float32x4_t)__builtin_neon_vmlav4sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vmlaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
+{
+ return (uint8x16_t)__builtin_neon_vmlav16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
+{
+ return (uint16x8_t)__builtin_neon_vmlav8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
+{
+ return (uint32x4_t)__builtin_neon_vmlav4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+ return (int16x8_t)__builtin_neon_vmlalv8qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+ return (int32x4_t)__builtin_neon_vmlalv4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+ return (int64x2_t)__builtin_neon_vmlalv2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+ return (uint16x8_t)__builtin_neon_vmlalv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
+{
+ return (uint32x4_t)__builtin_neon_vmlalv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmlal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
+{
+ return (uint64x2_t)__builtin_neon_vmlalv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmlal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+ return (int32x4_t)__builtin_neon_vqdmlalv4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmlal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+ return (int64x2_t)__builtin_neon_vqdmlalv2si (__a, __b, __c, 1);
+}
+
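vmla/vmlal fuse a multiply with an accumulate, computing a + b * c per lane; the long forms widen the products first, so narrow inputs can be accumulated into wider lanes without immediate overflow. A sketch, not part of the patch (mac_widen and axpy are illustrative names):

#include <arm_neon.h>

uint16x8_t mac_widen (uint16x8_t acc, uint8x8_t b, uint8x8_t c)
{
  return vmlal_u8 (acc, b, c);    /* acc + widened(b) * widened(c) per lane */
}

float32x4_t axpy (float32x4_t y, float32x4_t a, float32x4_t x)
{
  return vmlaq_f32 (y, a, x);     /* y + a * x, lane-wise */
}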
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmls_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+ return (int8x8_t)__builtin_neon_vmlsv8qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmls_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+ return (int16x4_t)__builtin_neon_vmlsv4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmls_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+ return (int32x2_t)__builtin_neon_vmlsv2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmls_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+ return (float32x2_t)__builtin_neon_vmlsv2sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmls_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+ return (uint8x8_t)__builtin_neon_vmlsv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmls_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
+{
+ return (uint16x4_t)__builtin_neon_vmlsv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmls_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
+{
+ return (uint32x2_t)__builtin_neon_vmlsv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vmlsq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
+{
+ return (int8x16_t)__builtin_neon_vmlsv16qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlsq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+ return (int16x8_t)__builtin_neon_vmlsv8hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+ return (int32x4_t)__builtin_neon_vmlsv4si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlsq_f32 (float32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+ return (float32x4_t)__builtin_neon_vmlsv4sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vmlsq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
+{
+ return (uint8x16_t)__builtin_neon_vmlsv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlsq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
+{
+ return (uint16x8_t)__builtin_neon_vmlsv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
+{
+ return (uint32x4_t)__builtin_neon_vmlsv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlsl_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+ return (int16x8_t)__builtin_neon_vmlslv8qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+ return (int32x4_t)__builtin_neon_vmlslv4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+ return (int64x2_t)__builtin_neon_vmlslv2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlsl_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+ return (uint16x8_t)__builtin_neon_vmlslv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsl_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
+{
+ return (uint32x4_t)__builtin_neon_vmlslv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmlsl_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
+{
+ return (uint64x2_t)__builtin_neon_vmlslv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmlsl_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+ return (int32x4_t)__builtin_neon_vqdmlslv4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmlsl_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+ return (int64x2_t)__builtin_neon_vqdmlslv2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vsub_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vsubv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vsub_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vsubv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vsub_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vsubv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vsub_s64 (int64x1_t __a, int64x1_t __b)
+{
+ return (int64x1_t)__builtin_neon_vsubdi (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vsub_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (float32x2_t)__builtin_neon_vsubv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vsub_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vsubv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vsub_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vsubv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vsub_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vsubv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vsub_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+ return (uint64x1_t)__builtin_neon_vsubdi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vsubq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vsubv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsubq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vsubv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsubq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vsubv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vsubq_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vsubv2di (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vsubq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (float32x4_t)__builtin_neon_vsubv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vsubq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vsubv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsubq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vsubv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsubq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vsubv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vsubq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vsubv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsubl_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vsublv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsubl_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vsublv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vsubl_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vsublv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsubl_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vsublv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsubl_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vsublv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vsubl_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vsublv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsubw_s8 (int16x8_t __a, int8x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vsubwv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsubw_s16 (int32x4_t __a, int16x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vsubwv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vsubw_s32 (int64x2_t __a, int32x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vsubwv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsubw_u8 (uint16x8_t __a, uint8x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vsubwv8qi ((int16x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsubw_u16 (uint32x4_t __a, uint16x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vsubwv4hi ((int32x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vsubw_u32 (uint64x2_t __a, uint32x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vsubwv2si ((int64x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vhsub_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vhsubv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vhsub_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vhsubv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vhsub_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vhsubv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vhsub_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vhsubv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vhsub_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vhsubv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vhsub_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vhsubv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vhsubq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vhsubv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vhsubq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vhsubv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vhsubq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vhsubv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vhsubq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vhsubv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vhsubq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vhsubv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vhsubq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vhsubv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqsub_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vqsubv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqsub_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vqsubv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqsub_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vqsubv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vqsub_s64 (int64x1_t __a, int64x1_t __b)
+{
+ return (int64x1_t)__builtin_neon_vqsubdi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqsub_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vqsubv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqsub_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vqsubv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqsub_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vqsubv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vqsub_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+ return (uint64x1_t)__builtin_neon_vqsubdi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqsubq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vqsubv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqsubq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vqsubv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqsubq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vqsubv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqsubq_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vqsubv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqsubq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vqsubv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vqsubq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vqsubv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vqsubq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vqsubv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vqsubq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vqsubv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
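vqsub is the saturating counterpart of vsub; for unsigned types it clamps at zero, giving a branch-free max(a - b, 0) per lane. A sketch, not part of the patch (darken is an illustrative name):

#include <arm_neon.h>

uint8x8_t darken (uint8x8_t pixels, uint8x8_t delta)
{
  return vqsub_u8 (pixels, delta);   /* per-lane subtract, clamped at 0 */
}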
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vsubhn_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vsubhnv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vsubhn_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vsubhnv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vsubhn_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vsubhnv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vsubhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vsubhnv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vsubhnv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrsubhn_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vsubhnv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrsubhn_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vsubhnv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrsubhn_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vsubhnv2di (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrsubhn_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vsubhnv8hi ((int16x8_t) __a, (int16x8_t) __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrsubhn_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vsubhnv4si ((int32x4_t) __a, (int32x4_t) __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrsubhn_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vsubhnv2di ((int64x2_t) __a, (int64x2_t) __b, 2);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vceq_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vceqv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vceq_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vceqv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vceq_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vceqv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vceq_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vceqv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vceq_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vceq_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vceqv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vceq_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vceqv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vceq_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vceqv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vceqq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vceqv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vceqq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vceqv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vceqq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vceqv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vceqq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vceqv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vceqq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vceqq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vceqv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vceqq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vceqv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vceqq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vceqv16qi ((int8x16_t) __a, (int8x16_t) __b, 4);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcge_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vcgev8qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcge_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vcgev4hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcge_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcgev2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcge_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcgev2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcge_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vcgev8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcge_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vcgev4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcge_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcgev2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcgeq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vcgev16qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgeq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcgev8hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgeq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcgev4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgeq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcgev4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcgeq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vcgev16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgeq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcgev8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgeq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcgev4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcle_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vcgev8qi (__b, __a, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcle_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vcgev4hi (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcle_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcgev2si (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcle_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcgev2sf (__b, __a, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcle_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vcgev8qi ((int8x8_t) __b, (int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcle_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vcgev4hi ((int16x4_t) __b, (int16x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcle_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcgev2si ((int32x2_t) __b, (int32x2_t) __a, 0);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcleq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vcgev16qi (__b, __a, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcleq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcgev8hi (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcleq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcgev4si (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcleq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcgev4sf (__b, __a, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcleq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vcgev16qi ((int8x16_t) __b, (int8x16_t) __a, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcleq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcgev8hi ((int16x8_t) __b, (int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcleq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcgev4si ((int32x4_t) __b, (int32x4_t) __a, 0);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcgt_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vcgtv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcgt_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vcgtv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcgt_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcgtv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcgt_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcgtv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcgt_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vcgtv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcgt_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vcgtv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcgt_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcgtv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcgtq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vcgtv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgtq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcgtv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgtq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcgtv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgtq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcgtv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcgtq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vcgtv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcgtq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcgtv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcgtq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcgtv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vclt_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vcgtv8qi (__b, __a, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vclt_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vcgtv4hi (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vclt_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcgtv2si (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vclt_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcgtv2sf (__b, __a, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vclt_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vcgtv8qi ((int8x8_t) __b, (int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vclt_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vcgtv4hi ((int16x4_t) __b, (int16x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vclt_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcgtv2si ((int32x2_t) __b, (int32x2_t) __a, 0);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcltq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vcgtv16qi (__b, __a, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcltq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcgtv8hi (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcltq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcgtv4si (__b, __a, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcltq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcgtv4sf (__b, __a, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcltq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vcgtv16qi ((int8x16_t) __b, (int8x16_t) __a, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcltq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcgtv8hi ((int16x8_t) __b, (int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcltq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcgtv4si ((int32x4_t) __b, (int32x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcage_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcagev2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcageq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcagev4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcale_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcagev2sf (__b, __a, 5);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcaleq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcagev4sf (__b, __a, 5);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcagt_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcagtv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcagtq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcagtv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcalt_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vcagtv2sf (__b, __a, 5);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcaltq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcagtv4sf (__b, __a, 5);
+}
+
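All of the comparison intrinsics above return per-lane masks (all ones where the predicate holds, all zeros otherwise); note that the less-than forms simply reuse the greater-than builtins with their operands swapped, and vcage/vcalt compare absolute values. A small illustrative sketch of caller code (the helper name is made up for the example):

/* Mask of lanes where |x| < |y|; usable with the bit-select and logical
   intrinsics defined elsewhere in this header to pick or zero lanes.  */
uint32x2_t
abs_less_mask (float32x2_t x, float32x2_t y)
{
  return vcalt_f32 (x, y);
}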
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtst_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vtstv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vtst_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vtstv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vtst_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vtstv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtst_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vtst_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vtstv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vtst_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vtstv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtst_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vtstv8qi ((int8x8_t) __a, (int8x8_t) __b, 4);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vtstq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vtstv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vtstq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vtstv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vtstq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vtstv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vtstq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vtstq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vtstv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vtstq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vtstv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vtstq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vtstv16qi ((int8x16_t) __a, (int8x16_t) __b, 4);
+}
+
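vtst ANDs the two operands and sets each result lane to all ones if any bit survives, which makes it the usual per-lane flag test. A brief sketch (vdup_n_u8 is defined elsewhere in this header; the flag value is just for illustration):

/* All ones in each lane where (flags & 0x80) is nonzero. */
uint8x8_t
high_bit_set (uint8x8_t flags)
{
  uint8x8_t mask = vdup_n_u8 (0x80);
  return vtst_u8 (flags, mask);
}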
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vabd_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vabdv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vabd_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vabdv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vabd_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vabdv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vabd_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (float32x2_t)__builtin_neon_vabdv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vabd_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vabdv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vabd_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vabdv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vabd_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vabdv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vabdq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vabdv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vabdq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vabdv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vabdq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vabdv4si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vabdq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (float32x4_t)__builtin_neon_vabdv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vabdq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vabdv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vabdq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vabdv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vabdq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vabdv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vabdl_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vabdlv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vabdl_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vabdlv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vabdl_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vabdlv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vabdl_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vabdlv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vabdl_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vabdlv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vabdl_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vabdlv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vaba_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+ return (int8x8_t)__builtin_neon_vabav8qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vaba_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+ return (int16x4_t)__builtin_neon_vabav4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vaba_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+ return (int32x2_t)__builtin_neon_vabav2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vaba_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+ return (uint8x8_t)__builtin_neon_vabav8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vaba_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
+{
+ return (uint16x4_t)__builtin_neon_vabav4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vaba_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
+{
+ return (uint32x2_t)__builtin_neon_vabav2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vabaq_s8 (int8x16_t __a, int8x16_t __b, int8x16_t __c)
+{
+ return (int8x16_t)__builtin_neon_vabav16qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vabaq_s16 (int16x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+ return (int16x8_t)__builtin_neon_vabav8hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vabaq_s32 (int32x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+ return (int32x4_t)__builtin_neon_vabav4si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vabaq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
+{
+ return (uint8x16_t)__builtin_neon_vabav16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vabaq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
+{
+ return (uint16x8_t)__builtin_neon_vabav8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vabaq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
+{
+ return (uint32x4_t)__builtin_neon_vabav4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vabal_s8 (int16x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+ return (int16x8_t)__builtin_neon_vabalv8qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vabal_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+ return (int32x4_t)__builtin_neon_vabalv4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vabal_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+ return (int64x2_t)__builtin_neon_vabalv2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vabal_u8 (uint16x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+ return (uint16x8_t)__builtin_neon_vabalv8qi ((int16x8_t) __a, (int8x8_t) __b, (int8x8_t) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vabal_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c)
+{
+ return (uint32x4_t)__builtin_neon_vabalv4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vabal_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c)
+{
+ return (uint64x2_t)__builtin_neon_vabalv2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, 0);
+}
+
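Taken together, vabd (absolute difference), vabdl (widening) and vaba/vabal (accumulating) give the familiar sum-of-absolute-differences building block. A sketch under the assumption that the caller processes 8-byte rows (the function name is illustrative only):

/* Accumulate |a[i] - b[i]| for one 8-byte row into 16-bit lanes. */
uint16x8_t
sad_row (uint16x8_t acc, uint8x8_t a, uint8x8_t b)
{
  return vabal_u8 (acc, a, b);
}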
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmax_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vmaxv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmax_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vmaxv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmax_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vmaxv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmax_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (float32x2_t)__builtin_neon_vmaxv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmax_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vmaxv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmax_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vmaxv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmax_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vmaxv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vmaxq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vmaxv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmaxq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vmaxv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmaxq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vmaxv4si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmaxq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (float32x4_t)__builtin_neon_vmaxv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vmaxq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vmaxv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmaxq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vmaxv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmaxq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vmaxv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmin_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vminv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmin_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vminv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmin_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vminv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmin_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (float32x2_t)__builtin_neon_vminv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmin_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vminv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmin_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vminv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmin_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vminv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vminq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vminv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vminq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vminv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vminq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vminv4si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vminq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (float32x4_t)__builtin_neon_vminv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vminq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vminv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vminq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vminv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vminq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vminv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
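vmax/vmin compose directly into a per-lane clamp, which is probably their most common use. A minimal sketch:

/* Clamp each lane of x to the range [lo, hi]. */
float32x2_t
clamp_f32 (float32x2_t x, float32x2_t lo, float32x2_t hi)
{
  return vmin_f32 (vmax_f32 (x, lo), hi);
}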
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vpadd_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vpaddv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vpadd_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vpaddv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vpadd_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vpaddv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vpadd_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (float32x2_t)__builtin_neon_vpaddv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vpadd_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vpaddv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vpadd_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vpaddv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vpadd_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vpaddv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vpaddl_s8 (int8x8_t __a)
+{
+ return (int16x4_t)__builtin_neon_vpaddlv8qi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vpaddl_s16 (int16x4_t __a)
+{
+ return (int32x2_t)__builtin_neon_vpaddlv4hi (__a, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vpaddl_s32 (int32x2_t __a)
+{
+ return (int64x1_t)__builtin_neon_vpaddlv2si (__a, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vpaddl_u8 (uint8x8_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vpaddlv8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vpaddl_u16 (uint16x4_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vpaddlv4hi ((int16x4_t) __a, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vpaddl_u32 (uint32x2_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vpaddlv2si ((int32x2_t) __a, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vpaddlq_s8 (int8x16_t __a)
+{
+ return (int16x8_t)__builtin_neon_vpaddlv16qi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vpaddlq_s16 (int16x8_t __a)
+{
+ return (int32x4_t)__builtin_neon_vpaddlv8hi (__a, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vpaddlq_s32 (int32x4_t __a)
+{
+ return (int64x2_t)__builtin_neon_vpaddlv4si (__a, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vpaddlq_u8 (uint8x16_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vpaddlv16qi ((int8x16_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vpaddlq_u16 (uint16x8_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vpaddlv8hi ((int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vpaddlq_u32 (uint32x4_t __a)
+{
+ return (uint64x2_t)__builtin_neon_vpaddlv4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vpadal_s8 (int16x4_t __a, int8x8_t __b)
+{
+ return (int16x4_t)__builtin_neon_vpadalv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vpadal_s16 (int32x2_t __a, int16x4_t __b)
+{
+ return (int32x2_t)__builtin_neon_vpadalv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vpadal_s32 (int64x1_t __a, int32x2_t __b)
+{
+ return (int64x1_t)__builtin_neon_vpadalv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vpadal_u8 (uint16x4_t __a, uint8x8_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vpadalv8qi ((int16x4_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vpadal_u16 (uint32x2_t __a, uint16x4_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vpadalv4hi ((int32x2_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vpadal_u32 (uint64x1_t __a, uint32x2_t __b)
+{
+ return (uint64x1_t)__builtin_neon_vpadalv2si ((int64x1_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vpadalq_s8 (int16x8_t __a, int8x16_t __b)
+{
+ return (int16x8_t)__builtin_neon_vpadalv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vpadalq_s16 (int32x4_t __a, int16x8_t __b)
+{
+ return (int32x4_t)__builtin_neon_vpadalv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vpadalq_s32 (int64x2_t __a, int32x4_t __b)
+{
+ return (int64x2_t)__builtin_neon_vpadalv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vpadalq_u8 (uint16x8_t __a, uint8x16_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vpadalv16qi ((int16x8_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vpadalq_u16 (uint32x4_t __a, uint16x8_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vpadalv8hi ((int32x4_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vpadalq_u32 (uint64x2_t __a, uint32x4_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vpadalv4si ((int64x2_t) __a, (int32x4_t) __b, 0);
+}
+
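vpaddl widens and adds adjacent lane pairs, and vpadal does the same while accumulating into an existing wider vector, so long byte streams can be summed without overflowing. A small sketch of one accumulation step:

/* Add pairwise sums of eight 8-bit lanes into a 16-bit accumulator. */
uint16x4_t
accumulate_pairs (uint16x4_t acc, uint8x8_t bytes)
{
  return vpadal_u8 (acc, bytes);
}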
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vpmax_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vpmaxv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vpmax_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vpmaxv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vpmax_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vpmaxv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vpmax_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (float32x2_t)__builtin_neon_vpmaxv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vpmax_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vpmaxv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vpmax_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vpmaxv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vpmax_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vpmaxv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vpmin_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vpminv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vpmin_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vpminv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vpmin_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vpminv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vpmin_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (float32x2_t)__builtin_neon_vpminv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vpmin_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vpminv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vpmin_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vpminv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vpmin_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vpminv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
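Because vpmax/vpmin reduce adjacent pairs, applying them repeatedly to a vector and itself performs a horizontal reduction. A sketch of the usual pattern:

/* Horizontal maximum of eight signed bytes via repeated pairwise max. */
int8x8_t
horizontal_max_s8 (int8x8_t v)
{
  v = vpmax_s8 (v, v);   /* lanes 0-3 hold the four pairwise maxima   */
  v = vpmax_s8 (v, v);   /* lanes 0-1 hold the maxima of each quad    */
  v = vpmax_s8 (v, v);   /* lane 0 holds the maximum of all 8 inputs  */
  return v;
}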
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrecps_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (float32x2_t)__builtin_neon_vrecpsv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrecpsq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (float32x4_t)__builtin_neon_vrecpsv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrsqrts_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (float32x2_t)__builtin_neon_vrsqrtsv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrsqrtsq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ return (float32x4_t)__builtin_neon_vrsqrtsv4sf (__a, __b, 5);
+}
+
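vrecps and vrsqrts compute the Newton-Raphson step factors (2 - a*x and (3 - a*x*x)/2) used to refine the vrecpe/vrsqrte hardware estimates defined elsewhere in this header. A sketch of one refinement step for the reciprocal, assuming vrecpe_f32 and vmul_f32 from other parts of the header (two steps are typical for near single-precision accuracy):

/* Approximate 1/a: start from the hardware estimate, refine once. */
float32x2_t
reciprocal_approx (float32x2_t a)
{
  float32x2_t x = vrecpe_f32 (a);        /* initial estimate  */
  x = vmul_f32 (x, vrecps_f32 (a, x));   /* x *= (2 - a*x)    */
  return x;
}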
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vshl_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vshlv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vshl_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vshlv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vshl_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vshlv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vshl_s64 (int64x1_t __a, int64x1_t __b)
+{
+ return (int64x1_t)__builtin_neon_vshldi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vshl_u8 (uint8x8_t __a, int8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vshlv8qi ((int8x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vshl_u16 (uint16x4_t __a, int16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vshlv4hi ((int16x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vshl_u32 (uint32x2_t __a, int32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vshlv2si ((int32x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vshl_u64 (uint64x1_t __a, int64x1_t __b)
+{
+ return (uint64x1_t)__builtin_neon_vshldi ((int64x1_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vshlq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vshlv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vshlq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vshlv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vshlq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vshlv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vshlq_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vshlv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vshlq_u8 (uint8x16_t __a, int8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vshlv16qi ((int8x16_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vshlq_u16 (uint16x8_t __a, int16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vshlv8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vshlq_u32 (uint32x4_t __a, int32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vshlv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vshlq_u64 (uint64x2_t __a, int64x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vshlv2di ((int64x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrshl_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vshlv8qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrshl_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vshlv4hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrshl_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vshlv2si (__a, __b, 3);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vrshl_s64 (int64x1_t __a, int64x1_t __b)
+{
+ return (int64x1_t)__builtin_neon_vshldi (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrshl_u8 (uint8x8_t __a, int8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vshlv8qi ((int8x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrshl_u16 (uint16x4_t __a, int16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vshlv4hi ((int16x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrshl_u32 (uint32x2_t __a, int32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vshlv2si ((int32x2_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vrshl_u64 (uint64x1_t __a, int64x1_t __b)
+{
+ return (uint64x1_t)__builtin_neon_vshldi ((int64x1_t) __a, __b, 2);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrshlq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vshlv16qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrshlq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vshlv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vrshlq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vshlv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vrshlq_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vshlv2di (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrshlq_u8 (uint8x16_t __a, int8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vshlv16qi ((int8x16_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrshlq_u16 (uint16x8_t __a, int16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vshlv8hi ((int16x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrshlq_u32 (uint32x4_t __a, int32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vshlv4si ((int32x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vrshlq_u64 (uint64x2_t __a, int64x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vshlv2di ((int64x2_t) __a, __b, 2);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqshl_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vqshlv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqshl_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vqshlv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqshl_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vqshlv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vqshl_s64 (int64x1_t __a, int64x1_t __b)
+{
+ return (int64x1_t)__builtin_neon_vqshldi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqshl_u8 (uint8x8_t __a, int8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vqshlv8qi ((int8x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqshl_u16 (uint16x4_t __a, int16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vqshlv4hi ((int16x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqshl_u32 (uint32x2_t __a, int32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vqshlv2si ((int32x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vqshl_u64 (uint64x1_t __a, int64x1_t __b)
+{
+ return (uint64x1_t)__builtin_neon_vqshldi ((int64x1_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqshlq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vqshlv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqshlq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vqshlv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqshlq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vqshlv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqshlq_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vqshlv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqshlq_u8 (uint8x16_t __a, int8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vqshlv16qi ((int8x16_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vqshlq_u16 (uint16x8_t __a, int16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vqshlv8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vqshlq_u32 (uint32x4_t __a, int32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vqshlv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vqshlq_u64 (uint64x2_t __a, int64x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vqshlv2di ((int64x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqrshl_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vqshlv8qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqrshl_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vqshlv4hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqrshl_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vqshlv2si (__a, __b, 3);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vqrshl_s64 (int64x1_t __a, int64x1_t __b)
+{
+ return (int64x1_t)__builtin_neon_vqshldi (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqrshl_u8 (uint8x8_t __a, int8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vqshlv8qi ((int8x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqrshl_u16 (uint16x4_t __a, int16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vqshlv4hi ((int16x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqrshl_u32 (uint32x2_t __a, int32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vqshlv2si ((int32x2_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vqrshl_u64 (uint64x1_t __a, int64x1_t __b)
+{
+ return (uint64x1_t)__builtin_neon_vqshldi ((int64x1_t) __a, __b, 2);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqrshlq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vqshlv16qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqrshlq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vqshlv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqrshlq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vqshlv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqrshlq_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vqshlv2di (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqrshlq_u8 (uint8x16_t __a, int8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vqshlv16qi ((int8x16_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vqrshlq_u16 (uint16x8_t __a, int16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vqshlv8hi ((int16x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vqrshlq_u32 (uint32x4_t __a, int32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vqshlv4si ((int32x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vqrshlq_u64 (uint64x2_t __a, int64x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vqshlv2di ((int64x2_t) __a, __b, 2);
+}
+
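The vshl family shifts each lane left by the per-lane signed count in the second operand, so negative counts shift right; vrshl rounds the result, vqshl saturates it, and vqrshl does both. A small sketch using vdup_n_s16 (defined elsewhere in this header) to build the count vector:

/* Divide each lane by 16 with rounding: a rounding right shift by 4. */
int16x4_t
round_div16 (int16x4_t a)
{
  return vrshl_s16 (a, vdup_n_s16 (-4));
}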
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vshr_n_s8 (int8x8_t __a, const int __b)
+{
+ return (int8x8_t)__builtin_neon_vshr_nv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vshr_n_s16 (int16x4_t __a, const int __b)
+{
+ return (int16x4_t)__builtin_neon_vshr_nv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vshr_n_s32 (int32x2_t __a, const int __b)
+{
+ return (int32x2_t)__builtin_neon_vshr_nv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vshr_n_s64 (int64x1_t __a, const int __b)
+{
+ return (int64x1_t)__builtin_neon_vshr_ndi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vshr_n_u8 (uint8x8_t __a, const int __b)
+{
+ return (uint8x8_t)__builtin_neon_vshr_nv8qi ((int8x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vshr_n_u16 (uint16x4_t __a, const int __b)
+{
+ return (uint16x4_t)__builtin_neon_vshr_nv4hi ((int16x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vshr_n_u32 (uint32x2_t __a, const int __b)
+{
+ return (uint32x2_t)__builtin_neon_vshr_nv2si ((int32x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vshr_n_u64 (uint64x1_t __a, const int __b)
+{
+ return (uint64x1_t)__builtin_neon_vshr_ndi ((int64x1_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vshrq_n_s8 (int8x16_t __a, const int __b)
+{
+ return (int8x16_t)__builtin_neon_vshr_nv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vshrq_n_s16 (int16x8_t __a, const int __b)
+{
+ return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vshrq_n_s32 (int32x4_t __a, const int __b)
+{
+ return (int32x4_t)__builtin_neon_vshr_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vshrq_n_s64 (int64x2_t __a, const int __b)
+{
+ return (int64x2_t)__builtin_neon_vshr_nv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vshrq_n_u8 (uint8x16_t __a, const int __b)
+{
+ return (uint8x16_t)__builtin_neon_vshr_nv16qi ((int8x16_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vshrq_n_u16 (uint16x8_t __a, const int __b)
+{
+ return (uint16x8_t)__builtin_neon_vshr_nv8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vshrq_n_u32 (uint32x4_t __a, const int __b)
+{
+ return (uint32x4_t)__builtin_neon_vshr_nv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vshrq_n_u64 (uint64x2_t __a, const int __b)
+{
+ return (uint64x2_t)__builtin_neon_vshr_nv2di ((int64x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrshr_n_s8 (int8x8_t __a, const int __b)
+{
+ return (int8x8_t)__builtin_neon_vshr_nv8qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrshr_n_s16 (int16x4_t __a, const int __b)
+{
+ return (int16x4_t)__builtin_neon_vshr_nv4hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrshr_n_s32 (int32x2_t __a, const int __b)
+{
+ return (int32x2_t)__builtin_neon_vshr_nv2si (__a, __b, 3);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vrshr_n_s64 (int64x1_t __a, const int __b)
+{
+ return (int64x1_t)__builtin_neon_vshr_ndi (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrshr_n_u8 (uint8x8_t __a, const int __b)
+{
+ return (uint8x8_t)__builtin_neon_vshr_nv8qi ((int8x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrshr_n_u16 (uint16x4_t __a, const int __b)
+{
+ return (uint16x4_t)__builtin_neon_vshr_nv4hi ((int16x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrshr_n_u32 (uint32x2_t __a, const int __b)
+{
+ return (uint32x2_t)__builtin_neon_vshr_nv2si ((int32x2_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vrshr_n_u64 (uint64x1_t __a, const int __b)
+{
+ return (uint64x1_t)__builtin_neon_vshr_ndi ((int64x1_t) __a, __b, 2);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrshrq_n_s8 (int8x16_t __a, const int __b)
+{
+ return (int8x16_t)__builtin_neon_vshr_nv16qi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrshrq_n_s16 (int16x8_t __a, const int __b)
+{
+ return (int16x8_t)__builtin_neon_vshr_nv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vrshrq_n_s32 (int32x4_t __a, const int __b)
+{
+ return (int32x4_t)__builtin_neon_vshr_nv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vrshrq_n_s64 (int64x2_t __a, const int __b)
+{
+ return (int64x2_t)__builtin_neon_vshr_nv2di (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrshrq_n_u8 (uint8x16_t __a, const int __b)
+{
+ return (uint8x16_t)__builtin_neon_vshr_nv16qi ((int8x16_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrshrq_n_u16 (uint16x8_t __a, const int __b)
+{
+ return (uint16x8_t)__builtin_neon_vshr_nv8hi ((int16x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrshrq_n_u32 (uint32x4_t __a, const int __b)
+{
+ return (uint32x4_t)__builtin_neon_vshr_nv4si ((int32x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vrshrq_n_u64 (uint64x2_t __a, const int __b)
+{
+ return (uint64x2_t)__builtin_neon_vshr_nv2di ((int64x2_t) __a, __b, 2);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vshrn_n_s16 (int16x8_t __a, const int __b)
+{
+ return (int8x8_t)__builtin_neon_vshrn_nv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vshrn_n_s32 (int32x4_t __a, const int __b)
+{
+ return (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vshrn_n_s64 (int64x2_t __a, const int __b)
+{
+ return (int32x2_t)__builtin_neon_vshrn_nv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vshrn_n_u16 (uint16x8_t __a, const int __b)
+{
+ return (uint8x8_t)__builtin_neon_vshrn_nv8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vshrn_n_u32 (uint32x4_t __a, const int __b)
+{
+ return (uint16x4_t)__builtin_neon_vshrn_nv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vshrn_n_u64 (uint64x2_t __a, const int __b)
+{
+ return (uint32x2_t)__builtin_neon_vshrn_nv2di ((int64x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrshrn_n_s16 (int16x8_t __a, const int __b)
+{
+ return (int8x8_t)__builtin_neon_vshrn_nv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrshrn_n_s32 (int32x4_t __a, const int __b)
+{
+ return (int16x4_t)__builtin_neon_vshrn_nv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrshrn_n_s64 (int64x2_t __a, const int __b)
+{
+ return (int32x2_t)__builtin_neon_vshrn_nv2di (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrshrn_n_u16 (uint16x8_t __a, const int __b)
+{
+ return (uint8x8_t)__builtin_neon_vshrn_nv8hi ((int16x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrshrn_n_u32 (uint32x4_t __a, const int __b)
+{
+ return (uint16x4_t)__builtin_neon_vshrn_nv4si ((int32x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrshrn_n_u64 (uint64x2_t __a, const int __b)
+{
+ return (uint32x2_t)__builtin_neon_vshrn_nv2di ((int64x2_t) __a, __b, 2);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqshrn_n_s16 (int16x8_t __a, const int __b)
+{
+ return (int8x8_t)__builtin_neon_vqshrn_nv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqshrn_n_s32 (int32x4_t __a, const int __b)
+{
+ return (int16x4_t)__builtin_neon_vqshrn_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqshrn_n_s64 (int64x2_t __a, const int __b)
+{
+ return (int32x2_t)__builtin_neon_vqshrn_nv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqshrn_n_u16 (uint16x8_t __a, const int __b)
+{
+ return (uint8x8_t)__builtin_neon_vqshrn_nv8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqshrn_n_u32 (uint32x4_t __a, const int __b)
+{
+ return (uint16x4_t)__builtin_neon_vqshrn_nv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqshrn_n_u64 (uint64x2_t __a, const int __b)
+{
+ return (uint32x2_t)__builtin_neon_vqshrn_nv2di ((int64x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqrshrn_n_s16 (int16x8_t __a, const int __b)
+{
+ return (int8x8_t)__builtin_neon_vqshrn_nv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqrshrn_n_s32 (int32x4_t __a, const int __b)
+{
+ return (int16x4_t)__builtin_neon_vqshrn_nv4si (__a, __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqrshrn_n_s64 (int64x2_t __a, const int __b)
+{
+ return (int32x2_t)__builtin_neon_vqshrn_nv2di (__a, __b, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqrshrn_n_u16 (uint16x8_t __a, const int __b)
+{
+ return (uint8x8_t)__builtin_neon_vqshrn_nv8hi ((int16x8_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqrshrn_n_u32 (uint32x4_t __a, const int __b)
+{
+ return (uint16x4_t)__builtin_neon_vqshrn_nv4si ((int32x4_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqrshrn_n_u64 (uint64x2_t __a, const int __b)
+{
+ return (uint32x2_t)__builtin_neon_vqshrn_nv2di ((int64x2_t) __a, __b, 2);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqshrun_n_s16 (int16x8_t __a, const int __b)
+{
+ return (uint8x8_t)__builtin_neon_vqshrun_nv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqshrun_n_s32 (int32x4_t __a, const int __b)
+{
+ return (uint16x4_t)__builtin_neon_vqshrun_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqshrun_n_s64 (int64x2_t __a, const int __b)
+{
+ return (uint32x2_t)__builtin_neon_vqshrun_nv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqrshrun_n_s16 (int16x8_t __a, const int __b)
+{
+ return (uint8x8_t)__builtin_neon_vqshrun_nv8hi (__a, __b, 3);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqrshrun_n_s32 (int32x4_t __a, const int __b)
+{
+ return (uint16x4_t)__builtin_neon_vqshrun_nv4si (__a, __b, 3);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqrshrun_n_s64 (int64x2_t __a, const int __b)
+{
+ return (uint32x2_t)__builtin_neon_vqshrun_nv2di (__a, __b, 3);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vshl_n_s8 (int8x8_t __a, const int __b)
+{
+ return (int8x8_t)__builtin_neon_vshl_nv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vshl_n_s16 (int16x4_t __a, const int __b)
+{
+ return (int16x4_t)__builtin_neon_vshl_nv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vshl_n_s32 (int32x2_t __a, const int __b)
+{
+ return (int32x2_t)__builtin_neon_vshl_nv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vshl_n_s64 (int64x1_t __a, const int __b)
+{
+ return (int64x1_t)__builtin_neon_vshl_ndi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vshl_n_u8 (uint8x8_t __a, const int __b)
+{
+ return (uint8x8_t)__builtin_neon_vshl_nv8qi ((int8x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vshl_n_u16 (uint16x4_t __a, const int __b)
+{
+ return (uint16x4_t)__builtin_neon_vshl_nv4hi ((int16x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vshl_n_u32 (uint32x2_t __a, const int __b)
+{
+ return (uint32x2_t)__builtin_neon_vshl_nv2si ((int32x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vshl_n_u64 (uint64x1_t __a, const int __b)
+{
+ return (uint64x1_t)__builtin_neon_vshl_ndi ((int64x1_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vshlq_n_s8 (int8x16_t __a, const int __b)
+{
+ return (int8x16_t)__builtin_neon_vshl_nv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vshlq_n_s16 (int16x8_t __a, const int __b)
+{
+ return (int16x8_t)__builtin_neon_vshl_nv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vshlq_n_s32 (int32x4_t __a, const int __b)
+{
+ return (int32x4_t)__builtin_neon_vshl_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vshlq_n_s64 (int64x2_t __a, const int __b)
+{
+ return (int64x2_t)__builtin_neon_vshl_nv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vshlq_n_u8 (uint8x16_t __a, const int __b)
+{
+ return (uint8x16_t)__builtin_neon_vshl_nv16qi ((int8x16_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vshlq_n_u16 (uint16x8_t __a, const int __b)
+{
+ return (uint16x8_t)__builtin_neon_vshl_nv8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vshlq_n_u32 (uint32x4_t __a, const int __b)
+{
+ return (uint32x4_t)__builtin_neon_vshl_nv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vshlq_n_u64 (uint64x2_t __a, const int __b)
+{
+ return (uint64x2_t)__builtin_neon_vshl_nv2di ((int64x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqshl_n_s8 (int8x8_t __a, const int __b)
+{
+ return (int8x8_t)__builtin_neon_vqshl_nv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqshl_n_s16 (int16x4_t __a, const int __b)
+{
+ return (int16x4_t)__builtin_neon_vqshl_nv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqshl_n_s32 (int32x2_t __a, const int __b)
+{
+ return (int32x2_t)__builtin_neon_vqshl_nv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vqshl_n_s64 (int64x1_t __a, const int __b)
+{
+ return (int64x1_t)__builtin_neon_vqshl_ndi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqshl_n_u8 (uint8x8_t __a, const int __b)
+{
+ return (uint8x8_t)__builtin_neon_vqshl_nv8qi ((int8x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqshl_n_u16 (uint16x4_t __a, const int __b)
+{
+ return (uint16x4_t)__builtin_neon_vqshl_nv4hi ((int16x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqshl_n_u32 (uint32x2_t __a, const int __b)
+{
+ return (uint32x2_t)__builtin_neon_vqshl_nv2si ((int32x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vqshl_n_u64 (uint64x1_t __a, const int __b)
+{
+ return (uint64x1_t)__builtin_neon_vqshl_ndi ((int64x1_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqshlq_n_s8 (int8x16_t __a, const int __b)
+{
+ return (int8x16_t)__builtin_neon_vqshl_nv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqshlq_n_s16 (int16x8_t __a, const int __b)
+{
+ return (int16x8_t)__builtin_neon_vqshl_nv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqshlq_n_s32 (int32x4_t __a, const int __b)
+{
+ return (int32x4_t)__builtin_neon_vqshl_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqshlq_n_s64 (int64x2_t __a, const int __b)
+{
+ return (int64x2_t)__builtin_neon_vqshl_nv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqshlq_n_u8 (uint8x16_t __a, const int __b)
+{
+ return (uint8x16_t)__builtin_neon_vqshl_nv16qi ((int8x16_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vqshlq_n_u16 (uint16x8_t __a, const int __b)
+{
+ return (uint16x8_t)__builtin_neon_vqshl_nv8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vqshlq_n_u32 (uint32x4_t __a, const int __b)
+{
+ return (uint32x4_t)__builtin_neon_vqshl_nv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vqshlq_n_u64 (uint64x2_t __a, const int __b)
+{
+ return (uint64x2_t)__builtin_neon_vqshl_nv2di ((int64x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqshlu_n_s8 (int8x8_t __a, const int __b)
+{
+ return (uint8x8_t)__builtin_neon_vqshlu_nv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqshlu_n_s16 (int16x4_t __a, const int __b)
+{
+ return (uint16x4_t)__builtin_neon_vqshlu_nv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqshlu_n_s32 (int32x2_t __a, const int __b)
+{
+ return (uint32x2_t)__builtin_neon_vqshlu_nv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vqshlu_n_s64 (int64x1_t __a, const int __b)
+{
+ return (uint64x1_t)__builtin_neon_vqshlu_ndi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vqshluq_n_s8 (int8x16_t __a, const int __b)
+{
+ return (uint8x16_t)__builtin_neon_vqshlu_nv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vqshluq_n_s16 (int16x8_t __a, const int __b)
+{
+ return (uint16x8_t)__builtin_neon_vqshlu_nv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vqshluq_n_s32 (int32x4_t __a, const int __b)
+{
+ return (uint32x4_t)__builtin_neon_vqshlu_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vqshluq_n_s64 (int64x2_t __a, const int __b)
+{
+ return (uint64x2_t)__builtin_neon_vqshlu_nv2di (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vshll_n_s8 (int8x8_t __a, const int __b)
+{
+ return (int16x8_t)__builtin_neon_vshll_nv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vshll_n_s16 (int16x4_t __a, const int __b)
+{
+ return (int32x4_t)__builtin_neon_vshll_nv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vshll_n_s32 (int32x2_t __a, const int __b)
+{
+ return (int64x2_t)__builtin_neon_vshll_nv2si (__a, __b, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vshll_n_u8 (uint8x8_t __a, const int __b)
+{
+ return (uint16x8_t)__builtin_neon_vshll_nv8qi ((int8x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vshll_n_u16 (uint16x4_t __a, const int __b)
+{
+ return (uint32x4_t)__builtin_neon_vshll_nv4hi ((int16x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vshll_n_u32 (uint32x2_t __a, const int __b)
+{
+ return (uint64x2_t)__builtin_neon_vshll_nv2si ((int32x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+{
+ return (int8x8_t)__builtin_neon_vsra_nv8qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+ return (int16x4_t)__builtin_neon_vsra_nv4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+ return (int32x2_t)__builtin_neon_vsra_nv2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+{
+ return (int64x1_t)__builtin_neon_vsra_ndi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+{
+ return (uint8x8_t)__builtin_neon_vsra_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+ return (uint16x4_t)__builtin_neon_vsra_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+ return (uint32x2_t)__builtin_neon_vsra_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+{
+ return (uint64x1_t)__builtin_neon_vsra_ndi ((int64x1_t) __a, (int64x1_t) __b, __c, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+{
+ return (int8x16_t)__builtin_neon_vsra_nv16qi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+ return (int16x8_t)__builtin_neon_vsra_nv8hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+ return (int32x4_t)__builtin_neon_vsra_nv4si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+{
+ return (int64x2_t)__builtin_neon_vsra_nv2di (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+{
+ return (uint8x16_t)__builtin_neon_vsra_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+ return (uint16x8_t)__builtin_neon_vsra_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+ return (uint32x4_t)__builtin_neon_vsra_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+{
+ return (uint64x2_t)__builtin_neon_vsra_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrsra_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+{
+ return (int8x8_t)__builtin_neon_vsra_nv8qi (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrsra_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+ return (int16x4_t)__builtin_neon_vsra_nv4hi (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrsra_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+ return (int32x2_t)__builtin_neon_vsra_nv2si (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vrsra_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+{
+ return (int64x1_t)__builtin_neon_vsra_ndi (__a, __b, __c, 3);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrsra_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+{
+ return (uint8x8_t)__builtin_neon_vsra_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c, 2);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrsra_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+ return (uint16x4_t)__builtin_neon_vsra_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 2);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrsra_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+ return (uint32x2_t)__builtin_neon_vsra_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c, 2);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vrsra_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+{
+ return (uint64x1_t)__builtin_neon_vsra_ndi ((int64x1_t) __a, (int64x1_t) __b, __c, 2);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrsraq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+{
+ return (int8x16_t)__builtin_neon_vsra_nv16qi (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrsraq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+ return (int16x8_t)__builtin_neon_vsra_nv8hi (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vrsraq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+ return (int32x4_t)__builtin_neon_vsra_nv4si (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vrsraq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+{
+ return (int64x2_t)__builtin_neon_vsra_nv2di (__a, __b, __c, 3);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrsraq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+{
+ return (uint8x16_t)__builtin_neon_vsra_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c, 2);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrsraq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+ return (uint16x8_t)__builtin_neon_vsra_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c, 2);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrsraq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+ return (uint32x4_t)__builtin_neon_vsra_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c, 2);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vrsraq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+{
+ return (uint64x2_t)__builtin_neon_vsra_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c, 2);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vsri_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+{
+ return (int8x8_t)__builtin_neon_vsri_nv8qi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vsri_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+ return (int16x4_t)__builtin_neon_vsri_nv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vsri_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+ return (int32x2_t)__builtin_neon_vsri_nv2si (__a, __b, __c);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vsri_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+{
+ return (int64x1_t)__builtin_neon_vsri_ndi (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vsri_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+{
+ return (uint8x8_t)__builtin_neon_vsri_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vsri_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+ return (uint16x4_t)__builtin_neon_vsri_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vsri_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+ return (uint32x2_t)__builtin_neon_vsri_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vsri_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+{
+ return (uint64x1_t)__builtin_neon_vsri_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vsri_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
+{
+ return (poly8x8_t)__builtin_neon_vsri_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vsri_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
+{
+ return (poly16x4_t)__builtin_neon_vsri_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vsriq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+{
+ return (int8x16_t)__builtin_neon_vsri_nv16qi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsriq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+ return (int16x8_t)__builtin_neon_vsri_nv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsriq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+ return (int32x4_t)__builtin_neon_vsri_nv4si (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vsriq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+{
+ return (int64x2_t)__builtin_neon_vsri_nv2di (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vsriq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+{
+ return (uint8x16_t)__builtin_neon_vsri_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsriq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+ return (uint16x8_t)__builtin_neon_vsri_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsriq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+ return (uint32x4_t)__builtin_neon_vsri_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vsriq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+{
+ return (uint64x2_t)__builtin_neon_vsri_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vsriq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
+{
+ return (poly8x16_t)__builtin_neon_vsri_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vsriq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
+{
+ return (poly16x8_t)__builtin_neon_vsri_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vsli_n_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+{
+ return (int8x8_t)__builtin_neon_vsli_nv8qi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vsli_n_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+ return (int16x4_t)__builtin_neon_vsli_nv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vsli_n_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+ return (int32x2_t)__builtin_neon_vsli_nv2si (__a, __b, __c);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vsli_n_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+{
+ return (int64x1_t)__builtin_neon_vsli_ndi (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vsli_n_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+{
+ return (uint8x8_t)__builtin_neon_vsli_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vsli_n_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+ return (uint16x4_t)__builtin_neon_vsli_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vsli_n_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+ return (uint32x2_t)__builtin_neon_vsli_nv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vsli_n_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+{
+ return (uint64x1_t)__builtin_neon_vsli_ndi ((int64x1_t) __a, (int64x1_t) __b, __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vsli_n_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
+{
+ return (poly8x8_t)__builtin_neon_vsli_nv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vsli_n_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
+{
+ return (poly16x4_t)__builtin_neon_vsli_nv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vsliq_n_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+{
+ return (int8x16_t)__builtin_neon_vsli_nv16qi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsliq_n_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+ return (int16x8_t)__builtin_neon_vsli_nv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsliq_n_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+ return (int32x4_t)__builtin_neon_vsli_nv4si (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vsliq_n_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+{
+ return (int64x2_t)__builtin_neon_vsli_nv2di (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vsliq_n_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+{
+ return (uint8x16_t)__builtin_neon_vsli_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsliq_n_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+ return (uint16x8_t)__builtin_neon_vsli_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsliq_n_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+ return (uint32x4_t)__builtin_neon_vsli_nv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vsliq_n_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+{
+ return (uint64x2_t)__builtin_neon_vsli_nv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vsliq_n_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
+{
+ return (poly8x16_t)__builtin_neon_vsli_nv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vsliq_n_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
+{
+ return (poly16x8_t)__builtin_neon_vsli_nv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vabs_s8 (int8x8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vabsv8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vabs_s16 (int16x4_t __a)
+{
+ return (int16x4_t)__builtin_neon_vabsv4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vabs_s32 (int32x2_t __a)
+{
+ return (int32x2_t)__builtin_neon_vabsv2si (__a, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vabs_f32 (float32x2_t __a)
+{
+ return (float32x2_t)__builtin_neon_vabsv2sf (__a, 5);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vabsq_s8 (int8x16_t __a)
+{
+ return (int8x16_t)__builtin_neon_vabsv16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vabsq_s16 (int16x8_t __a)
+{
+ return (int16x8_t)__builtin_neon_vabsv8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vabsq_s32 (int32x4_t __a)
+{
+ return (int32x4_t)__builtin_neon_vabsv4si (__a, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vabsq_f32 (float32x4_t __a)
+{
+ return (float32x4_t)__builtin_neon_vabsv4sf (__a, 5);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqabs_s8 (int8x8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vqabsv8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqabs_s16 (int16x4_t __a)
+{
+ return (int16x4_t)__builtin_neon_vqabsv4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqabs_s32 (int32x2_t __a)
+{
+ return (int32x2_t)__builtin_neon_vqabsv2si (__a, 1);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqabsq_s8 (int8x16_t __a)
+{
+ return (int8x16_t)__builtin_neon_vqabsv16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqabsq_s16 (int16x8_t __a)
+{
+ return (int16x8_t)__builtin_neon_vqabsv8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqabsq_s32 (int32x4_t __a)
+{
+ return (int32x4_t)__builtin_neon_vqabsv4si (__a, 1);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vneg_s8 (int8x8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vnegv8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vneg_s16 (int16x4_t __a)
+{
+ return (int16x4_t)__builtin_neon_vnegv4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vneg_s32 (int32x2_t __a)
+{
+ return (int32x2_t)__builtin_neon_vnegv2si (__a, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vneg_f32 (float32x2_t __a)
+{
+ return (float32x2_t)__builtin_neon_vnegv2sf (__a, 5);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vnegq_s8 (int8x16_t __a)
+{
+ return (int8x16_t)__builtin_neon_vnegv16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vnegq_s16 (int16x8_t __a)
+{
+ return (int16x8_t)__builtin_neon_vnegv8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vnegq_s32 (int32x4_t __a)
+{
+ return (int32x4_t)__builtin_neon_vnegv4si (__a, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vnegq_f32 (float32x4_t __a)
+{
+ return (float32x4_t)__builtin_neon_vnegv4sf (__a, 5);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqneg_s8 (int8x8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vqnegv8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqneg_s16 (int16x4_t __a)
+{
+ return (int16x4_t)__builtin_neon_vqnegv4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqneg_s32 (int32x2_t __a)
+{
+ return (int32x2_t)__builtin_neon_vqnegv2si (__a, 1);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vqnegq_s8 (int8x16_t __a)
+{
+ return (int8x16_t)__builtin_neon_vqnegv16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqnegq_s16 (int16x8_t __a)
+{
+ return (int16x8_t)__builtin_neon_vqnegv8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqnegq_s32 (int32x4_t __a)
+{
+ return (int32x4_t)__builtin_neon_vqnegv4si (__a, 1);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmvn_s8 (int8x8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vmvnv8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmvn_s16 (int16x4_t __a)
+{
+ return (int16x4_t)__builtin_neon_vmvnv4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmvn_s32 (int32x2_t __a)
+{
+ return (int32x2_t)__builtin_neon_vmvnv2si (__a, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmvn_u8 (uint8x8_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vmvnv8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmvn_u16 (uint16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vmvnv4hi ((int16x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmvn_u32 (uint32x2_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vmvnv2si ((int32x2_t) __a, 0);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vmvn_p8 (poly8x8_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vmvnv8qi ((int8x8_t) __a, 4);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vmvnq_s8 (int8x16_t __a)
+{
+ return (int8x16_t)__builtin_neon_vmvnv16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmvnq_s16 (int16x8_t __a)
+{
+ return (int16x8_t)__builtin_neon_vmvnv8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmvnq_s32 (int32x4_t __a)
+{
+ return (int32x4_t)__builtin_neon_vmvnv4si (__a, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vmvnq_u8 (uint8x16_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vmvnv16qi ((int8x16_t) __a, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmvnq_u16 (uint16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vmvnv8hi ((int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmvnq_u32 (uint32x4_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vmvnv4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vmvnq_p8 (poly8x16_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vmvnv16qi ((int8x16_t) __a, 4);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vcls_s8 (int8x8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vclsv8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcls_s16 (int16x4_t __a)
+{
+ return (int16x4_t)__builtin_neon_vclsv4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcls_s32 (int32x2_t __a)
+{
+ return (int32x2_t)__builtin_neon_vclsv2si (__a, 1);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vclsq_s8 (int8x16_t __a)
+{
+ return (int8x16_t)__builtin_neon_vclsv16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vclsq_s16 (int16x8_t __a)
+{
+ return (int16x8_t)__builtin_neon_vclsv8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vclsq_s32 (int32x4_t __a)
+{
+ return (int32x4_t)__builtin_neon_vclsv4si (__a, 1);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vclz_s8 (int8x8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vclzv8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vclz_s16 (int16x4_t __a)
+{
+ return (int16x4_t)__builtin_neon_vclzv4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vclz_s32 (int32x2_t __a)
+{
+ return (int32x2_t)__builtin_neon_vclzv2si (__a, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vclz_u8 (uint8x8_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vclzv8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vclz_u16 (uint16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vclzv4hi ((int16x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vclz_u32 (uint32x2_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vclzv2si ((int32x2_t) __a, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vclzq_s8 (int8x16_t __a)
+{
+ return (int8x16_t)__builtin_neon_vclzv16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vclzq_s16 (int16x8_t __a)
+{
+ return (int16x8_t)__builtin_neon_vclzv8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vclzq_s32 (int32x4_t __a)
+{
+ return (int32x4_t)__builtin_neon_vclzv4si (__a, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vclzq_u8 (uint8x16_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vclzv16qi ((int8x16_t) __a, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vclzq_u16 (uint16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vclzv8hi ((int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vclzq_u32 (uint32x4_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vclzv4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vcnt_s8 (int8x8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vcntv8qi (__a, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcnt_u8 (uint8x8_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vcntv8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vcnt_p8 (poly8x8_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vcntv8qi ((int8x8_t) __a, 4);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vcntq_s8 (int8x16_t __a)
+{
+ return (int8x16_t)__builtin_neon_vcntv16qi (__a, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcntq_u8 (uint8x16_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vcntv16qi ((int8x16_t) __a, 0);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vcntq_p8 (poly8x16_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vcntv16qi ((int8x16_t) __a, 4);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrecpe_f32 (float32x2_t __a)
+{
+ return (float32x2_t)__builtin_neon_vrecpev2sf (__a, 5);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrecpe_u32 (uint32x2_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vrecpev2si ((int32x2_t) __a, 0);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrecpeq_f32 (float32x4_t __a)
+{
+ return (float32x4_t)__builtin_neon_vrecpev4sf (__a, 5);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrecpeq_u32 (uint32x4_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vrecpev4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrsqrte_f32 (float32x2_t __a)
+{
+ return (float32x2_t)__builtin_neon_vrsqrtev2sf (__a, 5);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrsqrte_u32 (uint32x2_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vrsqrtev2si ((int32x2_t) __a, 0);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrsqrteq_f32 (float32x4_t __a)
+{
+ return (float32x4_t)__builtin_neon_vrsqrtev4sf (__a, 5);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrsqrteq_u32 (uint32x4_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vrsqrtev4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+vget_lane_s8 (int8x8_t __a, const int __b)
+{
+ return (int8_t)__builtin_neon_vget_lanev8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+vget_lane_s16 (int16x4_t __a, const int __b)
+{
+ return (int16_t)__builtin_neon_vget_lanev4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vget_lane_s32 (int32x2_t __a, const int __b)
+{
+ return (int32_t)__builtin_neon_vget_lanev2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vget_lane_f32 (float32x2_t __a, const int __b)
+{
+ return (float32_t)__builtin_neon_vget_lanev2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+vget_lane_u8 (uint8x8_t __a, const int __b)
+{
+ return (uint8_t)__builtin_neon_vget_lanev8qi ((int8x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+vget_lane_u16 (uint16x4_t __a, const int __b)
+{
+ return (uint16_t)__builtin_neon_vget_lanev4hi ((int16x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vget_lane_u32 (uint32x2_t __a, const int __b)
+{
+ return (uint32_t)__builtin_neon_vget_lanev2si ((int32x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
+vget_lane_p8 (poly8x8_t __a, const int __b)
+{
+ return (poly8_t)__builtin_neon_vget_lanev8qi ((int8x8_t) __a, __b, 4);
+}
+
+__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
+vget_lane_p16 (poly16x4_t __a, const int __b)
+{
+ return (poly16_t)__builtin_neon_vget_lanev4hi ((int16x4_t) __a, __b, 4);
+}
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vget_lane_s64 (int64x1_t __a, const int __b)
+{
+ return (int64_t)__builtin_neon_vget_lanedi (__a, __b, 1);
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vget_lane_u64 (uint64x1_t __a, const int __b)
+{
+ return (uint64_t)__builtin_neon_vget_lanedi ((int64x1_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8_t __attribute__ ((__always_inline__))
+vgetq_lane_s8 (int8x16_t __a, const int __b)
+{
+ return (int8_t)__builtin_neon_vget_lanev16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16_t __attribute__ ((__always_inline__))
+vgetq_lane_s16 (int16x8_t __a, const int __b)
+{
+ return (int16_t)__builtin_neon_vget_lanev8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32_t __attribute__ ((__always_inline__))
+vgetq_lane_s32 (int32x4_t __a, const int __b)
+{
+ return (int32_t)__builtin_neon_vget_lanev4si (__a, __b, 1);
+}
+
+__extension__ static __inline float32_t __attribute__ ((__always_inline__))
+vgetq_lane_f32 (float32x4_t __a, const int __b)
+{
+ return (float32_t)__builtin_neon_vget_lanev4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint8_t __attribute__ ((__always_inline__))
+vgetq_lane_u8 (uint8x16_t __a, const int __b)
+{
+ return (uint8_t)__builtin_neon_vget_lanev16qi ((int8x16_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint16_t __attribute__ ((__always_inline__))
+vgetq_lane_u16 (uint16x8_t __a, const int __b)
+{
+ return (uint16_t)__builtin_neon_vget_lanev8hi ((int16x8_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32_t __attribute__ ((__always_inline__))
+vgetq_lane_u32 (uint32x4_t __a, const int __b)
+{
+ return (uint32_t)__builtin_neon_vget_lanev4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline poly8_t __attribute__ ((__always_inline__))
+vgetq_lane_p8 (poly8x16_t __a, const int __b)
+{
+ return (poly8_t)__builtin_neon_vget_lanev16qi ((int8x16_t) __a, __b, 4);
+}
+
+__extension__ static __inline poly16_t __attribute__ ((__always_inline__))
+vgetq_lane_p16 (poly16x8_t __a, const int __b)
+{
+ return (poly16_t)__builtin_neon_vget_lanev8hi ((int16x8_t) __a, __b, 4);
+}
+
+__extension__ static __inline int64_t __attribute__ ((__always_inline__))
+vgetq_lane_s64 (int64x2_t __a, const int __b)
+{
+ return (int64_t)__builtin_neon_vget_lanev2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint64_t __attribute__ ((__always_inline__))
+vgetq_lane_u64 (uint64x2_t __a, const int __b)
+{
+ return (uint64_t)__builtin_neon_vget_lanev2di ((int64x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vset_lane_s8 (int8_t __a, int8x8_t __b, const int __c)
+{
+ return (int8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, __b, __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vset_lane_s16 (int16_t __a, int16x4_t __b, const int __c)
+{
+ return (int16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vset_lane_s32 (int32_t __a, int32x2_t __b, const int __c)
+{
+ return (int32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, __b, __c);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vset_lane_f32 (float32_t __a, float32x2_t __b, const int __c)
+{
+ return (float32x2_t)__builtin_neon_vset_lanev2sf (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vset_lane_u8 (uint8_t __a, uint8x8_t __b, const int __c)
+{
+ return (uint8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vset_lane_u16 (uint16_t __a, uint16x4_t __b, const int __c)
+{
+ return (uint16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vset_lane_u32 (uint32_t __a, uint32x2_t __b, const int __c)
+{
+ return (uint32x2_t)__builtin_neon_vset_lanev2si ((__builtin_neon_si) __a, (int32x2_t) __b, __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vset_lane_p8 (poly8_t __a, poly8x8_t __b, const int __c)
+{
+ return (poly8x8_t)__builtin_neon_vset_lanev8qi ((__builtin_neon_qi) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vset_lane_p16 (poly16_t __a, poly16x4_t __b, const int __c)
+{
+ return (poly16x4_t)__builtin_neon_vset_lanev4hi ((__builtin_neon_hi) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vset_lane_s64 (int64_t __a, int64x1_t __b, const int __c)
+{
+ return (int64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, __b, __c);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vset_lane_u64 (uint64_t __a, uint64x1_t __b, const int __c)
+{
+ return (uint64x1_t)__builtin_neon_vset_lanedi ((__builtin_neon_di) __a, (int64x1_t) __b, __c);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vsetq_lane_s8 (int8_t __a, int8x16_t __b, const int __c)
+{
+ return (int8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vsetq_lane_s16 (int16_t __a, int16x8_t __b, const int __c)
+{
+ return (int16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vsetq_lane_s32 (int32_t __a, int32x4_t __b, const int __c)
+{
+ return (int32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, __b, __c);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vsetq_lane_f32 (float32_t __a, float32x4_t __b, const int __c)
+{
+ return (float32x4_t)__builtin_neon_vset_lanev4sf (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vsetq_lane_u8 (uint8_t __a, uint8x16_t __b, const int __c)
+{
+ return (uint8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vsetq_lane_u16 (uint16_t __a, uint16x8_t __b, const int __c)
+{
+ return (uint16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vsetq_lane_u32 (uint32_t __a, uint32x4_t __b, const int __c)
+{
+ return (uint32x4_t)__builtin_neon_vset_lanev4si ((__builtin_neon_si) __a, (int32x4_t) __b, __c);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vsetq_lane_p8 (poly8_t __a, poly8x16_t __b, const int __c)
+{
+ return (poly8x16_t)__builtin_neon_vset_lanev16qi ((__builtin_neon_qi) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vsetq_lane_p16 (poly16_t __a, poly16x8_t __b, const int __c)
+{
+ return (poly16x8_t)__builtin_neon_vset_lanev8hi ((__builtin_neon_hi) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vsetq_lane_s64 (int64_t __a, int64x2_t __b, const int __c)
+{
+ return (int64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vsetq_lane_u64 (uint64_t __a, uint64x2_t __b, const int __c)
+{
+ return (uint64x2_t)__builtin_neon_vset_lanev2di ((__builtin_neon_di) __a, (int64x2_t) __b, __c);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vcreate_s8 (uint64_t __a)
+{
+ return (int8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vcreate_s16 (uint64_t __a)
+{
+ return (int16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcreate_s32 (uint64_t __a)
+{
+ return (int32x2_t)__builtin_neon_vcreatev2si ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vcreate_s64 (uint64_t __a)
+{
+ return (int64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcreate_f32 (uint64_t __a)
+{
+ return (float32x2_t)__builtin_neon_vcreatev2sf ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vcreate_u8 (uint64_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vcreate_u16 (uint64_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcreate_u32 (uint64_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vcreatev2si ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vcreate_u64 (uint64_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vcreatedi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vcreate_p8 (uint64_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vcreatev8qi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vcreate_p16 (uint64_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vcreatev4hi ((__builtin_neon_di) __a);
+}
+
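+/* [Editorial note, not part of the generated header: vcreate_* reinterprets
+   a 64-bit scalar as a 64-bit vector.  On a little-endian target,
+
+     uint8x8_t iota = vcreate_u8 (0x0706050403020100ULL);
+
+   yields lanes 0 through 7 holding the byte values 0x00 through 0x07.] */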
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vdup_n_s8 (int8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vdup_n_s16 (int16_t __a)
+{
+ return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vdup_n_s32 (int32_t __a)
+{
+ return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vdup_n_f32 (float32_t __a)
+{
+ return (float32x2_t)__builtin_neon_vdup_nv2sf (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vdup_n_u8 (uint8_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vdup_n_u16 (uint16_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vdup_n_u32 (uint32_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vdup_n_p8 (poly8_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vdup_n_p16 (poly16_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vdup_n_s64 (int64_t __a)
+{
+ return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vdup_n_u64 (uint64_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vdupq_n_s8 (int8_t __a)
+{
+ return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vdupq_n_s16 (int16_t __a)
+{
+ return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vdupq_n_s32 (int32_t __a)
+{
+ return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vdupq_n_f32 (float32_t __a)
+{
+ return (float32x4_t)__builtin_neon_vdup_nv4sf (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vdupq_n_u8 (uint8_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vdupq_n_u16 (uint16_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vdupq_n_u32 (uint32_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vdupq_n_p8 (poly8_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vdupq_n_p16 (poly16_t __a)
+{
+ return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vdupq_n_s64 (int64_t __a)
+{
+ return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vdupq_n_u64 (uint64_t __a)
+{
+ return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmov_n_s8 (int8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmov_n_s16 (int16_t __a)
+{
+ return (int16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmov_n_s32 (int32_t __a)
+{
+ return (int32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmov_n_f32 (float32_t __a)
+{
+ return (float32x2_t)__builtin_neon_vdup_nv2sf (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmov_n_u8 (uint8_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmov_n_u16 (uint16_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmov_n_u32 (uint32_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vdup_nv2si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vmov_n_p8 (poly8_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vdup_nv8qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vmov_n_p16 (poly16_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vdup_nv4hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vmov_n_s64 (int64_t __a)
+{
+ return (int64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vmov_n_u64 (uint64_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vdup_ndi ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vmovq_n_s8 (int8_t __a)
+{
+ return (int8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmovq_n_s16 (int16_t __a)
+{
+ return (int16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmovq_n_s32 (int32_t __a)
+{
+ return (int32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmovq_n_f32 (float32_t __a)
+{
+ return (float32x4_t)__builtin_neon_vdup_nv4sf (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vmovq_n_u8 (uint8_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmovq_n_u16 (uint16_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmovq_n_u32 (uint32_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vdup_nv4si ((__builtin_neon_si) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vmovq_n_p8 (poly8_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vdup_nv16qi ((__builtin_neon_qi) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vmovq_n_p16 (poly16_t __a)
+{
+ return (poly16x8_t)__builtin_neon_vdup_nv8hi ((__builtin_neon_hi) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmovq_n_s64 (int64_t __a)
+{
+ return (int64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmovq_n_u64 (uint64_t __a)
+{
+ return (uint64x2_t)__builtin_neon_vdup_nv2di ((__builtin_neon_di) __a);
+}
+
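+/* [Editorial note, not part of the generated header: vdup_n_*, vdupq_n_*
+   and the vmov_n_* and vmovq_n_* aliases above all splat one scalar across
+   every lane of the result, e.g.
+
+     uint8x16_t ones = vdupq_n_u8 (0x01);
+
+   fills all sixteen lanes with 0x01.] */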
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vdup_lane_s8 (int8x8_t __a, const int __b)
+{
+ return (int8x8_t)__builtin_neon_vdup_lanev8qi (__a, __b);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vdup_lane_s16 (int16x4_t __a, const int __b)
+{
+ return (int16x4_t)__builtin_neon_vdup_lanev4hi (__a, __b);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vdup_lane_s32 (int32x2_t __a, const int __b)
+{
+ return (int32x2_t)__builtin_neon_vdup_lanev2si (__a, __b);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vdup_lane_f32 (float32x2_t __a, const int __b)
+{
+ return (float32x2_t)__builtin_neon_vdup_lanev2sf (__a, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vdup_lane_u8 (uint8x8_t __a, const int __b)
+{
+ return (uint8x8_t)__builtin_neon_vdup_lanev8qi ((int8x8_t) __a, __b);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vdup_lane_u16 (uint16x4_t __a, const int __b)
+{
+ return (uint16x4_t)__builtin_neon_vdup_lanev4hi ((int16x4_t) __a, __b);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vdup_lane_u32 (uint32x2_t __a, const int __b)
+{
+ return (uint32x2_t)__builtin_neon_vdup_lanev2si ((int32x2_t) __a, __b);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vdup_lane_p8 (poly8x8_t __a, const int __b)
+{
+ return (poly8x8_t)__builtin_neon_vdup_lanev8qi ((int8x8_t) __a, __b);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vdup_lane_p16 (poly16x4_t __a, const int __b)
+{
+ return (poly16x4_t)__builtin_neon_vdup_lanev4hi ((int16x4_t) __a, __b);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vdup_lane_s64 (int64x1_t __a, const int __b)
+{
+ return (int64x1_t)__builtin_neon_vdup_lanedi (__a, __b);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vdup_lane_u64 (uint64x1_t __a, const int __b)
+{
+ return (uint64x1_t)__builtin_neon_vdup_lanedi ((int64x1_t) __a, __b);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vdupq_lane_s8 (int8x8_t __a, const int __b)
+{
+ return (int8x16_t)__builtin_neon_vdup_lanev16qi (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vdupq_lane_s16 (int16x4_t __a, const int __b)
+{
+ return (int16x8_t)__builtin_neon_vdup_lanev8hi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vdupq_lane_s32 (int32x2_t __a, const int __b)
+{
+ return (int32x4_t)__builtin_neon_vdup_lanev4si (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vdupq_lane_f32 (float32x2_t __a, const int __b)
+{
+ return (float32x4_t)__builtin_neon_vdup_lanev4sf (__a, __b);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vdupq_lane_u8 (uint8x8_t __a, const int __b)
+{
+ return (uint8x16_t)__builtin_neon_vdup_lanev16qi ((int8x8_t) __a, __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vdupq_lane_u16 (uint16x4_t __a, const int __b)
+{
+ return (uint16x8_t)__builtin_neon_vdup_lanev8hi ((int16x4_t) __a, __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vdupq_lane_u32 (uint32x2_t __a, const int __b)
+{
+ return (uint32x4_t)__builtin_neon_vdup_lanev4si ((int32x2_t) __a, __b);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vdupq_lane_p8 (poly8x8_t __a, const int __b)
+{
+ return (poly8x16_t)__builtin_neon_vdup_lanev16qi ((int8x8_t) __a, __b);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vdupq_lane_p16 (poly16x4_t __a, const int __b)
+{
+ return (poly16x8_t)__builtin_neon_vdup_lanev8hi ((int16x4_t) __a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vdupq_lane_s64 (int64x1_t __a, const int __b)
+{
+ return (int64x2_t)__builtin_neon_vdup_lanev2di (__a, __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vdupq_lane_u64 (uint64x1_t __a, const int __b)
+{
+ return (uint64x2_t)__builtin_neon_vdup_lanev2di ((int64x1_t) __a, __b);
+}
+
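+/* [Editorial note, not part of the generated header: vdup_lane_* and
+   vdupq_lane_* broadcast one lane of a 64-bit source vector; the lane
+   index must be a constant.  Illustrative sketch:
+
+     int16x8_t bcast_lane3 (int16x4_t v)
+     {
+       return vdupq_lane_s16 (v, 3);
+     }
+
+   replicates lane 3 of v into all eight result lanes.] */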
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vcombine_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x16_t)__builtin_neon_vcombinev8qi (__a, __b);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vcombine_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x8_t)__builtin_neon_vcombinev4hi (__a, __b);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcombine_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x4_t)__builtin_neon_vcombinev2si (__a, __b);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vcombine_s64 (int64x1_t __a, int64x1_t __b)
+{
+ return (int64x2_t)__builtin_neon_vcombinedi (__a, __b);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcombine_f32 (float32x2_t __a, float32x2_t __b)
+{
+ return (float32x4_t)__builtin_neon_vcombinev2sf (__a, __b);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vcombine_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vcombinev8qi ((int8x8_t) __a, (int8x8_t) __b);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vcombine_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vcombinev4hi ((int16x4_t) __a, (int16x4_t) __b);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcombine_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vcombinev2si ((int32x2_t) __a, (int32x2_t) __b);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vcombine_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vcombinedi ((int64x1_t) __a, (int64x1_t) __b);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vcombine_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+ return (poly8x16_t)__builtin_neon_vcombinev8qi ((int8x8_t) __a, (int8x8_t) __b);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vcombine_p16 (poly16x4_t __a, poly16x4_t __b)
+{
+ return (poly16x8_t)__builtin_neon_vcombinev4hi ((int16x4_t) __a, (int16x4_t) __b);
+}
+
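+/* [Editorial note, not part of the generated header: vcombine_* places its
+   first operand in the low half and its second operand in the high half of
+   the 128-bit result, e.g.
+
+     int8x16_t q = vcombine_s8 (lo, hi);
+
+   for two int8x8_t values lo and hi.] */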
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vget_high_s8 (int8x16_t __a)
+{
+ return (int8x8_t)__builtin_neon_vget_highv16qi (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vget_high_s16 (int16x8_t __a)
+{
+ return (int16x4_t)__builtin_neon_vget_highv8hi (__a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vget_high_s32 (int32x4_t __a)
+{
+ return (int32x2_t)__builtin_neon_vget_highv4si (__a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vget_high_s64 (int64x2_t __a)
+{
+ return (int64x1_t)__builtin_neon_vget_highv2di (__a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vget_high_f32 (float32x4_t __a)
+{
+ return (float32x2_t)__builtin_neon_vget_highv4sf (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vget_high_u8 (uint8x16_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vget_highv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vget_high_u16 (uint16x8_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vget_highv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vget_high_u32 (uint32x4_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vget_highv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vget_high_u64 (uint64x2_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vget_highv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vget_high_p8 (poly8x16_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vget_highv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vget_high_p16 (poly16x8_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vget_highv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vget_low_s8 (int8x16_t __a)
+{
+ return (int8x8_t)__builtin_neon_vget_lowv16qi (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vget_low_s16 (int16x8_t __a)
+{
+ return (int16x4_t)__builtin_neon_vget_lowv8hi (__a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vget_low_s32 (int32x4_t __a)
+{
+ return (int32x2_t)__builtin_neon_vget_lowv4si (__a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vget_low_s64 (int64x2_t __a)
+{
+ return (int64x1_t)__builtin_neon_vget_lowv2di (__a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vget_low_f32 (float32x4_t __a)
+{
+ return (float32x2_t)__builtin_neon_vget_lowv4sf (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vget_low_u8 (uint8x16_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vget_lowv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vget_low_u16 (uint16x8_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vget_lowv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vget_low_u32 (uint32x4_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vget_lowv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vget_low_u64 (uint64x2_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vget_lowv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vget_low_p8 (poly8x16_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vget_lowv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vget_low_p16 (poly16x8_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vget_lowv8hi ((int16x8_t) __a);
+}
+
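+/* [Editorial note, not part of the generated header: vget_low_* and
+   vget_high_* are the inverse of vcombine_*, so splitting a quadword
+   vector and recombining the halves is a no-op:
+
+     int16x8_t roundtrip (int16x8_t q)
+     {
+       return vcombine_s16 (vget_low_s16 (q), vget_high_s16 (q));
+     }
+   ] */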
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvt_s32_f32 (float32x2_t __a)
+{
+ return (int32x2_t)__builtin_neon_vcvtv2sf (__a, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_f32_s32 (int32x2_t __a)
+{
+ return (float32x2_t)__builtin_neon_vcvtv2si (__a, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_f32_u32 (uint32x2_t __a)
+{
+ return (float32x2_t)__builtin_neon_vcvtv2si ((int32x2_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcvt_u32_f32 (float32x2_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vcvtv2sf (__a, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtq_s32_f32 (float32x4_t __a)
+{
+ return (int32x4_t)__builtin_neon_vcvtv4sf (__a, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_f32_s32 (int32x4_t __a)
+{
+ return (float32x4_t)__builtin_neon_vcvtv4si (__a, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_f32_u32 (uint32x4_t __a)
+{
+ return (float32x4_t)__builtin_neon_vcvtv4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcvtq_u32_f32 (float32x4_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vcvtv4sf (__a, 0);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vcvt_n_s32_f32 (float32x2_t __a, const int __b)
+{
+ return (int32x2_t)__builtin_neon_vcvt_nv2sf (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_n_f32_s32 (int32x2_t __a, const int __b)
+{
+ return (float32x2_t)__builtin_neon_vcvt_nv2si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vcvt_n_f32_u32 (uint32x2_t __a, const int __b)
+{
+ return (float32x2_t)__builtin_neon_vcvt_nv2si ((int32x2_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vcvt_n_u32_f32 (float32x2_t __a, const int __b)
+{
+ return (uint32x2_t)__builtin_neon_vcvt_nv2sf (__a, __b, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_s32_f32 (float32x4_t __a, const int __b)
+{
+ return (int32x4_t)__builtin_neon_vcvt_nv4sf (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_f32_s32 (int32x4_t __a, const int __b)
+{
+ return (float32x4_t)__builtin_neon_vcvt_nv4si (__a, __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_f32_u32 (uint32x4_t __a, const int __b)
+{
+ return (float32x4_t)__builtin_neon_vcvt_nv4si ((int32x4_t) __a, __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vcvtq_n_u32_f32 (float32x4_t __a, const int __b)
+{
+ return (uint32x4_t)__builtin_neon_vcvt_nv4sf (__a, __b, 0);
+}
+
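+/* [Editorial note, not part of the generated header: the plain vcvt*
+   intrinsics above convert between float and integer lanes (float to int
+   truncates toward zero); the vcvt*_n_* forms take a constant number of
+   fraction bits for fixed-point conversion, e.g.
+
+     int32x4_t q8 = vcvtq_n_s32_f32 (f, 8);
+
+   scales each lane of f by 2^8 before truncating.] */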
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vmovn_s16 (int16x8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vmovnv8hi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmovn_s32 (int32x4_t __a)
+{
+ return (int16x4_t)__builtin_neon_vmovnv4si (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmovn_s64 (int64x2_t __a)
+{
+ return (int32x2_t)__builtin_neon_vmovnv2di (__a, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vmovn_u16 (uint16x8_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vmovnv8hi ((int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmovn_u32 (uint32x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vmovnv4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmovn_u64 (uint64x2_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vmovnv2di ((int64x2_t) __a, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vqmovn_s16 (int16x8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vqmovnv8hi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqmovn_s32 (int32x4_t __a)
+{
+ return (int16x4_t)__builtin_neon_vqmovnv4si (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqmovn_s64 (int64x2_t __a)
+{
+ return (int32x2_t)__builtin_neon_vqmovnv2di (__a, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqmovn_u16 (uint16x8_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vqmovnv8hi ((int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqmovn_u32 (uint32x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vqmovnv4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqmovn_u64 (uint64x2_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vqmovnv2di ((int64x2_t) __a, 0);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vqmovun_s16 (int16x8_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vqmovunv8hi (__a, 1);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vqmovun_s32 (int32x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vqmovunv4si (__a, 1);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vqmovun_s64 (int64x2_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vqmovunv2di (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmovl_s8 (int8x8_t __a)
+{
+ return (int16x8_t)__builtin_neon_vmovlv8qi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmovl_s16 (int16x4_t __a)
+{
+ return (int32x4_t)__builtin_neon_vmovlv4hi (__a, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmovl_s32 (int32x2_t __a)
+{
+ return (int64x2_t)__builtin_neon_vmovlv2si (__a, 1);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmovl_u8 (uint8x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vmovlv8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmovl_u16 (uint16x4_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vmovlv4hi ((int16x4_t) __a, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmovl_u32 (uint32x2_t __a)
+{
+ return (uint64x2_t)__builtin_neon_vmovlv2si ((int32x2_t) __a, 0);
+}
+
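+/* [Editorial note, not part of the generated header: vmovn_* truncates each
+   lane to half width, the vqmovn_* and vqmovun_* forms saturate instead of
+   truncating, and vmovl_* widens with sign or zero extension.  Sketch, for
+   an int16x8_t value named wide:
+
+     int8x8_t  narrow = vqmovn_s16 (wide);    saturating narrow, 16 to 8 bits
+     int16x8_t again  = vmovl_s8 (narrow);    sign-extend back to 16 bits
+   ] */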
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbl1_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vtbl1v8qi (__a, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbl1_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vtbl1v8qi ((int8x8_t) __a, (int8x8_t) __b);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbl1_p8 (poly8x8_t __a, uint8x8_t __b)
+{
+ return (poly8x8_t)__builtin_neon_vtbl1v8qi ((int8x8_t) __a, (int8x8_t) __b);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbl2_s8 (int8x8x2_t __a, int8x8_t __b)
+{
+ union { int8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a };
+ return (int8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbl2_u8 (uint8x8x2_t __a, uint8x8_t __b)
+{
+ union { uint8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a };
+ return (uint8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, (int8x8_t) __b);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbl2_p8 (poly8x8x2_t __a, uint8x8_t __b)
+{
+ union { poly8x8x2_t __i; __builtin_neon_ti __o; } __au = { __a };
+ return (poly8x8_t)__builtin_neon_vtbl2v8qi (__au.__o, (int8x8_t) __b);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbl3_s8 (int8x8x3_t __a, int8x8_t __b)
+{
+ union { int8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a };
+ return (int8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbl3_u8 (uint8x8x3_t __a, uint8x8_t __b)
+{
+ union { uint8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a };
+ return (uint8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, (int8x8_t) __b);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbl3_p8 (poly8x8x3_t __a, uint8x8_t __b)
+{
+ union { poly8x8x3_t __i; __builtin_neon_ei __o; } __au = { __a };
+ return (poly8x8_t)__builtin_neon_vtbl3v8qi (__au.__o, (int8x8_t) __b);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbl4_s8 (int8x8x4_t __a, int8x8_t __b)
+{
+ union { int8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a };
+ return (int8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, __b);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbl4_u8 (uint8x8x4_t __a, uint8x8_t __b)
+{
+ union { uint8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a };
+ return (uint8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, (int8x8_t) __b);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbl4_p8 (poly8x8x4_t __a, uint8x8_t __b)
+{
+ union { poly8x8x4_t __i; __builtin_neon_oi __o; } __au = { __a };
+ return (poly8x8_t)__builtin_neon_vtbl4v8qi (__au.__o, (int8x8_t) __b);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx1_s8 (int8x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+ return (int8x8_t)__builtin_neon_vtbx1v8qi (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx1_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+ return (uint8x8_t)__builtin_neon_vtbx1v8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx1_p8 (poly8x8_t __a, poly8x8_t __b, uint8x8_t __c)
+{
+ return (poly8x8_t)__builtin_neon_vtbx1v8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx2_s8 (int8x8_t __a, int8x8x2_t __b, int8x8_t __c)
+{
+ union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ return (int8x8_t)__builtin_neon_vtbx2v8qi (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx2_u8 (uint8x8_t __a, uint8x8x2_t __b, uint8x8_t __c)
+{
+ union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ return (uint8x8_t)__builtin_neon_vtbx2v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx2_p8 (poly8x8_t __a, poly8x8x2_t __b, uint8x8_t __c)
+{
+ union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ return (poly8x8_t)__builtin_neon_vtbx2v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx3_s8 (int8x8_t __a, int8x8x3_t __b, int8x8_t __c)
+{
+ union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ return (int8x8_t)__builtin_neon_vtbx3v8qi (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx3_u8 (uint8x8_t __a, uint8x8x3_t __b, uint8x8_t __c)
+{
+ union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ return (uint8x8_t)__builtin_neon_vtbx3v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx3_p8 (poly8x8_t __a, poly8x8x3_t __b, uint8x8_t __c)
+{
+ union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ return (poly8x8_t)__builtin_neon_vtbx3v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vtbx4_s8 (int8x8_t __a, int8x8x4_t __b, int8x8_t __c)
+{
+ union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ return (int8x8_t)__builtin_neon_vtbx4v8qi (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vtbx4_u8 (uint8x8_t __a, uint8x8x4_t __b, uint8x8_t __c)
+{
+ union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ return (uint8x8_t)__builtin_neon_vtbx4v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vtbx4_p8 (poly8x8_t __a, poly8x8x4_t __b, uint8x8_t __c)
+{
+ union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ return (poly8x8_t)__builtin_neon_vtbx4v8qi ((int8x8_t) __a, __bu.__o, (int8x8_t) __c);
+}
+
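+/* [Editorial note, not part of the generated header: vtbl1 through vtbl4
+   perform a byte-wise table lookup over 8, 16, 24 or 32 table entries;
+   index bytes that are out of range produce zero.  The vtbx1 through vtbx4
+   forms instead leave the corresponding lane of the first operand
+   unchanged.  Illustrative sketch:
+
+     uint8x8_t lookup (uint8x8_t table, uint8x8_t idx)
+     {
+       return vtbl1_u8 (table, idx);
+     }
+   ] */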
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+ return (int16x4_t)__builtin_neon_vmul_lanev4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+ return (int32x2_t)__builtin_neon_vmul_lanev2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_lane_f32 (float32x2_t __a, float32x2_t __b, const int __c)
+{
+ return (float32x2_t)__builtin_neon_vmul_lanev2sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+ return (uint16x4_t)__builtin_neon_vmul_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+ return (uint32x2_t)__builtin_neon_vmul_lanev2si ((int32x2_t) __a, (int32x2_t) __b, __c, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+{
+ return (int16x8_t)__builtin_neon_vmul_lanev8hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+{
+ return (int32x4_t)__builtin_neon_vmul_lanev4si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_lane_f32 (float32x4_t __a, float32x2_t __b, const int __c)
+{
+ return (float32x4_t)__builtin_neon_vmul_lanev4sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_lane_u16 (uint16x8_t __a, uint16x4_t __b, const int __c)
+{
+ return (uint16x8_t)__builtin_neon_vmul_lanev8hi ((int16x8_t) __a, (int16x4_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_lane_u32 (uint32x4_t __a, uint32x2_t __b, const int __c)
+{
+ return (uint32x4_t)__builtin_neon_vmul_lanev4si ((int32x4_t) __a, (int32x2_t) __b, __c, 0);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmla_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+{
+ return (int16x4_t)__builtin_neon_vmla_lanev4hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmla_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+{
+ return (int32x2_t)__builtin_neon_vmla_lanev2si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmla_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __d)
+{
+ return (float32x2_t)__builtin_neon_vmla_lanev2sf (__a, __b, __c, __d, 5);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmla_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
+{
+ return (uint16x4_t)__builtin_neon_vmla_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmla_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
+{
+ return (uint32x2_t)__builtin_neon_vmla_lanev2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlaq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
+{
+ return (int16x8_t)__builtin_neon_vmla_lanev8hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlaq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
+{
+ return (int32x4_t)__builtin_neon_vmla_lanev4si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlaq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __d)
+{
+ return (float32x4_t)__builtin_neon_vmla_lanev4sf (__a, __b, __c, __d, 5);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlaq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d)
+{
+ return (uint16x8_t)__builtin_neon_vmla_lanev8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x4_t) __c, __d, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlaq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d)
+{
+ return (uint32x4_t)__builtin_neon_vmla_lanev4si ((int32x4_t) __a, (int32x4_t) __b, (int32x2_t) __c, __d, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+{
+ return (int32x4_t)__builtin_neon_vmlal_lanev4hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+{
+ return (int64x2_t)__builtin_neon_vmlal_lanev2si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlal_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
+{
+ return (uint32x4_t)__builtin_neon_vmlal_lanev4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmlal_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
+{
+ return (uint64x2_t)__builtin_neon_vmlal_lanev2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmlal_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+{
+ return (int32x4_t)__builtin_neon_vqdmlal_lanev4hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmlal_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+{
+ return (int64x2_t)__builtin_neon_vqdmlal_lanev2si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmls_lane_s16 (int16x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+{
+ return (int16x4_t)__builtin_neon_vmls_lanev4hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmls_lane_s32 (int32x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+{
+ return (int32x2_t)__builtin_neon_vmls_lanev2si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmls_lane_f32 (float32x2_t __a, float32x2_t __b, float32x2_t __c, const int __d)
+{
+ return (float32x2_t)__builtin_neon_vmls_lanev2sf (__a, __b, __c, __d, 5);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmls_lane_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
+{
+ return (uint16x4_t)__builtin_neon_vmls_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmls_lane_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
+{
+ return (uint32x2_t)__builtin_neon_vmls_lanev2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlsq_lane_s16 (int16x8_t __a, int16x8_t __b, int16x4_t __c, const int __d)
+{
+ return (int16x8_t)__builtin_neon_vmls_lanev8hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsq_lane_s32 (int32x4_t __a, int32x4_t __b, int32x2_t __c, const int __d)
+{
+ return (int32x4_t)__builtin_neon_vmls_lanev4si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlsq_lane_f32 (float32x4_t __a, float32x4_t __b, float32x2_t __c, const int __d)
+{
+ return (float32x4_t)__builtin_neon_vmls_lanev4sf (__a, __b, __c, __d, 5);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlsq_lane_u16 (uint16x8_t __a, uint16x8_t __b, uint16x4_t __c, const int __d)
+{
+ return (uint16x8_t)__builtin_neon_vmls_lanev8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x4_t) __c, __d, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsq_lane_u32 (uint32x4_t __a, uint32x4_t __b, uint32x2_t __c, const int __d)
+{
+ return (uint32x4_t)__builtin_neon_vmls_lanev4si ((int32x4_t) __a, (int32x4_t) __b, (int32x2_t) __c, __d, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+{
+ return (int32x4_t)__builtin_neon_vmlsl_lanev4hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+{
+ return (int64x2_t)__builtin_neon_vmlsl_lanev2si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsl_lane_u16 (uint32x4_t __a, uint16x4_t __b, uint16x4_t __c, const int __d)
+{
+ return (uint32x4_t)__builtin_neon_vmlsl_lanev4hi ((int32x4_t) __a, (int16x4_t) __b, (int16x4_t) __c, __d, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmlsl_lane_u32 (uint64x2_t __a, uint32x2_t __b, uint32x2_t __c, const int __d)
+{
+ return (uint64x2_t)__builtin_neon_vmlsl_lanev2si ((int64x2_t) __a, (int32x2_t) __b, (int32x2_t) __c, __d, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmlsl_lane_s16 (int32x4_t __a, int16x4_t __b, int16x4_t __c, const int __d)
+{
+ return (int32x4_t)__builtin_neon_vqdmlsl_lanev4hi (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmlsl_lane_s32 (int64x2_t __a, int32x2_t __b, int32x2_t __c, const int __d)
+{
+ return (int64x2_t)__builtin_neon_vqdmlsl_lanev2si (__a, __b, __c, __d, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+ return (int32x4_t)__builtin_neon_vmull_lanev4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+ return (int64x2_t)__builtin_neon_vmull_lanev2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_lane_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+ return (uint32x4_t)__builtin_neon_vmull_lanev4hi ((int16x4_t) __a, (int16x4_t) __b, __c, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_lane_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+ return (uint64x2_t)__builtin_neon_vmull_lanev2si ((int32x2_t) __a, (int32x2_t) __b, __c, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmull_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+ return (int32x4_t)__builtin_neon_vqdmull_lanev4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmull_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+ return (int64x2_t)__builtin_neon_vqdmull_lanev2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+{
+ return (int16x8_t)__builtin_neon_vqdmulh_lanev8hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+{
+ return (int32x4_t)__builtin_neon_vqdmulh_lanev4si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+ return (int16x4_t)__builtin_neon_vqdmulh_lanev4hi (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+ return (int32x2_t)__builtin_neon_vqdmulh_lanev2si (__a, __b, __c, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqrdmulhq_lane_s16 (int16x8_t __a, int16x4_t __b, const int __c)
+{
+ return (int16x8_t)__builtin_neon_vqdmulh_lanev8hi (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqrdmulhq_lane_s32 (int32x4_t __a, int32x2_t __b, const int __c)
+{
+ return (int32x4_t)__builtin_neon_vqdmulh_lanev4si (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqrdmulh_lane_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+ return (int16x4_t)__builtin_neon_vqdmulh_lanev4hi (__a, __b, __c, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqrdmulh_lane_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+ return (int32x2_t)__builtin_neon_vqdmulh_lanev2si (__a, __b, __c, 3);
+}
+
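+/* [Editorial note, not part of the generated header: the *_lane_* multiply,
+   multiply-accumulate and saturating-doubling forms above take a constant
+   lane index selecting one scalar coefficient from a 64-bit vector, e.g.
+
+     int32x4_t acc2 = vmlal_lane_s16 (acc, x, coeffs, 0);
+
+   widens each lane of x, multiplies it by lane 0 of coeffs and adds the
+   products to acc.] */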
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmul_n_s16 (int16x4_t __a, int16_t __b)
+{
+ return (int16x4_t)__builtin_neon_vmul_nv4hi (__a, (__builtin_neon_hi) __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmul_n_s32 (int32x2_t __a, int32_t __b)
+{
+ return (int32x2_t)__builtin_neon_vmul_nv2si (__a, (__builtin_neon_si) __b, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmul_n_f32 (float32x2_t __a, float32_t __b)
+{
+ return (float32x2_t)__builtin_neon_vmul_nv2sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmul_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vmul_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmul_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vmul_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmulq_n_s16 (int16x8_t __a, int16_t __b)
+{
+ return (int16x8_t)__builtin_neon_vmul_nv8hi (__a, (__builtin_neon_hi) __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmulq_n_s32 (int32x4_t __a, int32_t __b)
+{
+ return (int32x4_t)__builtin_neon_vmul_nv4si (__a, (__builtin_neon_si) __b, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmulq_n_f32 (float32x4_t __a, float32_t __b)
+{
+ return (float32x4_t)__builtin_neon_vmul_nv4sf (__a, __b, 5);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmulq_n_u16 (uint16x8_t __a, uint16_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vmul_nv8hi ((int16x8_t) __a, (__builtin_neon_hi) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmulq_n_u32 (uint32x4_t __a, uint32_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vmul_nv4si ((int32x4_t) __a, (__builtin_neon_si) __b, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmull_n_s16 (int16x4_t __a, int16_t __b)
+{
+ return (int32x4_t)__builtin_neon_vmull_nv4hi (__a, (__builtin_neon_hi) __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmull_n_s32 (int32x2_t __a, int32_t __b)
+{
+ return (int64x2_t)__builtin_neon_vmull_nv2si (__a, (__builtin_neon_si) __b, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmull_n_u16 (uint16x4_t __a, uint16_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vmull_nv4hi ((int16x4_t) __a, (__builtin_neon_hi) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmull_n_u32 (uint32x2_t __a, uint32_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vmull_nv2si ((int32x2_t) __a, (__builtin_neon_si) __b, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmull_n_s16 (int16x4_t __a, int16_t __b)
+{
+ return (int32x4_t)__builtin_neon_vqdmull_nv4hi (__a, (__builtin_neon_hi) __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmull_n_s32 (int32x2_t __a, int32_t __b)
+{
+ return (int64x2_t)__builtin_neon_vqdmull_nv2si (__a, (__builtin_neon_si) __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqdmulhq_n_s16 (int16x8_t __a, int16_t __b)
+{
+ return (int16x8_t)__builtin_neon_vqdmulh_nv8hi (__a, (__builtin_neon_hi) __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmulhq_n_s32 (int32x4_t __a, int32_t __b)
+{
+ return (int32x4_t)__builtin_neon_vqdmulh_nv4si (__a, (__builtin_neon_si) __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqdmulh_n_s16 (int16x4_t __a, int16_t __b)
+{
+ return (int16x4_t)__builtin_neon_vqdmulh_nv4hi (__a, (__builtin_neon_hi) __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqdmulh_n_s32 (int32x2_t __a, int32_t __b)
+{
+ return (int32x2_t)__builtin_neon_vqdmulh_nv2si (__a, (__builtin_neon_si) __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vqrdmulhq_n_s16 (int16x8_t __a, int16_t __b)
+{
+ return (int16x8_t)__builtin_neon_vqdmulh_nv8hi (__a, (__builtin_neon_hi) __b, 3);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqrdmulhq_n_s32 (int32x4_t __a, int32_t __b)
+{
+ return (int32x4_t)__builtin_neon_vqdmulh_nv4si (__a, (__builtin_neon_si) __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vqrdmulh_n_s16 (int16x4_t __a, int16_t __b)
+{
+ return (int16x4_t)__builtin_neon_vqdmulh_nv4hi (__a, (__builtin_neon_hi) __b, 3);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vqrdmulh_n_s32 (int32x2_t __a, int32_t __b)
+{
+ return (int32x2_t)__builtin_neon_vqdmulh_nv2si (__a, (__builtin_neon_si) __b, 3);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmla_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c)
+{
+ return (int16x4_t)__builtin_neon_vmla_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmla_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
+{
+ return (int32x2_t)__builtin_neon_vmla_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmla_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
+{
+ return (float32x2_t)__builtin_neon_vmla_nv2sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmla_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c)
+{
+ return (uint16x4_t)__builtin_neon_vmla_nv4hi ((int16x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmla_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c)
+{
+ return (uint32x2_t)__builtin_neon_vmla_nv2si ((int32x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlaq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c)
+{
+ return (int16x8_t)__builtin_neon_vmla_nv8hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlaq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c)
+{
+ return (int32x4_t)__builtin_neon_vmla_nv4si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlaq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
+{
+ return (float32x4_t)__builtin_neon_vmla_nv4sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlaq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c)
+{
+ return (uint16x8_t)__builtin_neon_vmla_nv8hi ((int16x8_t) __a, (int16x8_t) __b, (__builtin_neon_hi) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlaq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c)
+{
+ return (uint32x4_t)__builtin_neon_vmla_nv4si ((int32x4_t) __a, (int32x4_t) __b, (__builtin_neon_si) __c, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
+{
+ return (int32x4_t)__builtin_neon_vmlal_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
+{
+ return (int64x2_t)__builtin_neon_vmlal_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlal_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c)
+{
+ return (uint32x4_t)__builtin_neon_vmlal_nv4hi ((int32x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmlal_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c)
+{
+ return (uint64x2_t)__builtin_neon_vmlal_nv2si ((int64x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmlal_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
+{
+ return (int32x4_t)__builtin_neon_vqdmlal_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmlal_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
+{
+ return (int64x2_t)__builtin_neon_vqdmlal_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vmls_n_s16 (int16x4_t __a, int16x4_t __b, int16_t __c)
+{
+ return (int16x4_t)__builtin_neon_vmls_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vmls_n_s32 (int32x2_t __a, int32x2_t __b, int32_t __c)
+{
+ return (int32x2_t)__builtin_neon_vmls_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vmls_n_f32 (float32x2_t __a, float32x2_t __b, float32_t __c)
+{
+ return (float32x2_t)__builtin_neon_vmls_nv2sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vmls_n_u16 (uint16x4_t __a, uint16x4_t __b, uint16_t __c)
+{
+ return (uint16x4_t)__builtin_neon_vmls_nv4hi ((int16x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vmls_n_u32 (uint32x2_t __a, uint32x2_t __b, uint32_t __c)
+{
+ return (uint32x2_t)__builtin_neon_vmls_nv2si ((int32x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vmlsq_n_s16 (int16x8_t __a, int16x8_t __b, int16_t __c)
+{
+ return (int16x8_t)__builtin_neon_vmls_nv8hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsq_n_s32 (int32x4_t __a, int32x4_t __b, int32_t __c)
+{
+ return (int32x4_t)__builtin_neon_vmls_nv4si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vmlsq_n_f32 (float32x4_t __a, float32x4_t __b, float32_t __c)
+{
+ return (float32x4_t)__builtin_neon_vmls_nv4sf (__a, __b, __c, 5);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vmlsq_n_u16 (uint16x8_t __a, uint16x8_t __b, uint16_t __c)
+{
+ return (uint16x8_t)__builtin_neon_vmls_nv8hi ((int16x8_t) __a, (int16x8_t) __b, (__builtin_neon_hi) __c, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsq_n_u32 (uint32x4_t __a, uint32x4_t __b, uint32_t __c)
+{
+ return (uint32x4_t)__builtin_neon_vmls_nv4si ((int32x4_t) __a, (int32x4_t) __b, (__builtin_neon_si) __c, 0);
+}
+
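+/* vmlsl_n: widening multiply-subtract with a scalar
+   (__a - __b * __c, with the product widened).  */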
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
+{
+ return (int32x4_t)__builtin_neon_vmlsl_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
+{
+ return (int64x2_t)__builtin_neon_vmlsl_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vmlsl_n_u16 (uint32x4_t __a, uint16x4_t __b, uint16_t __c)
+{
+ return (uint32x4_t)__builtin_neon_vmlsl_nv4hi ((int32x4_t) __a, (int16x4_t) __b, (__builtin_neon_hi) __c, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vmlsl_n_u32 (uint64x2_t __a, uint32x2_t __b, uint32_t __c)
+{
+ return (uint64x2_t)__builtin_neon_vmlsl_nv2si ((int64x2_t) __a, (int32x2_t) __b, (__builtin_neon_si) __c, 0);
+}
+
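+/* vqdmlsl_n: saturating doubling multiply-subtract long with a scalar
+   (__a - 2 * __b * __c, with the product widened and saturated).  */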
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vqdmlsl_n_s16 (int32x4_t __a, int16x4_t __b, int16_t __c)
+{
+ return (int32x4_t)__builtin_neon_vqdmlsl_nv4hi (__a, __b, (__builtin_neon_hi) __c, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vqdmlsl_n_s32 (int64x2_t __a, int32x2_t __b, int32_t __c)
+{
+ return (int64x2_t)__builtin_neon_vqdmlsl_nv2si (__a, __b, (__builtin_neon_si) __c, 1);
+}
+
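+/* vext/vextq: extract a vector from the concatenation of __a (low part)
+   and __b (high part), starting at lane __c; __c must be a constant.  */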
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vext_s8 (int8x8_t __a, int8x8_t __b, const int __c)
+{
+ return (int8x8_t)__builtin_neon_vextv8qi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vext_s16 (int16x4_t __a, int16x4_t __b, const int __c)
+{
+ return (int16x4_t)__builtin_neon_vextv4hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vext_s32 (int32x2_t __a, int32x2_t __b, const int __c)
+{
+ return (int32x2_t)__builtin_neon_vextv2si (__a, __b, __c);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vext_s64 (int64x1_t __a, int64x1_t __b, const int __c)
+{
+ return (int64x1_t)__builtin_neon_vextdi (__a, __b, __c);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vext_f32 (float32x2_t __a, float32x2_t __b, const int __c)
+{
+ return (float32x2_t)__builtin_neon_vextv2sf (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vext_u8 (uint8x8_t __a, uint8x8_t __b, const int __c)
+{
+ return (uint8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vext_u16 (uint16x4_t __a, uint16x4_t __b, const int __c)
+{
+ return (uint16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vext_u32 (uint32x2_t __a, uint32x2_t __b, const int __c)
+{
+ return (uint32x2_t)__builtin_neon_vextv2si ((int32x2_t) __a, (int32x2_t) __b, __c);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vext_u64 (uint64x1_t __a, uint64x1_t __b, const int __c)
+{
+ return (uint64x1_t)__builtin_neon_vextdi ((int64x1_t) __a, (int64x1_t) __b, __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vext_p8 (poly8x8_t __a, poly8x8_t __b, const int __c)
+{
+ return (poly8x8_t)__builtin_neon_vextv8qi ((int8x8_t) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vext_p16 (poly16x4_t __a, poly16x4_t __b, const int __c)
+{
+ return (poly16x4_t)__builtin_neon_vextv4hi ((int16x4_t) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vextq_s8 (int8x16_t __a, int8x16_t __b, const int __c)
+{
+ return (int8x16_t)__builtin_neon_vextv16qi (__a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vextq_s16 (int16x8_t __a, int16x8_t __b, const int __c)
+{
+ return (int16x8_t)__builtin_neon_vextv8hi (__a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vextq_s32 (int32x4_t __a, int32x4_t __b, const int __c)
+{
+ return (int32x4_t)__builtin_neon_vextv4si (__a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vextq_s64 (int64x2_t __a, int64x2_t __b, const int __c)
+{
+ return (int64x2_t)__builtin_neon_vextv2di (__a, __b, __c);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vextq_f32 (float32x4_t __a, float32x4_t __b, const int __c)
+{
+ return (float32x4_t)__builtin_neon_vextv4sf (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vextq_u8 (uint8x16_t __a, uint8x16_t __b, const int __c)
+{
+ return (uint8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vextq_u16 (uint16x8_t __a, uint16x8_t __b, const int __c)
+{
+ return (uint16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vextq_u32 (uint32x4_t __a, uint32x4_t __b, const int __c)
+{
+ return (uint32x4_t)__builtin_neon_vextv4si ((int32x4_t) __a, (int32x4_t) __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vextq_u64 (uint64x2_t __a, uint64x2_t __b, const int __c)
+{
+ return (uint64x2_t)__builtin_neon_vextv2di ((int64x2_t) __a, (int64x2_t) __b, __c);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vextq_p8 (poly8x16_t __a, poly8x16_t __b, const int __c)
+{
+ return (poly8x16_t)__builtin_neon_vextv16qi ((int8x16_t) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vextq_p16 (poly16x8_t __a, poly16x8_t __b, const int __c)
+{
+ return (poly16x8_t)__builtin_neon_vextv8hi ((int16x8_t) __a, (int16x8_t) __b, __c);
+}
+
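+/* vrev64/vrev64q: reverse the order of the elements within each
+   64-bit doubleword.  */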
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrev64_s8 (int8x8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vrev64v8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrev64_s16 (int16x4_t __a)
+{
+ return (int16x4_t)__builtin_neon_vrev64v4hi (__a, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vrev64_s32 (int32x2_t __a)
+{
+ return (int32x2_t)__builtin_neon_vrev64v2si (__a, 1);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vrev64_f32 (float32x2_t __a)
+{
+ return (float32x2_t)__builtin_neon_vrev64v2sf (__a, 5);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrev64_u8 (uint8x8_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrev64_u16 (uint16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vrev64_u32 (uint32x2_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vrev64v2si ((int32x2_t) __a, 0);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vrev64_p8 (poly8x8_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vrev64v8qi ((int8x8_t) __a, 4);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vrev64_p16 (poly16x4_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vrev64v4hi ((int16x4_t) __a, 4);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrev64q_s8 (int8x16_t __a)
+{
+ return (int8x16_t)__builtin_neon_vrev64v16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrev64q_s16 (int16x8_t __a)
+{
+ return (int16x8_t)__builtin_neon_vrev64v8hi (__a, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vrev64q_s32 (int32x4_t __a)
+{
+ return (int32x4_t)__builtin_neon_vrev64v4si (__a, 1);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vrev64q_f32 (float32x4_t __a)
+{
+ return (float32x4_t)__builtin_neon_vrev64v4sf (__a, 5);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrev64q_u8 (uint8x16_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrev64q_u16 (uint16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vrev64q_u32 (uint32x4_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vrev64v4si ((int32x4_t) __a, 0);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vrev64q_p8 (poly8x16_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vrev64v16qi ((int8x16_t) __a, 4);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vrev64q_p16 (poly16x8_t __a)
+{
+ return (poly16x8_t)__builtin_neon_vrev64v8hi ((int16x8_t) __a, 4);
+}
+
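+/* vrev32/vrev32q: reverse the order of the elements within each
+   32-bit word.  */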
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrev32_s8 (int8x8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vrev32v8qi (__a, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vrev32_s16 (int16x4_t __a)
+{
+ return (int16x4_t)__builtin_neon_vrev32v4hi (__a, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrev32_u8 (uint8x8_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vrev32_u16 (uint16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 0);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vrev32_p8 (poly8x8_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vrev32v8qi ((int8x8_t) __a, 4);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vrev32_p16 (poly16x4_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vrev32v4hi ((int16x4_t) __a, 4);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrev32q_s8 (int8x16_t __a)
+{
+ return (int8x16_t)__builtin_neon_vrev32v16qi (__a, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vrev32q_s16 (int16x8_t __a)
+{
+ return (int16x8_t)__builtin_neon_vrev32v8hi (__a, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrev32q_u8 (uint8x16_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vrev32q_u16 (uint16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 0);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vrev32q_p8 (poly8x16_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vrev32v16qi ((int8x16_t) __a, 4);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vrev32q_p16 (poly16x8_t __a)
+{
+ return (poly16x8_t)__builtin_neon_vrev32v8hi ((int16x8_t) __a, 4);
+}
+
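+/* vrev16/vrev16q: reverse the order of the bytes within each
+   16-bit halfword.  */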
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vrev16_s8 (int8x8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vrev16v8qi (__a, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vrev16_u8 (uint8x8_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 0);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vrev16_p8 (poly8x8_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vrev16v8qi ((int8x8_t) __a, 4);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vrev16q_s8 (int8x16_t __a)
+{
+ return (int8x16_t)__builtin_neon_vrev16v16qi (__a, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vrev16q_u8 (uint8x16_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 0);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vrev16q_p8 (poly8x16_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vrev16v16qi ((int8x16_t) __a, 4);
+}
+
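+/* vbsl/vbslq: bitwise select; each result bit is taken from __b where the
+   corresponding bit of the mask __a is set, and from __c otherwise.  */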
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vbsl_s8 (uint8x8_t __a, int8x8_t __b, int8x8_t __c)
+{
+ return (int8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, __b, __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vbsl_s16 (uint16x4_t __a, int16x4_t __b, int16x4_t __c)
+{
+ return (int16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vbsl_s32 (uint32x2_t __a, int32x2_t __b, int32x2_t __c)
+{
+ return (int32x2_t)__builtin_neon_vbslv2si ((int32x2_t) __a, __b, __c);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vbsl_s64 (uint64x1_t __a, int64x1_t __b, int64x1_t __c)
+{
+ return (int64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, __b, __c);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vbsl_f32 (uint32x2_t __a, float32x2_t __b, float32x2_t __c)
+{
+ return (float32x2_t)__builtin_neon_vbslv2sf ((int32x2_t) __a, __b, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vbsl_u8 (uint8x8_t __a, uint8x8_t __b, uint8x8_t __c)
+{
+ return (uint8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vbsl_u16 (uint16x4_t __a, uint16x4_t __b, uint16x4_t __c)
+{
+ return (uint16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vbsl_u32 (uint32x2_t __a, uint32x2_t __b, uint32x2_t __c)
+{
+ return (uint32x2_t)__builtin_neon_vbslv2si ((int32x2_t) __a, (int32x2_t) __b, (int32x2_t) __c);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vbsl_u64 (uint64x1_t __a, uint64x1_t __b, uint64x1_t __c)
+{
+ return (uint64x1_t)__builtin_neon_vbsldi ((int64x1_t) __a, (int64x1_t) __b, (int64x1_t) __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vbsl_p8 (uint8x8_t __a, poly8x8_t __b, poly8x8_t __c)
+{
+ return (poly8x8_t)__builtin_neon_vbslv8qi ((int8x8_t) __a, (int8x8_t) __b, (int8x8_t) __c);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vbsl_p16 (uint16x4_t __a, poly16x4_t __b, poly16x4_t __c)
+{
+ return (poly16x4_t)__builtin_neon_vbslv4hi ((int16x4_t) __a, (int16x4_t) __b, (int16x4_t) __c);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vbslq_s8 (uint8x16_t __a, int8x16_t __b, int8x16_t __c)
+{
+ return (int8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vbslq_s16 (uint16x8_t __a, int16x8_t __b, int16x8_t __c)
+{
+ return (int16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vbslq_s32 (uint32x4_t __a, int32x4_t __b, int32x4_t __c)
+{
+ return (int32x4_t)__builtin_neon_vbslv4si ((int32x4_t) __a, __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vbslq_s64 (uint64x2_t __a, int64x2_t __b, int64x2_t __c)
+{
+ return (int64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, __b, __c);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vbslq_f32 (uint32x4_t __a, float32x4_t __b, float32x4_t __c)
+{
+ return (float32x4_t)__builtin_neon_vbslv4sf ((int32x4_t) __a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vbslq_u8 (uint8x16_t __a, uint8x16_t __b, uint8x16_t __c)
+{
+ return (uint8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vbslq_u16 (uint16x8_t __a, uint16x8_t __b, uint16x8_t __c)
+{
+ return (uint16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vbslq_u32 (uint32x4_t __a, uint32x4_t __b, uint32x4_t __c)
+{
+ return (uint32x4_t)__builtin_neon_vbslv4si ((int32x4_t) __a, (int32x4_t) __b, (int32x4_t) __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vbslq_u64 (uint64x2_t __a, uint64x2_t __b, uint64x2_t __c)
+{
+ return (uint64x2_t)__builtin_neon_vbslv2di ((int64x2_t) __a, (int64x2_t) __b, (int64x2_t) __c);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vbslq_p8 (uint8x16_t __a, poly8x16_t __b, poly8x16_t __c)
+{
+ return (poly8x16_t)__builtin_neon_vbslv16qi ((int8x16_t) __a, (int8x16_t) __b, (int8x16_t) __c);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vbslq_p16 (uint16x8_t __a, poly16x8_t __b, poly16x8_t __c)
+{
+ return (poly16x8_t)__builtin_neon_vbslv8hi ((int16x8_t) __a, (int16x8_t) __b, (int16x8_t) __c);
+}
+
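+/* vtrn/vtrnq: transpose elements; val[0] interleaves the even-indexed
+   lanes of __a and __b, val[1] the odd-indexed lanes.  */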
+__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+vtrn_s8 (int8x8_t __a, int8x8_t __b)
+{
+ int8x8x2_t __rv;
+ __builtin_neon_vtrnv8qi (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+vtrn_s16 (int16x4_t __a, int16x4_t __b)
+{
+ int16x4x2_t __rv;
+ __builtin_neon_vtrnv4hi (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vtrn_s32 (int32x2_t __a, int32x2_t __b)
+{
+ int32x2x2_t __rv;
+ __builtin_neon_vtrnv2si (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vtrn_f32 (float32x2_t __a, float32x2_t __b)
+{
+ float32x2x2_t __rv;
+ __builtin_neon_vtrnv2sf (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+vtrn_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ uint8x8x2_t __rv;
+ __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+vtrn_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ uint16x4x2_t __rv;
+ __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vtrn_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ uint32x2x2_t __rv;
+ __builtin_neon_vtrnv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+vtrn_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+ poly8x8x2_t __rv;
+ __builtin_neon_vtrnv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+vtrn_p16 (poly16x4_t __a, poly16x4_t __b)
+{
+ poly16x4x2_t __rv;
+ __builtin_neon_vtrnv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
+vtrnq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ int8x16x2_t __rv;
+ __builtin_neon_vtrnv16qi (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
+vtrnq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ int16x8x2_t __rv;
+ __builtin_neon_vtrnv8hi (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
+vtrnq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ int32x4x2_t __rv;
+ __builtin_neon_vtrnv4si (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
+vtrnq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ float32x4x2_t __rv;
+ __builtin_neon_vtrnv4sf (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
+vtrnq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ uint8x16x2_t __rv;
+ __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
+vtrnq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ uint16x8x2_t __rv;
+ __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
+vtrnq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ uint32x4x2_t __rv;
+ __builtin_neon_vtrnv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
+vtrnq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+ poly8x16x2_t __rv;
+ __builtin_neon_vtrnv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
+vtrnq_p16 (poly16x8_t __a, poly16x8_t __b)
+{
+ poly16x8x2_t __rv;
+ __builtin_neon_vtrnv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ return __rv;
+}
+
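+/* vzip/vzipq: interleave (zip) the elements of __a and __b; val[0] holds
+   the interleaved low halves, val[1] the interleaved high halves.  */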
+__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+vzip_s8 (int8x8_t __a, int8x8_t __b)
+{
+ int8x8x2_t __rv;
+ __builtin_neon_vzipv8qi (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+vzip_s16 (int16x4_t __a, int16x4_t __b)
+{
+ int16x4x2_t __rv;
+ __builtin_neon_vzipv4hi (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vzip_s32 (int32x2_t __a, int32x2_t __b)
+{
+ int32x2x2_t __rv;
+ __builtin_neon_vzipv2si (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vzip_f32 (float32x2_t __a, float32x2_t __b)
+{
+ float32x2x2_t __rv;
+ __builtin_neon_vzipv2sf (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+vzip_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ uint8x8x2_t __rv;
+ __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+vzip_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ uint16x4x2_t __rv;
+ __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vzip_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ uint32x2x2_t __rv;
+ __builtin_neon_vzipv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+vzip_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+ poly8x8x2_t __rv;
+ __builtin_neon_vzipv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+vzip_p16 (poly16x4_t __a, poly16x4_t __b)
+{
+ poly16x4x2_t __rv;
+ __builtin_neon_vzipv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
+vzipq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ int8x16x2_t __rv;
+ __builtin_neon_vzipv16qi (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
+vzipq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ int16x8x2_t __rv;
+ __builtin_neon_vzipv8hi (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
+vzipq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ int32x4x2_t __rv;
+ __builtin_neon_vzipv4si (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
+vzipq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ float32x4x2_t __rv;
+ __builtin_neon_vzipv4sf (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
+vzipq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ uint8x16x2_t __rv;
+ __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
+vzipq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ uint16x8x2_t __rv;
+ __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
+vzipq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ uint32x4x2_t __rv;
+ __builtin_neon_vzipv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
+vzipq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+ poly8x16x2_t __rv;
+ __builtin_neon_vzipv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
+vzipq_p16 (poly16x8_t __a, poly16x8_t __b)
+{
+ poly16x8x2_t __rv;
+ __builtin_neon_vzipv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ return __rv;
+}
+
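+/* vuzp/vuzpq: de-interleave (unzip); val[0] gathers the even-indexed
+   elements of __a and __b, val[1] the odd-indexed elements.  */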
+__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+vuzp_s8 (int8x8_t __a, int8x8_t __b)
+{
+ int8x8x2_t __rv;
+ __builtin_neon_vuzpv8qi (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+vuzp_s16 (int16x4_t __a, int16x4_t __b)
+{
+ int16x4x2_t __rv;
+ __builtin_neon_vuzpv4hi (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vuzp_s32 (int32x2_t __a, int32x2_t __b)
+{
+ int32x2x2_t __rv;
+ __builtin_neon_vuzpv2si (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vuzp_f32 (float32x2_t __a, float32x2_t __b)
+{
+ float32x2x2_t __rv;
+ __builtin_neon_vuzpv2sf (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+vuzp_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ uint8x8x2_t __rv;
+ __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+vuzp_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ uint16x4x2_t __rv;
+ __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vuzp_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ uint32x2x2_t __rv;
+ __builtin_neon_vuzpv2si ((int32x2_t *) &__rv.val[0], (int32x2_t) __a, (int32x2_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+vuzp_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+ poly8x8x2_t __rv;
+ __builtin_neon_vuzpv8qi ((int8x8_t *) &__rv.val[0], (int8x8_t) __a, (int8x8_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+vuzp_p16 (poly16x4_t __a, poly16x4_t __b)
+{
+ poly16x4x2_t __rv;
+ __builtin_neon_vuzpv4hi ((int16x4_t *) &__rv.val[0], (int16x4_t) __a, (int16x4_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
+vuzpq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ int8x16x2_t __rv;
+ __builtin_neon_vuzpv16qi (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
+vuzpq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ int16x8x2_t __rv;
+ __builtin_neon_vuzpv8hi (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
+vuzpq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ int32x4x2_t __rv;
+ __builtin_neon_vuzpv4si (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
+vuzpq_f32 (float32x4_t __a, float32x4_t __b)
+{
+ float32x4x2_t __rv;
+ __builtin_neon_vuzpv4sf (&__rv.val[0], __a, __b);
+ return __rv;
+}
+
+__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
+vuzpq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ uint8x16x2_t __rv;
+ __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
+vuzpq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ uint16x8x2_t __rv;
+ __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
+vuzpq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ uint32x4x2_t __rv;
+ __builtin_neon_vuzpv4si ((int32x4_t *) &__rv.val[0], (int32x4_t) __a, (int32x4_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
+vuzpq_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+ poly8x16x2_t __rv;
+ __builtin_neon_vuzpv16qi ((int8x16_t *) &__rv.val[0], (int8x16_t) __a, (int8x16_t) __b);
+ return __rv;
+}
+
+__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
+vuzpq_p16 (poly16x8_t __a, poly16x8_t __b)
+{
+ poly16x8x2_t __rv;
+ __builtin_neon_vuzpv8hi ((int16x8_t *) &__rv.val[0], (int16x8_t) __a, (int16x8_t) __b);
+ return __rv;
+}
+
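+/* vld1/vld1q: load contiguous elements from memory into one vector.  */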
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vld1_s8 (const int8_t * __a)
+{
+ return (int8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vld1_s16 (const int16_t * __a)
+{
+ return (int16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vld1_s32 (const int32_t * __a)
+{
+ return (int32x2_t)__builtin_neon_vld1v2si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vld1_s64 (const int64_t * __a)
+{
+ return (int64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vld1_f32 (const float32_t * __a)
+{
+ return (float32x2_t)__builtin_neon_vld1v2sf (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vld1_u8 (const uint8_t * __a)
+{
+ return (uint8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vld1_u16 (const uint16_t * __a)
+{
+ return (uint16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vld1_u32 (const uint32_t * __a)
+{
+ return (uint32x2_t)__builtin_neon_vld1v2si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vld1_u64 (const uint64_t * __a)
+{
+ return (uint64x1_t)__builtin_neon_vld1di ((const __builtin_neon_di *) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vld1_p8 (const poly8_t * __a)
+{
+ return (poly8x8_t)__builtin_neon_vld1v8qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vld1_p16 (const poly16_t * __a)
+{
+ return (poly16x4_t)__builtin_neon_vld1v4hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vld1q_s8 (const int8_t * __a)
+{
+ return (int8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vld1q_s16 (const int16_t * __a)
+{
+ return (int16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vld1q_s32 (const int32_t * __a)
+{
+ return (int32x4_t)__builtin_neon_vld1v4si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vld1q_s64 (const int64_t * __a)
+{
+ return (int64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vld1q_f32 (const float32_t * __a)
+{
+ return (float32x4_t)__builtin_neon_vld1v4sf (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vld1q_u8 (const uint8_t * __a)
+{
+ return (uint8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vld1q_u16 (const uint16_t * __a)
+{
+ return (uint16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vld1q_u32 (const uint32_t * __a)
+{
+ return (uint32x4_t)__builtin_neon_vld1v4si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vld1q_u64 (const uint64_t * __a)
+{
+ return (uint64x2_t)__builtin_neon_vld1v2di ((const __builtin_neon_di *) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vld1q_p8 (const poly8_t * __a)
+{
+ return (poly8x16_t)__builtin_neon_vld1v16qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vld1q_p16 (const poly16_t * __a)
+{
+ return (poly16x8_t)__builtin_neon_vld1v8hi ((const __builtin_neon_hi *) __a);
+}
+
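+/* vld1_lane/vld1q_lane: load a single element from __a into lane __c of
+   __b, leaving the other lanes unchanged.  */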
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vld1_lane_s8 (const int8_t * __a, int8x8_t __b, const int __c)
+{
+ return (int8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, __b, __c);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vld1_lane_s16 (const int16_t * __a, int16x4_t __b, const int __c)
+{
+ return (int16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, __b, __c);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vld1_lane_s32 (const int32_t * __a, int32x2_t __b, const int __c)
+{
+ return (int32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, __b, __c);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vld1_lane_f32 (const float32_t * __a, float32x2_t __b, const int __c)
+{
+ return (float32x2_t)__builtin_neon_vld1_lanev2sf (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vld1_lane_u8 (const uint8_t * __a, uint8x8_t __b, const int __c)
+{
+ return (uint8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vld1_lane_u16 (const uint16_t * __a, uint16x4_t __b, const int __c)
+{
+ return (uint16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vld1_lane_u32 (const uint32_t * __a, uint32x2_t __b, const int __c)
+{
+ return (uint32x2_t)__builtin_neon_vld1_lanev2si ((const __builtin_neon_si *) __a, (int32x2_t) __b, __c);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vld1_lane_p8 (const poly8_t * __a, poly8x8_t __b, const int __c)
+{
+ return (poly8x8_t)__builtin_neon_vld1_lanev8qi ((const __builtin_neon_qi *) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vld1_lane_p16 (const poly16_t * __a, poly16x4_t __b, const int __c)
+{
+ return (poly16x4_t)__builtin_neon_vld1_lanev4hi ((const __builtin_neon_hi *) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vld1_lane_s64 (const int64_t * __a, int64x1_t __b, const int __c)
+{
+ return (int64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, __b, __c);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vld1_lane_u64 (const uint64_t * __a, uint64x1_t __b, const int __c)
+{
+ return (uint64x1_t)__builtin_neon_vld1_lanedi ((const __builtin_neon_di *) __a, (int64x1_t) __b, __c);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vld1q_lane_s8 (const int8_t * __a, int8x16_t __b, const int __c)
+{
+ return (int8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, __b, __c);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_s16 (const int16_t * __a, int16x8_t __b, const int __c)
+{
+ return (int16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, __b, __c);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vld1q_lane_s32 (const int32_t * __a, int32x4_t __b, const int __c)
+{
+ return (int32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, __b, __c);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vld1q_lane_f32 (const float32_t * __a, float32x4_t __b, const int __c)
+{
+ return (float32x4_t)__builtin_neon_vld1_lanev4sf (__a, __b, __c);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vld1q_lane_u8 (const uint8_t * __a, uint8x16_t __b, const int __c)
+{
+ return (uint8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_u16 (const uint16_t * __a, uint16x8_t __b, const int __c)
+{
+ return (uint16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vld1q_lane_u32 (const uint32_t * __a, uint32x4_t __b, const int __c)
+{
+ return (uint32x4_t)__builtin_neon_vld1_lanev4si ((const __builtin_neon_si *) __a, (int32x4_t) __b, __c);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vld1q_lane_p8 (const poly8_t * __a, poly8x16_t __b, const int __c)
+{
+ return (poly8x16_t)__builtin_neon_vld1_lanev16qi ((const __builtin_neon_qi *) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vld1q_lane_p16 (const poly16_t * __a, poly16x8_t __b, const int __c)
+{
+ return (poly16x8_t)__builtin_neon_vld1_lanev8hi ((const __builtin_neon_hi *) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vld1q_lane_s64 (const int64_t * __a, int64x2_t __b, const int __c)
+{
+ return (int64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, __b, __c);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vld1q_lane_u64 (const uint64_t * __a, uint64x2_t __b, const int __c)
+{
+ return (uint64x2_t)__builtin_neon_vld1_lanev2di ((const __builtin_neon_di *) __a, (int64x2_t) __b, __c);
+}
+
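+/* vld1_dup/vld1q_dup: load one element from __a and replicate it to
+   every lane of the result.  */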
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vld1_dup_s8 (const int8_t * __a)
+{
+ return (int8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vld1_dup_s16 (const int16_t * __a)
+{
+ return (int16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vld1_dup_s32 (const int32_t * __a)
+{
+ return (int32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vld1_dup_f32 (const float32_t * __a)
+{
+ return (float32x2_t)__builtin_neon_vld1_dupv2sf (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vld1_dup_u8 (const uint8_t * __a)
+{
+ return (uint8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vld1_dup_u16 (const uint16_t * __a)
+{
+ return (uint16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vld1_dup_u32 (const uint32_t * __a)
+{
+ return (uint32x2_t)__builtin_neon_vld1_dupv2si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vld1_dup_p8 (const poly8_t * __a)
+{
+ return (poly8x8_t)__builtin_neon_vld1_dupv8qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vld1_dup_p16 (const poly16_t * __a)
+{
+ return (poly16x4_t)__builtin_neon_vld1_dupv4hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vld1_dup_s64 (const int64_t * __a)
+{
+ return (int64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vld1_dup_u64 (const uint64_t * __a)
+{
+ return (uint64x1_t)__builtin_neon_vld1_dupdi ((const __builtin_neon_di *) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vld1q_dup_s8 (const int8_t * __a)
+{
+ return (int8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vld1q_dup_s16 (const int16_t * __a)
+{
+ return (int16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vld1q_dup_s32 (const int32_t * __a)
+{
+ return (int32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vld1q_dup_f32 (const float32_t * __a)
+{
+ return (float32x4_t)__builtin_neon_vld1_dupv4sf (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vld1q_dup_u8 (const uint8_t * __a)
+{
+ return (uint8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vld1q_dup_u16 (const uint16_t * __a)
+{
+ return (uint16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vld1q_dup_u32 (const uint32_t * __a)
+{
+ return (uint32x4_t)__builtin_neon_vld1_dupv4si ((const __builtin_neon_si *) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vld1q_dup_p8 (const poly8_t * __a)
+{
+ return (poly8x16_t)__builtin_neon_vld1_dupv16qi ((const __builtin_neon_qi *) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vld1q_dup_p16 (const poly16_t * __a)
+{
+ return (poly16x8_t)__builtin_neon_vld1_dupv8hi ((const __builtin_neon_hi *) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vld1q_dup_s64 (const int64_t * __a)
+{
+ return (int64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vld1q_dup_u64 (const uint64_t * __a)
+{
+ return (uint64x2_t)__builtin_neon_vld1_dupv2di ((const __builtin_neon_di *) __a);
+}
+
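+/* vst1/vst1q: store the elements of __b contiguously to memory at __a.  */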
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s8 (int8_t * __a, int8x8_t __b)
+{
+ __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s16 (int16_t * __a, int16x4_t __b)
+{
+ __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s32 (int32_t * __a, int32x2_t __b)
+{
+ __builtin_neon_vst1v2si ((__builtin_neon_si *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_s64 (int64_t * __a, int64x1_t __b)
+{
+ __builtin_neon_vst1di ((__builtin_neon_di *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_f32 (float32_t * __a, float32x2_t __b)
+{
+ __builtin_neon_vst1v2sf (__a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u8 (uint8_t * __a, uint8x8_t __b)
+{
+ __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u16 (uint16_t * __a, uint16x4_t __b)
+{
+ __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u32 (uint32_t * __a, uint32x2_t __b)
+{
+ __builtin_neon_vst1v2si ((__builtin_neon_si *) __a, (int32x2_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_u64 (uint64_t * __a, uint64x1_t __b)
+{
+ __builtin_neon_vst1di ((__builtin_neon_di *) __a, (int64x1_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_p8 (poly8_t * __a, poly8x8_t __b)
+{
+ __builtin_neon_vst1v8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_p16 (poly16_t * __a, poly16x4_t __b)
+{
+ __builtin_neon_vst1v4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s8 (int8_t * __a, int8x16_t __b)
+{
+ __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s16 (int16_t * __a, int16x8_t __b)
+{
+ __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s32 (int32_t * __a, int32x4_t __b)
+{
+ __builtin_neon_vst1v4si ((__builtin_neon_si *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_s64 (int64_t * __a, int64x2_t __b)
+{
+ __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_f32 (float32_t * __a, float32x4_t __b)
+{
+ __builtin_neon_vst1v4sf (__a, __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u8 (uint8_t * __a, uint8x16_t __b)
+{
+ __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u16 (uint16_t * __a, uint16x8_t __b)
+{
+ __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u32 (uint32_t * __a, uint32x4_t __b)
+{
+ __builtin_neon_vst1v4si ((__builtin_neon_si *) __a, (int32x4_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_u64 (uint64_t * __a, uint64x2_t __b)
+{
+ __builtin_neon_vst1v2di ((__builtin_neon_di *) __a, (int64x2_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_p8 (poly8_t * __a, poly8x16_t __b)
+{
+ __builtin_neon_vst1v16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_p16 (poly16_t * __a, poly16x8_t __b)
+{
+ __builtin_neon_vst1v8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b);
+}
+
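+/* vst1_lane/vst1q_lane: store lane __c of __b to memory at __a.  */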
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_s8 (int8_t * __a, int8x8_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_s16 (int16_t * __a, int16x4_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_s32 (int32_t * __a, int32x2_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev2si ((__builtin_neon_si *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_f32 (float32_t * __a, float32x2_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev2sf (__a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_u8 (uint8_t * __a, uint8x8_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_u16 (uint16_t * __a, uint16x4_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_u32 (uint32_t * __a, uint32x2_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev2si ((__builtin_neon_si *) __a, (int32x2_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_p8 (poly8_t * __a, poly8x8_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev8qi ((__builtin_neon_qi *) __a, (int8x8_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_p16 (poly16_t * __a, poly16x4_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev4hi ((__builtin_neon_hi *) __a, (int16x4_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_s64 (int64_t * __a, int64x1_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1_lane_u64 (uint64_t * __a, uint64x1_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanedi ((__builtin_neon_di *) __a, (int64x1_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_s8 (int8_t * __a, int8x16_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_s16 (int16_t * __a, int16x8_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_s32 (int32_t * __a, int32x4_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev4si ((__builtin_neon_si *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_f32 (float32_t * __a, float32x4_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev4sf (__a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_u8 (uint8_t * __a, uint8x16_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_u16 (uint16_t * __a, uint16x8_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_u32 (uint32_t * __a, uint32x4_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev4si ((__builtin_neon_si *) __a, (int32x4_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_p8 (poly8_t * __a, poly8x16_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev16qi ((__builtin_neon_qi *) __a, (int8x16_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_p16 (poly16_t * __a, poly16x8_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev8hi ((__builtin_neon_hi *) __a, (int16x8_t) __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_s64 (int64_t * __a, int64x2_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, __b, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst1q_lane_u64 (uint64_t * __a, uint64x2_t __b, const int __c)
+{
+ __builtin_neon_vst1_lanev2di ((__builtin_neon_di *) __a, (int64x2_t) __b, __c);
+}
+
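+/* vld2/vld2q: load interleaved 2-element structures from memory,
+   de-interleaving them into val[0] and val[1].  */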
+__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+vld2_s8 (const int8_t * __a)
+{
+ union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+vld2_s16 (const int16_t * __a)
+{
+ union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vld2_s32 (const int32_t * __a)
+{
+ union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v2si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vld2_f32 (const float32_t * __a)
+{
+ union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v2sf (__a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+vld2_u8 (const uint8_t * __a)
+{
+ union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+vld2_u16 (const uint16_t * __a)
+{
+ union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vld2_u32 (const uint32_t * __a)
+{
+ union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v2si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+vld2_p8 (const poly8_t * __a)
+{
+ union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+vld2_p16 (const poly16_t * __a)
+{
+ union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
+vld2_s64 (const int64_t * __a)
+{
+ union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2di ((const __builtin_neon_di *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
+vld2_u64 (const uint64_t * __a)
+{
+ union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2di ((const __builtin_neon_di *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int8x16x2_t __attribute__ ((__always_inline__))
+vld2q_s8 (const int8_t * __a)
+{
+ union { int8x16x2_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v16qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
+vld2q_s16 (const int16_t * __a)
+{
+ union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v8hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
+vld2q_s32 (const int32_t * __a)
+{
+ union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v4si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
+vld2q_f32 (const float32_t * __a)
+{
+ union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v4sf (__a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint8x16x2_t __attribute__ ((__always_inline__))
+vld2q_u8 (const uint8_t * __a)
+{
+ union { uint8x16x2_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v16qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
+vld2q_u16 (const uint16_t * __a)
+{
+ union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v8hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
+vld2q_u32 (const uint32_t * __a)
+{
+ union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v4si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly8x16x2_t __attribute__ ((__always_inline__))
+vld2q_p8 (const poly8_t * __a)
+{
+ union { poly8x16x2_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v16qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
+vld2q_p16 (const poly16_t * __a)
+{
+ union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld2v8hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+vld2_lane_s8 (const int8_t * __a, int8x8x2_t __b, const int __c)
+{
+ union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+vld2_lane_s16 (const int16_t * __a, int16x4x2_t __b, const int __c)
+{
+ union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vld2_lane_s32 (const int32_t * __a, int32x2x2_t __b, const int __c)
+{
+ union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vld2_lane_f32 (const float32_t * __a, float32x2x2_t __b, const int __c)
+{
+ union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_lanev2sf (__a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+vld2_lane_u8 (const uint8_t * __a, uint8x8x2_t __b, const int __c)
+{
+ union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+vld2_lane_u16 (const uint16_t * __a, uint16x4x2_t __b, const int __c)
+{
+ union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vld2_lane_u32 (const uint32_t * __a, uint32x2x2_t __b, const int __c)
+{
+ union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+vld2_lane_p8 (const poly8_t * __a, poly8x8x2_t __b, const int __c)
+{
+ union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+vld2_lane_p16 (const poly16_t * __a, poly16x4x2_t __b, const int __c)
+{
+ union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline int16x8x2_t __attribute__ ((__always_inline__))
+vld2q_lane_s16 (const int16_t * __a, int16x8x2_t __b, const int __c)
+{
+ union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ union { int16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline int32x4x2_t __attribute__ ((__always_inline__))
+vld2q_lane_s32 (const int32_t * __a, int32x4x2_t __b, const int __c)
+{
+ union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ union { int32x4x2_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline float32x4x2_t __attribute__ ((__always_inline__))
+vld2q_lane_f32 (const float32_t * __a, float32x4x2_t __b, const int __c)
+{
+ union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ union { float32x4x2_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_lanev4sf (__a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint16x8x2_t __attribute__ ((__always_inline__))
+vld2q_lane_u16 (const uint16_t * __a, uint16x8x2_t __b, const int __c)
+{
+ union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ union { uint16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint32x4x2_t __attribute__ ((__always_inline__))
+vld2q_lane_u32 (const uint32_t * __a, uint32x4x2_t __b, const int __c)
+{
+ union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ union { uint32x4x2_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly16x8x2_t __attribute__ ((__always_inline__))
+vld2q_lane_p16 (const poly16_t * __a, poly16x8x2_t __b, const int __c)
+{
+ union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ union { poly16x8x2_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline int8x8x2_t __attribute__ ((__always_inline__))
+vld2_dup_s8 (const int8_t * __a)
+{
+ union { int8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_dupv8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int16x4x2_t __attribute__ ((__always_inline__))
+vld2_dup_s16 (const int16_t * __a)
+{
+ union { int16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_dupv4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
+vld2_dup_s32 (const int32_t * __a)
+{
+ union { int32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_dupv2si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline float32x2x2_t __attribute__ ((__always_inline__))
+vld2_dup_f32 (const float32_t * __a)
+{
+ union { float32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_dupv2sf (__a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x2_t __attribute__ ((__always_inline__))
+vld2_dup_u8 (const uint8_t * __a)
+{
+ union { uint8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_dupv8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x2_t __attribute__ ((__always_inline__))
+vld2_dup_u16 (const uint16_t * __a)
+{
+ union { uint16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_dupv4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x2_t __attribute__ ((__always_inline__))
+vld2_dup_u32 (const uint32_t * __a)
+{
+ union { uint32x2x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_dupv2si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x2_t __attribute__ ((__always_inline__))
+vld2_dup_p8 (const poly8_t * __a)
+{
+ union { poly8x8x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_dupv8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x2_t __attribute__ ((__always_inline__))
+vld2_dup_p16 (const poly16_t * __a)
+{
+ union { poly16x4x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_dupv4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int64x1x2_t __attribute__ ((__always_inline__))
+vld2_dup_s64 (const int64_t * __a)
+{
+ union { int64x1x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_dupdi ((const __builtin_neon_di *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint64x1x2_t __attribute__ ((__always_inline__))
+vld2_dup_u64 (const uint64_t * __a)
+{
+ union { uint64x1x2_t __i; __builtin_neon_ti __o; } __rv;
+ __rv.__o = __builtin_neon_vld2_dupdi ((const __builtin_neon_di *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_s8 (int8_t * __a, int8x8x2_t __b)
+{
+ union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_s16 (int16_t * __a, int16x4x2_t __b)
+{
+ union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_s32 (int32_t * __a, int32x2x2_t __b)
+{
+ union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v2si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_f32 (float32_t * __a, float32x2x2_t __b)
+{
+ union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v2sf (__a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_u8 (uint8_t * __a, uint8x8x2_t __b)
+{
+ union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_u16 (uint16_t * __a, uint16x4x2_t __b)
+{
+ union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_u32 (uint32_t * __a, uint32x2x2_t __b)
+{
+ union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v2si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_p8 (poly8_t * __a, poly8x8x2_t __b)
+{
+ union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_p16 (poly16_t * __a, poly16x4x2_t __b)
+{
+ union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_s64 (int64_t * __a, int64x1x2_t __b)
+{
+ union { int64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_u64 (uint64_t * __a, uint64x1x2_t __b)
+{
+ union { uint64x1x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_s8 (int8_t * __a, int8x16x2_t __b)
+{
+ union { int8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_s16 (int16_t * __a, int16x8x2_t __b)
+{
+ union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_s32 (int32_t * __a, int32x4x2_t __b)
+{
+ union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v4si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_f32 (float32_t * __a, float32x4x2_t __b)
+{
+ union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v4sf (__a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_u8 (uint8_t * __a, uint8x16x2_t __b)
+{
+ union { uint8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_u16 (uint16_t * __a, uint16x8x2_t __b)
+{
+ union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_u32 (uint32_t * __a, uint32x4x2_t __b)
+{
+ union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v4si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_p8 (poly8_t * __a, poly8x16x2_t __b)
+{
+ union { poly8x16x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_p16 (poly16_t * __a, poly16x8x2_t __b)
+{
+ union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_s8 (int8_t * __a, int8x8x2_t __b, const int __c)
+{
+ union { int8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_s16 (int16_t * __a, int16x4x2_t __b, const int __c)
+{
+ union { int16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_s32 (int32_t * __a, int32x2x2_t __b, const int __c)
+{
+ union { int32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_f32 (float32_t * __a, float32x2x2_t __b, const int __c)
+{
+ union { float32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev2sf (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_u8 (uint8_t * __a, uint8x8x2_t __b, const int __c)
+{
+ union { uint8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_u16 (uint16_t * __a, uint16x4x2_t __b, const int __c)
+{
+ union { uint16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_u32 (uint32_t * __a, uint32x2x2_t __b, const int __c)
+{
+ union { uint32x2x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_p8 (poly8_t * __a, poly8x8x2_t __b, const int __c)
+{
+ union { poly8x8x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2_lane_p16 (poly16_t * __a, poly16x4x2_t __b, const int __c)
+{
+ union { poly16x4x2_t __i; __builtin_neon_ti __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_lane_s16 (int16_t * __a, int16x8x2_t __b, const int __c)
+{
+ union { int16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_lane_s32 (int32_t * __a, int32x4x2_t __b, const int __c)
+{
+ union { int32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_lane_f32 (float32_t * __a, float32x4x2_t __b, const int __c)
+{
+ union { float32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev4sf (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_lane_u16 (uint16_t * __a, uint16x8x2_t __b, const int __c)
+{
+ union { uint16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_lane_u32 (uint32_t * __a, uint32x4x2_t __b, const int __c)
+{
+ union { uint32x4x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst2q_lane_p16 (poly16_t * __a, poly16x8x2_t __b, const int __c)
+{
+ union { poly16x8x2_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst2_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
+vld3_s8 (const int8_t * __a)
+{
+ union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
+vld3_s16 (const int16_t * __a)
+{
+ union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
+vld3_s32 (const int32_t * __a)
+{
+ union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v2si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
+vld3_f32 (const float32_t * __a)
+{
+ union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v2sf (__a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
+vld3_u8 (const uint8_t * __a)
+{
+ union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
+vld3_u16 (const uint16_t * __a)
+{
+ union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
+vld3_u32 (const uint32_t * __a)
+{
+ union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v2si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
+vld3_p8 (const poly8_t * __a)
+{
+ union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
+vld3_p16 (const poly16_t * __a)
+{
+ union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
+vld3_s64 (const int64_t * __a)
+{
+ union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3di ((const __builtin_neon_di *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
+vld3_u64 (const uint64_t * __a)
+{
+ union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3di ((const __builtin_neon_di *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int8x16x3_t __attribute__ ((__always_inline__))
+vld3q_s8 (const int8_t * __a)
+{
+ union { int8x16x3_t __i; __builtin_neon_ci __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v16qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
+vld3q_s16 (const int16_t * __a)
+{
+ union { int16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v8hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
+vld3q_s32 (const int32_t * __a)
+{
+ union { int32x4x3_t __i; __builtin_neon_ci __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v4si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
+vld3q_f32 (const float32_t * __a)
+{
+ union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v4sf (__a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint8x16x3_t __attribute__ ((__always_inline__))
+vld3q_u8 (const uint8_t * __a)
+{
+ union { uint8x16x3_t __i; __builtin_neon_ci __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v16qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
+vld3q_u16 (const uint16_t * __a)
+{
+ union { uint16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v8hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
+vld3q_u32 (const uint32_t * __a)
+{
+ union { uint32x4x3_t __i; __builtin_neon_ci __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v4si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly8x16x3_t __attribute__ ((__always_inline__))
+vld3q_p8 (const poly8_t * __a)
+{
+ union { poly8x16x3_t __i; __builtin_neon_ci __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v16qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
+vld3q_p16 (const poly16_t * __a)
+{
+ union { poly16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+ __rv.__o = __builtin_neon_vld3v8hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
+vld3_lane_s8 (const int8_t * __a, int8x8x3_t __b, const int __c)
+{
+ union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
+vld3_lane_s16 (const int16_t * __a, int16x4x3_t __b, const int __c)
+{
+ union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
+vld3_lane_s32 (const int32_t * __a, int32x2x3_t __b, const int __c)
+{
+ union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
+vld3_lane_f32 (const float32_t * __a, float32x2x3_t __b, const int __c)
+{
+ union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_lanev2sf (__a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
+vld3_lane_u8 (const uint8_t * __a, uint8x8x3_t __b, const int __c)
+{
+ union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
+vld3_lane_u16 (const uint16_t * __a, uint16x4x3_t __b, const int __c)
+{
+ union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
+vld3_lane_u32 (const uint32_t * __a, uint32x2x3_t __b, const int __c)
+{
+ union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
+vld3_lane_p8 (const poly8_t * __a, poly8x8x3_t __b, const int __c)
+{
+ union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
+vld3_lane_p16 (const poly16_t * __a, poly16x4x3_t __b, const int __c)
+{
+ union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline int16x8x3_t __attribute__ ((__always_inline__))
+vld3q_lane_s16 (const int16_t * __a, int16x8x3_t __b, const int __c)
+{
+ union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ union { int16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline int32x4x3_t __attribute__ ((__always_inline__))
+vld3q_lane_s32 (const int32_t * __a, int32x4x3_t __b, const int __c)
+{
+ union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ union { int32x4x3_t __i; __builtin_neon_ci __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline float32x4x3_t __attribute__ ((__always_inline__))
+vld3q_lane_f32 (const float32_t * __a, float32x4x3_t __b, const int __c)
+{
+ union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ union { float32x4x3_t __i; __builtin_neon_ci __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_lanev4sf (__a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint16x8x3_t __attribute__ ((__always_inline__))
+vld3q_lane_u16 (const uint16_t * __a, uint16x8x3_t __b, const int __c)
+{
+ union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ union { uint16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint32x4x3_t __attribute__ ((__always_inline__))
+vld3q_lane_u32 (const uint32_t * __a, uint32x4x3_t __b, const int __c)
+{
+ union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ union { uint32x4x3_t __i; __builtin_neon_ci __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly16x8x3_t __attribute__ ((__always_inline__))
+vld3q_lane_p16 (const poly16_t * __a, poly16x8x3_t __b, const int __c)
+{
+ union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ union { poly16x8x3_t __i; __builtin_neon_ci __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline int8x8x3_t __attribute__ ((__always_inline__))
+vld3_dup_s8 (const int8_t * __a)
+{
+ union { int8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_dupv8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int16x4x3_t __attribute__ ((__always_inline__))
+vld3_dup_s16 (const int16_t * __a)
+{
+ union { int16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_dupv4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int32x2x3_t __attribute__ ((__always_inline__))
+vld3_dup_s32 (const int32_t * __a)
+{
+ union { int32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_dupv2si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline float32x2x3_t __attribute__ ((__always_inline__))
+vld3_dup_f32 (const float32_t * __a)
+{
+ union { float32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_dupv2sf (__a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x3_t __attribute__ ((__always_inline__))
+vld3_dup_u8 (const uint8_t * __a)
+{
+ union { uint8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_dupv8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x3_t __attribute__ ((__always_inline__))
+vld3_dup_u16 (const uint16_t * __a)
+{
+ union { uint16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_dupv4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x3_t __attribute__ ((__always_inline__))
+vld3_dup_u32 (const uint32_t * __a)
+{
+ union { uint32x2x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_dupv2si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x3_t __attribute__ ((__always_inline__))
+vld3_dup_p8 (const poly8_t * __a)
+{
+ union { poly8x8x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_dupv8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x3_t __attribute__ ((__always_inline__))
+vld3_dup_p16 (const poly16_t * __a)
+{
+ union { poly16x4x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_dupv4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int64x1x3_t __attribute__ ((__always_inline__))
+vld3_dup_s64 (const int64_t * __a)
+{
+ union { int64x1x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_dupdi ((const __builtin_neon_di *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint64x1x3_t __attribute__ ((__always_inline__))
+vld3_dup_u64 (const uint64_t * __a)
+{
+ union { uint64x1x3_t __i; __builtin_neon_ei __o; } __rv;
+ __rv.__o = __builtin_neon_vld3_dupdi ((const __builtin_neon_di *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_s8 (int8_t * __a, int8x8x3_t __b)
+{
+ union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_s16 (int16_t * __a, int16x4x3_t __b)
+{
+ union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_s32 (int32_t * __a, int32x2x3_t __b)
+{
+ union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v2si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_f32 (float32_t * __a, float32x2x3_t __b)
+{
+ union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v2sf (__a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_u8 (uint8_t * __a, uint8x8x3_t __b)
+{
+ union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_u16 (uint16_t * __a, uint16x4x3_t __b)
+{
+ union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_u32 (uint32_t * __a, uint32x2x3_t __b)
+{
+ union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v2si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_p8 (poly8_t * __a, poly8x8x3_t __b)
+{
+ union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_p16 (poly16_t * __a, poly16x4x3_t __b)
+{
+ union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_s64 (int64_t * __a, int64x1x3_t __b)
+{
+ union { int64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_u64 (uint64_t * __a, uint64x1x3_t __b)
+{
+ union { uint64x1x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_s8 (int8_t * __a, int8x16x3_t __b)
+{
+ union { int8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_s16 (int16_t * __a, int16x8x3_t __b)
+{
+ union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_s32 (int32_t * __a, int32x4x3_t __b)
+{
+ union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v4si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_f32 (float32_t * __a, float32x4x3_t __b)
+{
+ union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v4sf (__a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_u8 (uint8_t * __a, uint8x16x3_t __b)
+{
+ union { uint8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_u16 (uint16_t * __a, uint16x8x3_t __b)
+{
+ union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_u32 (uint32_t * __a, uint32x4x3_t __b)
+{
+ union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v4si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_p8 (poly8_t * __a, poly8x16x3_t __b)
+{
+ union { poly8x16x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_p16 (poly16_t * __a, poly16x8x3_t __b)
+{
+ union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_s8 (int8_t * __a, int8x8x3_t __b, const int __c)
+{
+ union { int8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_s16 (int16_t * __a, int16x4x3_t __b, const int __c)
+{
+ union { int16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_s32 (int32_t * __a, int32x2x3_t __b, const int __c)
+{
+ union { int32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_f32 (float32_t * __a, float32x2x3_t __b, const int __c)
+{
+ union { float32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev2sf (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_u8 (uint8_t * __a, uint8x8x3_t __b, const int __c)
+{
+ union { uint8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_u16 (uint16_t * __a, uint16x4x3_t __b, const int __c)
+{
+ union { uint16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_u32 (uint32_t * __a, uint32x2x3_t __b, const int __c)
+{
+ union { uint32x2x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_p8 (poly8_t * __a, poly8x8x3_t __b, const int __c)
+{
+ union { poly8x8x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3_lane_p16 (poly16_t * __a, poly16x4x3_t __b, const int __c)
+{
+ union { poly16x4x3_t __i; __builtin_neon_ei __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_lane_s16 (int16_t * __a, int16x8x3_t __b, const int __c)
+{
+ union { int16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_lane_s32 (int32_t * __a, int32x4x3_t __b, const int __c)
+{
+ union { int32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_lane_f32 (float32_t * __a, float32x4x3_t __b, const int __c)
+{
+ union { float32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev4sf (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_lane_u16 (uint16_t * __a, uint16x8x3_t __b, const int __c)
+{
+ union { uint16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_lane_u32 (uint32_t * __a, uint32x4x3_t __b, const int __c)
+{
+ union { uint32x4x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst3q_lane_p16 (poly16_t * __a, poly16x8x3_t __b, const int __c)
+{
+ union { poly16x8x3_t __i; __builtin_neon_ci __o; } __bu = { __b };
+ __builtin_neon_vst3_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
+vld4_s8 (const int8_t * __a)
+{
+ union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
+vld4_s16 (const int16_t * __a)
+{
+ union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
+vld4_s32 (const int32_t * __a)
+{
+ union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v2si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
+vld4_f32 (const float32_t * __a)
+{
+ union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v2sf (__a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
+vld4_u8 (const uint8_t * __a)
+{
+ union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
+vld4_u16 (const uint16_t * __a)
+{
+ union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
+vld4_u32 (const uint32_t * __a)
+{
+ union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v2si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
+vld4_p8 (const poly8_t * __a)
+{
+ union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
+vld4_p16 (const poly16_t * __a)
+{
+ union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
+vld4_s64 (const int64_t * __a)
+{
+ union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4di ((const __builtin_neon_di *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
+vld4_u64 (const uint64_t * __a)
+{
+ union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4di ((const __builtin_neon_di *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int8x16x4_t __attribute__ ((__always_inline__))
+vld4q_s8 (const int8_t * __a)
+{
+ union { int8x16x4_t __i; __builtin_neon_xi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v16qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
+vld4q_s16 (const int16_t * __a)
+{
+ union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v8hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
+vld4q_s32 (const int32_t * __a)
+{
+ union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v4si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
+vld4q_f32 (const float32_t * __a)
+{
+ union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v4sf (__a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint8x16x4_t __attribute__ ((__always_inline__))
+vld4q_u8 (const uint8_t * __a)
+{
+ union { uint8x16x4_t __i; __builtin_neon_xi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v16qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
+vld4q_u16 (const uint16_t * __a)
+{
+ union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v8hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
+vld4q_u32 (const uint32_t * __a)
+{
+ union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v4si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly8x16x4_t __attribute__ ((__always_inline__))
+vld4q_p8 (const poly8_t * __a)
+{
+ union { poly8x16x4_t __i; __builtin_neon_xi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v16qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
+vld4q_p16 (const poly16_t * __a)
+{
+ union { poly16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4v8hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
+vld4_lane_s8 (const int8_t * __a, int8x8x4_t __b, const int __c)
+{
+ union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
+vld4_lane_s16 (const int16_t * __a, int16x4x4_t __b, const int __c)
+{
+ union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
+vld4_lane_s32 (const int32_t * __a, int32x2x4_t __b, const int __c)
+{
+ union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
+vld4_lane_f32 (const float32_t * __a, float32x2x4_t __b, const int __c)
+{
+ union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_lanev2sf (__a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
+vld4_lane_u8 (const uint8_t * __a, uint8x8x4_t __b, const int __c)
+{
+ union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
+vld4_lane_u16 (const uint16_t * __a, uint16x4x4_t __b, const int __c)
+{
+ union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
+vld4_lane_u32 (const uint32_t * __a, uint32x2x4_t __b, const int __c)
+{
+ union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_lanev2si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
+vld4_lane_p8 (const poly8_t * __a, poly8x8x4_t __b, const int __c)
+{
+ union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_lanev8qi ((const __builtin_neon_qi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
+vld4_lane_p16 (const poly16_t * __a, poly16x4x4_t __b, const int __c)
+{
+ union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_lanev4hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline int16x8x4_t __attribute__ ((__always_inline__))
+vld4q_lane_s16 (const int16_t * __a, int16x8x4_t __b, const int __c)
+{
+ union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ union { int16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline int32x4x4_t __attribute__ ((__always_inline__))
+vld4q_lane_s32 (const int32_t * __a, int32x4x4_t __b, const int __c)
+{
+ union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ union { int32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline float32x4x4_t __attribute__ ((__always_inline__))
+vld4q_lane_f32 (const float32_t * __a, float32x4x4_t __b, const int __c)
+{
+ union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ union { float32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_lanev4sf (__a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint16x8x4_t __attribute__ ((__always_inline__))
+vld4q_lane_u16 (const uint16_t * __a, uint16x8x4_t __b, const int __c)
+{
+ union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ union { uint16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint32x4x4_t __attribute__ ((__always_inline__))
+vld4q_lane_u32 (const uint32_t * __a, uint32x4x4_t __b, const int __c)
+{
+ union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ union { uint32x4x4_t __i; __builtin_neon_xi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_lanev4si ((const __builtin_neon_si *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly16x8x4_t __attribute__ ((__always_inline__))
+vld4q_lane_p16 (const poly16_t * __a, poly16x8x4_t __b, const int __c)
+{
+ union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ union { poly16x8x4_t __i; __builtin_neon_xi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_lanev8hi ((const __builtin_neon_hi *) __a, __bu.__o, __c);
+ return __rv.__i;
+}
+
+__extension__ static __inline int8x8x4_t __attribute__ ((__always_inline__))
+vld4_dup_s8 (const int8_t * __a)
+{
+ union { int8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_dupv8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int16x4x4_t __attribute__ ((__always_inline__))
+vld4_dup_s16 (const int16_t * __a)
+{
+ union { int16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_dupv4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int32x2x4_t __attribute__ ((__always_inline__))
+vld4_dup_s32 (const int32_t * __a)
+{
+ union { int32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_dupv2si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline float32x2x4_t __attribute__ ((__always_inline__))
+vld4_dup_f32 (const float32_t * __a)
+{
+ union { float32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_dupv2sf (__a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint8x8x4_t __attribute__ ((__always_inline__))
+vld4_dup_u8 (const uint8_t * __a)
+{
+ union { uint8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_dupv8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint16x4x4_t __attribute__ ((__always_inline__))
+vld4_dup_u16 (const uint16_t * __a)
+{
+ union { uint16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_dupv4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint32x2x4_t __attribute__ ((__always_inline__))
+vld4_dup_u32 (const uint32_t * __a)
+{
+ union { uint32x2x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_dupv2si ((const __builtin_neon_si *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly8x8x4_t __attribute__ ((__always_inline__))
+vld4_dup_p8 (const poly8_t * __a)
+{
+ union { poly8x8x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_dupv8qi ((const __builtin_neon_qi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline poly16x4x4_t __attribute__ ((__always_inline__))
+vld4_dup_p16 (const poly16_t * __a)
+{
+ union { poly16x4x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_dupv4hi ((const __builtin_neon_hi *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline int64x1x4_t __attribute__ ((__always_inline__))
+vld4_dup_s64 (const int64_t * __a)
+{
+ union { int64x1x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_dupdi ((const __builtin_neon_di *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline uint64x1x4_t __attribute__ ((__always_inline__))
+vld4_dup_u64 (const uint64_t * __a)
+{
+ union { uint64x1x4_t __i; __builtin_neon_oi __o; } __rv;
+ __rv.__o = __builtin_neon_vld4_dupdi ((const __builtin_neon_di *) __a);
+ return __rv.__i;
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_s8 (int8_t * __a, int8x8x4_t __b)
+{
+ union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_s16 (int16_t * __a, int16x4x4_t __b)
+{
+ union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_s32 (int32_t * __a, int32x2x4_t __b)
+{
+ union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v2si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_f32 (float32_t * __a, float32x2x4_t __b)
+{
+ union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v2sf (__a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_u8 (uint8_t * __a, uint8x8x4_t __b)
+{
+ union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_u16 (uint16_t * __a, uint16x4x4_t __b)
+{
+ union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_u32 (uint32_t * __a, uint32x2x4_t __b)
+{
+ union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v2si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_p8 (poly8_t * __a, poly8x8x4_t __b)
+{
+ union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v8qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_p16 (poly16_t * __a, poly16x4x4_t __b)
+{
+ union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4v4hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_s64 (int64_t * __a, int64x1x4_t __b)
+{
+ union { int64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_u64 (uint64_t * __a, uint64x1x4_t __b)
+{
+ union { uint64x1x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4di ((__builtin_neon_di *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_s8 (int8_t * __a, int8x16x4_t __b)
+{
+ union { int8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_s16 (int16_t * __a, int16x8x4_t __b)
+{
+ union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_s32 (int32_t * __a, int32x4x4_t __b)
+{
+ union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v4si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_f32 (float32_t * __a, float32x4x4_t __b)
+{
+ union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v4sf (__a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_u8 (uint8_t * __a, uint8x16x4_t __b)
+{
+ union { uint8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_u16 (uint16_t * __a, uint16x8x4_t __b)
+{
+ union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_u32 (uint32_t * __a, uint32x4x4_t __b)
+{
+ union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v4si ((__builtin_neon_si *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_p8 (poly8_t * __a, poly8x16x4_t __b)
+{
+ union { poly8x16x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v16qi ((__builtin_neon_qi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_p16 (poly16_t * __a, poly16x8x4_t __b)
+{
+ union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4v8hi ((__builtin_neon_hi *) __a, __bu.__o);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_s8 (int8_t * __a, int8x8x4_t __b, const int __c)
+{
+ union { int8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_s16 (int16_t * __a, int16x4x4_t __b, const int __c)
+{
+ union { int16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_s32 (int32_t * __a, int32x2x4_t __b, const int __c)
+{
+ union { int32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_f32 (float32_t * __a, float32x2x4_t __b, const int __c)
+{
+ union { float32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev2sf (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_u8 (uint8_t * __a, uint8x8x4_t __b, const int __c)
+{
+ union { uint8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_u16 (uint16_t * __a, uint16x4x4_t __b, const int __c)
+{
+ union { uint16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_u32 (uint32_t * __a, uint32x2x4_t __b, const int __c)
+{
+ union { uint32x2x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev2si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_p8 (poly8_t * __a, poly8x8x4_t __b, const int __c)
+{
+ union { poly8x8x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev8qi ((__builtin_neon_qi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4_lane_p16 (poly16_t * __a, poly16x4x4_t __b, const int __c)
+{
+ union { poly16x4x4_t __i; __builtin_neon_oi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev4hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_lane_s16 (int16_t * __a, int16x8x4_t __b, const int __c)
+{
+ union { int16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_lane_s32 (int32_t * __a, int32x4x4_t __b, const int __c)
+{
+ union { int32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_lane_f32 (float32_t * __a, float32x4x4_t __b, const int __c)
+{
+ union { float32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev4sf (__a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_lane_u16 (uint16_t * __a, uint16x8x4_t __b, const int __c)
+{
+ union { uint16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_lane_u32 (uint32_t * __a, uint32x4x4_t __b, const int __c)
+{
+ union { uint32x4x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev4si ((__builtin_neon_si *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline void __attribute__ ((__always_inline__))
+vst4q_lane_p16 (poly16_t * __a, poly16x8x4_t __b, const int __c)
+{
+ union { poly16x8x4_t __i; __builtin_neon_xi __o; } __bu = { __b };
+ __builtin_neon_vst4_lanev8hi ((__builtin_neon_hi *) __a, __bu.__o, __c);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vand_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vandv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vand_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vandv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vand_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vandv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vand_s64 (int64x1_t __a, int64x1_t __b)
+{
+ return (int64x1_t)__builtin_neon_vanddi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vand_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vandv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vand_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vandv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vand_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vandv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vand_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+ return (uint64x1_t)__builtin_neon_vanddi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vandq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vandv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vandq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vandv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vandq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vandv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vandq_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vandv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vandq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vandv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vandq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vandv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vandq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vandv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vandq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vandv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vorr_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vorrv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vorr_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vorrv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vorr_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vorrv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vorr_s64 (int64x1_t __a, int64x1_t __b)
+{
+ return (int64x1_t)__builtin_neon_vorrdi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vorr_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vorrv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vorr_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vorrv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vorr_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vorrv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vorr_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+ return (uint64x1_t)__builtin_neon_vorrdi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vorrq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vorrv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vorrq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vorrv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vorrq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vorrv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vorrq_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vorrv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vorrq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vorrv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vorrq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vorrv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vorrq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vorrv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vorrq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vorrv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+veor_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_veorv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+veor_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_veorv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+veor_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_veorv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+veor_s64 (int64x1_t __a, int64x1_t __b)
+{
+ return (int64x1_t)__builtin_neon_veordi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+veor_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_veorv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+veor_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_veorv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+veor_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_veorv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+veor_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+ return (uint64x1_t)__builtin_neon_veordi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+veorq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_veorv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+veorq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_veorv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+veorq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_veorv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+veorq_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_veorv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+veorq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_veorv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+veorq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_veorv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+veorq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_veorv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+veorq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_veorv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vbic_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vbicv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vbic_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vbicv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vbic_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vbicv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vbic_s64 (int64x1_t __a, int64x1_t __b)
+{
+ return (int64x1_t)__builtin_neon_vbicdi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vbic_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vbicv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vbic_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vbicv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vbic_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vbicv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vbic_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+ return (uint64x1_t)__builtin_neon_vbicdi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vbicq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vbicv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vbicq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vbicv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vbicq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vbicv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vbicq_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vbicv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vbicq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vbicv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vbicq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vbicv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vbicq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vbicv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vbicq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vbicv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vorn_s8 (int8x8_t __a, int8x8_t __b)
+{
+ return (int8x8_t)__builtin_neon_vornv8qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vorn_s16 (int16x4_t __a, int16x4_t __b)
+{
+ return (int16x4_t)__builtin_neon_vornv4hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vorn_s32 (int32x2_t __a, int32x2_t __b)
+{
+ return (int32x2_t)__builtin_neon_vornv2si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vorn_s64 (int64x1_t __a, int64x1_t __b)
+{
+ return (int64x1_t)__builtin_neon_vorndi (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vorn_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+ return (uint8x8_t)__builtin_neon_vornv8qi ((int8x8_t) __a, (int8x8_t) __b, 0);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vorn_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+ return (uint16x4_t)__builtin_neon_vornv4hi ((int16x4_t) __a, (int16x4_t) __b, 0);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vorn_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+ return (uint32x2_t)__builtin_neon_vornv2si ((int32x2_t) __a, (int32x2_t) __b, 0);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vorn_u64 (uint64x1_t __a, uint64x1_t __b)
+{
+ return (uint64x1_t)__builtin_neon_vorndi ((int64x1_t) __a, (int64x1_t) __b, 0);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vornq_s8 (int8x16_t __a, int8x16_t __b)
+{
+ return (int8x16_t)__builtin_neon_vornv16qi (__a, __b, 1);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vornq_s16 (int16x8_t __a, int16x8_t __b)
+{
+ return (int16x8_t)__builtin_neon_vornv8hi (__a, __b, 1);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vornq_s32 (int32x4_t __a, int32x4_t __b)
+{
+ return (int32x4_t)__builtin_neon_vornv4si (__a, __b, 1);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vornq_s64 (int64x2_t __a, int64x2_t __b)
+{
+ return (int64x2_t)__builtin_neon_vornv2di (__a, __b, 1);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vornq_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+ return (uint8x16_t)__builtin_neon_vornv16qi ((int8x16_t) __a, (int8x16_t) __b, 0);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vornq_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+ return (uint16x8_t)__builtin_neon_vornv8hi ((int16x8_t) __a, (int16x8_t) __b, 0);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vornq_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+ return (uint32x4_t)__builtin_neon_vornv4si ((int32x4_t) __a, (int32x4_t) __b, 0);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vornq_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+ return (uint64x2_t)__builtin_neon_vornv2di ((int64x2_t) __a, (int64x2_t) __b, 0);
+}
+
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_s8 (int8x8_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vreinterpretv8qiv8qi (__a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_s16 (int16x4_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_s32 (int32x2_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_s64 (int64x1_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vreinterpretv8qidi (__a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_f32 (float32x2_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_u8 (uint8x8_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_u16 (uint16x4_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_u32 (uint32x2_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vreinterpretv8qiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_u64 (uint64x1_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vreinterpretv8qidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vreinterpret_p8_p16 (poly16x4_t __a)
+{
+ return (poly8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_s8 (int8x16_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vreinterpretv16qiv16qi (__a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_s16 (int16x8_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_s32 (int32x4_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_s64 (int64x2_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_f32 (float32x4_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_u8 (uint8x16_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_u16 (uint16x8_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_u32 (uint32x4_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vreinterpretv16qiv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_u64 (uint64x2_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vreinterpretv16qiv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_p8_p16 (poly16x8_t __a)
+{
+ return (poly8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_s8 (int8x8_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_s16 (int16x4_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vreinterpretv4hiv4hi (__a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_s32 (int32x2_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_s64 (int64x1_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vreinterpretv4hidi (__a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_f32 (float32x2_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_u8 (uint8x8_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_u16 (uint16x4_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_u32 (uint32x2_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vreinterpretv4hiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_u64 (uint64x1_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vreinterpretv4hidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vreinterpret_p16_p8 (poly8x8_t __a)
+{
+ return (poly16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_s8 (int8x16_t __a)
+{
+ return (poly16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_s16 (int16x8_t __a)
+{
+ return (poly16x8_t)__builtin_neon_vreinterpretv8hiv8hi (__a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_s32 (int32x4_t __a)
+{
+ return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_s64 (int64x2_t __a)
+{
+ return (poly16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_f32 (float32x4_t __a)
+{
+ return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_u8 (uint8x16_t __a)
+{
+ return (poly16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_u16 (uint16x8_t __a)
+{
+ return (poly16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_u32 (uint32x4_t __a)
+{
+ return (poly16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_u64 (uint64x2_t __a)
+{
+ return (poly16x8_t)__builtin_neon_vreinterpretv8hiv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_p16_p8 (poly8x16_t __a)
+{
+ return (poly16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_s8 (int8x8_t __a)
+{
+ return (float32x2_t)__builtin_neon_vreinterpretv2sfv8qi (__a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_s16 (int16x4_t __a)
+{
+ return (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi (__a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_s32 (int32x2_t __a)
+{
+ return (float32x2_t)__builtin_neon_vreinterpretv2sfv2si (__a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_s64 (int64x1_t __a)
+{
+ return (float32x2_t)__builtin_neon_vreinterpretv2sfdi (__a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_u8 (uint8x8_t __a)
+{
+ return (float32x2_t)__builtin_neon_vreinterpretv2sfv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_u16 (uint16x4_t __a)
+{
+ return (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_u32 (uint32x2_t __a)
+{
+ return (float32x2_t)__builtin_neon_vreinterpretv2sfv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_u64 (uint64x1_t __a)
+{
+ return (float32x2_t)__builtin_neon_vreinterpretv2sfdi ((int64x1_t) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_p8 (poly8x8_t __a)
+{
+ return (float32x2_t)__builtin_neon_vreinterpretv2sfv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vreinterpret_f32_p16 (poly16x4_t __a)
+{
+ return (float32x2_t)__builtin_neon_vreinterpretv2sfv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_s8 (int8x16_t __a)
+{
+ return (float32x4_t)__builtin_neon_vreinterpretv4sfv16qi (__a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_s16 (int16x8_t __a)
+{
+ return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi (__a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_s32 (int32x4_t __a)
+{
+ return (float32x4_t)__builtin_neon_vreinterpretv4sfv4si (__a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_s64 (int64x2_t __a)
+{
+ return (float32x4_t)__builtin_neon_vreinterpretv4sfv2di (__a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_u8 (uint8x16_t __a)
+{
+ return (float32x4_t)__builtin_neon_vreinterpretv4sfv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_u16 (uint16x8_t __a)
+{
+ return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_u32 (uint32x4_t __a)
+{
+ return (float32x4_t)__builtin_neon_vreinterpretv4sfv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_u64 (uint64x2_t __a)
+{
+ return (float32x4_t)__builtin_neon_vreinterpretv4sfv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_p8 (poly8x16_t __a)
+{
+ return (float32x4_t)__builtin_neon_vreinterpretv4sfv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_f32_p16 (poly16x8_t __a)
+{
+ return (float32x4_t)__builtin_neon_vreinterpretv4sfv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_s8 (int8x8_t __a)
+{
+ return (int64x1_t)__builtin_neon_vreinterpretdiv8qi (__a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_s16 (int16x4_t __a)
+{
+ return (int64x1_t)__builtin_neon_vreinterpretdiv4hi (__a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_s32 (int32x2_t __a)
+{
+ return (int64x1_t)__builtin_neon_vreinterpretdiv2si (__a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_f32 (float32x2_t __a)
+{
+ return (int64x1_t)__builtin_neon_vreinterpretdiv2sf (__a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_u8 (uint8x8_t __a)
+{
+ return (int64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_u16 (uint16x4_t __a)
+{
+ return (int64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_u32 (uint32x2_t __a)
+{
+ return (int64x1_t)__builtin_neon_vreinterpretdiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_u64 (uint64x1_t __a)
+{
+ return (int64x1_t)__builtin_neon_vreinterpretdidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_p8 (poly8x8_t __a)
+{
+ return (int64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int64x1_t __attribute__ ((__always_inline__))
+vreinterpret_s64_p16 (poly16x4_t __a)
+{
+ return (int64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_s8 (int8x16_t __a)
+{
+ return (int64x2_t)__builtin_neon_vreinterpretv2div16qi (__a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_s16 (int16x8_t __a)
+{
+ return (int64x2_t)__builtin_neon_vreinterpretv2div8hi (__a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_s32 (int32x4_t __a)
+{
+ return (int64x2_t)__builtin_neon_vreinterpretv2div4si (__a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_f32 (float32x4_t __a)
+{
+ return (int64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_u8 (uint8x16_t __a)
+{
+ return (int64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_u16 (uint16x8_t __a)
+{
+ return (int64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_u32 (uint32x4_t __a)
+{
+ return (int64x2_t)__builtin_neon_vreinterpretv2div4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_u64 (uint64x2_t __a)
+{
+ return (int64x2_t)__builtin_neon_vreinterpretv2div2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_p8 (poly8x16_t __a)
+{
+ return (int64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_s64_p16 (poly16x8_t __a)
+{
+ return (int64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_s8 (int8x8_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vreinterpretdiv8qi (__a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_s16 (int16x4_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vreinterpretdiv4hi (__a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_s32 (int32x2_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vreinterpretdiv2si (__a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_s64 (int64x1_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vreinterpretdidi (__a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_f32 (float32x2_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vreinterpretdiv2sf (__a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_u8 (uint8x8_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_u16 (uint16x4_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_u32 (uint32x2_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vreinterpretdiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_p8 (poly8x8_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vreinterpretdiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline uint64x1_t __attribute__ ((__always_inline__))
+vreinterpret_u64_p16 (poly16x4_t __a)
+{
+ return (uint64x1_t)__builtin_neon_vreinterpretdiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_s8 (int8x16_t __a)
+{
+ return (uint64x2_t)__builtin_neon_vreinterpretv2div16qi (__a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_s16 (int16x8_t __a)
+{
+ return (uint64x2_t)__builtin_neon_vreinterpretv2div8hi (__a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_s32 (int32x4_t __a)
+{
+ return (uint64x2_t)__builtin_neon_vreinterpretv2div4si (__a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_s64 (int64x2_t __a)
+{
+ return (uint64x2_t)__builtin_neon_vreinterpretv2div2di (__a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_f32 (float32x4_t __a)
+{
+ return (uint64x2_t)__builtin_neon_vreinterpretv2div4sf (__a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_u8 (uint8x16_t __a)
+{
+ return (uint64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_u16 (uint16x8_t __a)
+{
+ return (uint64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_u32 (uint32x4_t __a)
+{
+ return (uint64x2_t)__builtin_neon_vreinterpretv2div4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_p8 (poly8x16_t __a)
+{
+ return (uint64x2_t)__builtin_neon_vreinterpretv2div16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vreinterpretq_u64_p16 (poly16x8_t __a)
+{
+ return (uint64x2_t)__builtin_neon_vreinterpretv2div8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_s16 (int16x4_t __a)
+{
+ return (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_s32 (int32x2_t __a)
+{
+ return (int8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_s64 (int64x1_t __a)
+{
+ return (int8x8_t)__builtin_neon_vreinterpretv8qidi (__a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_f32 (float32x2_t __a)
+{
+ return (int8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_u8 (uint8x8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_u16 (uint16x4_t __a)
+{
+ return (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_u32 (uint32x2_t __a)
+{
+ return (int8x8_t)__builtin_neon_vreinterpretv8qiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_u64 (uint64x1_t __a)
+{
+ return (int8x8_t)__builtin_neon_vreinterpretv8qidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_p8 (poly8x8_t __a)
+{
+ return (int8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vreinterpret_s8_p16 (poly16x4_t __a)
+{
+ return (int8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_s16 (int16x8_t __a)
+{
+ return (int8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_s32 (int32x4_t __a)
+{
+ return (int8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_s64 (int64x2_t __a)
+{
+ return (int8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_f32 (float32x4_t __a)
+{
+ return (int8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_u8 (uint8x16_t __a)
+{
+ return (int8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_u16 (uint16x8_t __a)
+{
+ return (int8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_u32 (uint32x4_t __a)
+{
+ return (int8x16_t)__builtin_neon_vreinterpretv16qiv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_u64 (uint64x2_t __a)
+{
+ return (int8x16_t)__builtin_neon_vreinterpretv16qiv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_p8 (poly8x16_t __a)
+{
+ return (int8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_s8_p16 (poly16x8_t __a)
+{
+ return (int8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_s8 (int8x8_t __a)
+{
+ return (int16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_s32 (int32x2_t __a)
+{
+ return (int16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_s64 (int64x1_t __a)
+{
+ return (int16x4_t)__builtin_neon_vreinterpretv4hidi (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_f32 (float32x2_t __a)
+{
+ return (int16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_u8 (uint8x8_t __a)
+{
+ return (int16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_u16 (uint16x4_t __a)
+{
+ return (int16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_u32 (uint32x2_t __a)
+{
+ return (int16x4_t)__builtin_neon_vreinterpretv4hiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_u64 (uint64x1_t __a)
+{
+ return (int16x4_t)__builtin_neon_vreinterpretv4hidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_p8 (poly8x8_t __a)
+{
+ return (int16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vreinterpret_s16_p16 (poly16x4_t __a)
+{
+ return (int16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_s8 (int8x16_t __a)
+{
+ return (int16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_s32 (int32x4_t __a)
+{
+ return (int16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_s64 (int64x2_t __a)
+{
+ return (int16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_f32 (float32x4_t __a)
+{
+ return (int16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_u8 (uint8x16_t __a)
+{
+ return (int16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_u16 (uint16x8_t __a)
+{
+ return (int16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_u32 (uint32x4_t __a)
+{
+ return (int16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_u64 (uint64x2_t __a)
+{
+ return (int16x8_t)__builtin_neon_vreinterpretv8hiv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_p8 (poly8x16_t __a)
+{
+ return (int16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_s16_p16 (poly16x8_t __a)
+{
+ return (int16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_s8 (int8x8_t __a)
+{
+ return (int32x2_t)__builtin_neon_vreinterpretv2siv8qi (__a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_s16 (int16x4_t __a)
+{
+ return (int32x2_t)__builtin_neon_vreinterpretv2siv4hi (__a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_s64 (int64x1_t __a)
+{
+ return (int32x2_t)__builtin_neon_vreinterpretv2sidi (__a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_f32 (float32x2_t __a)
+{
+ return (int32x2_t)__builtin_neon_vreinterpretv2siv2sf (__a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_u8 (uint8x8_t __a)
+{
+ return (int32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_u16 (uint16x4_t __a)
+{
+ return (int32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_u32 (uint32x2_t __a)
+{
+ return (int32x2_t)__builtin_neon_vreinterpretv2siv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_u64 (uint64x1_t __a)
+{
+ return (int32x2_t)__builtin_neon_vreinterpretv2sidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_p8 (poly8x8_t __a)
+{
+ return (int32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vreinterpret_s32_p16 (poly16x4_t __a)
+{
+ return (int32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_s8 (int8x16_t __a)
+{
+ return (int32x4_t)__builtin_neon_vreinterpretv4siv16qi (__a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_s16 (int16x8_t __a)
+{
+ return (int32x4_t)__builtin_neon_vreinterpretv4siv8hi (__a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_s64 (int64x2_t __a)
+{
+ return (int32x4_t)__builtin_neon_vreinterpretv4siv2di (__a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_f32 (float32x4_t __a)
+{
+ return (int32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_u8 (uint8x16_t __a)
+{
+ return (int32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_u16 (uint16x8_t __a)
+{
+ return (int32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_u32 (uint32x4_t __a)
+{
+ return (int32x4_t)__builtin_neon_vreinterpretv4siv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_u64 (uint64x2_t __a)
+{
+ return (int32x4_t)__builtin_neon_vreinterpretv4siv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_p8 (poly8x16_t __a)
+{
+ return (int32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_s32_p16 (poly16x8_t __a)
+{
+ return (int32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_s8 (int8x8_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vreinterpretv8qiv8qi (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_s16 (int16x4_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_s32 (int32x2_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vreinterpretv8qiv2si (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_s64 (int64x1_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vreinterpretv8qidi (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_f32 (float32x2_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vreinterpretv8qiv2sf (__a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_u16 (uint16x4_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_u32 (uint32x2_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vreinterpretv8qiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_u64 (uint64x1_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vreinterpretv8qidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_p8 (poly8x8_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vreinterpretv8qiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vreinterpret_u8_p16 (poly16x4_t __a)
+{
+ return (uint8x8_t)__builtin_neon_vreinterpretv8qiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_s8 (int8x16_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vreinterpretv16qiv16qi (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_s16 (int16x8_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vreinterpretv16qiv8hi (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_s32 (int32x4_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4si (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_s64 (int64x2_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vreinterpretv16qiv2di (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_f32 (float32x4_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4sf (__a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_u16 (uint16x8_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_u32 (uint32x4_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vreinterpretv16qiv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_u64 (uint64x2_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vreinterpretv16qiv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_p8 (poly8x16_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vreinterpretv16qiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vreinterpretq_u8_p16 (poly16x8_t __a)
+{
+ return (uint8x16_t)__builtin_neon_vreinterpretv16qiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_s8 (int8x8_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vreinterpretv4hiv8qi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_s16 (int16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vreinterpretv4hiv4hi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_s32 (int32x2_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vreinterpretv4hiv2si (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_s64 (int64x1_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vreinterpretv4hidi (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_f32 (float32x2_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vreinterpretv4hiv2sf (__a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_u8 (uint8x8_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_u32 (uint32x2_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vreinterpretv4hiv2si ((int32x2_t) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_u64 (uint64x1_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vreinterpretv4hidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_p8 (poly8x8_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vreinterpretv4hiv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vreinterpret_u16_p16 (poly16x4_t __a)
+{
+ return (uint16x4_t)__builtin_neon_vreinterpretv4hiv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_s8 (int8x16_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vreinterpretv8hiv16qi (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_s16 (int16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vreinterpretv8hiv8hi (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_s32 (int32x4_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4si (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_s64 (int64x2_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vreinterpretv8hiv2di (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_f32 (float32x4_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4sf (__a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_u8 (uint8x16_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_u32 (uint32x4_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vreinterpretv8hiv4si ((int32x4_t) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_u64 (uint64x2_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vreinterpretv8hiv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_p8 (poly8x16_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vreinterpretv8hiv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vreinterpretq_u16_p16 (poly16x8_t __a)
+{
+ return (uint16x8_t)__builtin_neon_vreinterpretv8hiv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_s8 (int8x8_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vreinterpretv2siv8qi (__a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_s16 (int16x4_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi (__a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_s32 (int32x2_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vreinterpretv2siv2si (__a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_s64 (int64x1_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vreinterpretv2sidi (__a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_f32 (float32x2_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vreinterpretv2siv2sf (__a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_u8 (uint8x8_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_u16 (uint16x4_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_u64 (uint64x1_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vreinterpretv2sidi ((int64x1_t) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_p8 (poly8x8_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vreinterpretv2siv8qi ((int8x8_t) __a);
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vreinterpret_u32_p16 (poly16x4_t __a)
+{
+ return (uint32x2_t)__builtin_neon_vreinterpretv2siv4hi ((int16x4_t) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_s8 (int8x16_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vreinterpretv4siv16qi (__a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_s16 (int16x8_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vreinterpretv4siv8hi (__a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_s32 (int32x4_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vreinterpretv4siv4si (__a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_s64 (int64x2_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vreinterpretv4siv2di (__a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_f32 (float32x4_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vreinterpretv4siv4sf (__a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_u8 (uint8x16_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_u16 (uint16x8_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_u64 (uint64x2_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vreinterpretv4siv2di ((int64x2_t) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_p8 (poly8x16_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vreinterpretv4siv16qi ((int8x16_t) __a);
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vreinterpretq_u32_p16 (poly16x8_t __a)
+{
+ return (uint32x4_t)__builtin_neon_vreinterpretv4siv8hi ((int16x8_t) __a);
+}
+
+#ifdef __cplusplus
+}
+#endif
+#endif
+#endif
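
(The vreinterpret* intrinsics added above are pure type relabels: they reuse the
same register bits under a different vector type and emit no instructions. A
minimal usage sketch, assuming NEON support is enabled for the compilation and
using only intrinsics defined in this generated header, might look like:

  #include <arm_neon.h>

  /* Branch-free absolute value on four floats: view the float bits as
     unsigned integers, clear the sign bits, and view them as floats again.
     The reinterpret calls change only the static type, not the data.  */
  static inline float32x4_t
  vabsq_via_reinterpret (float32x4_t __x)
  {
    uint32x4_t __bits = vreinterpretq_u32_f32 (__x);
    __bits = vandq_u32 (__bits, vdupq_n_u32 (0x7fffffffu));
    return vreinterpretq_f32_u32 (__bits);
  }

The helper name above is illustrative only and is not part of the header.)
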
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/bpabi.S
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/bpabi.S?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/bpabi.S (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/bpabi.S Wed Jul 22 15:36:27 2009
@@ -44,7 +44,10 @@
ARM_FUNC_START aeabi_lcmp
subs ip, xxl, yyl
sbcs ip, xxh, yyh
- subeqs ip, xxl, yyl
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it eq
+ COND(sub,s,eq) ip, xxl, yyl
+/* APPLE LOCAL end v7 support. Merge from mainline */
mov r0, ip
RET
FUNC_END aeabi_lcmp
@@ -55,12 +58,20 @@
ARM_FUNC_START aeabi_ulcmp
cmp xxh, yyh
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it lo
movlo r0, #-1
+ do_it hi
movhi r0, #1
+ do_it ne
RETc(ne)
cmp xxl, yyl
+ do_it lo
movlo r0, #-1
+ do_it hi
movhi r0, #1
+ do_it eq
+/* APPLE LOCAL end v7 support. Merge from mainline */
moveq r0, #0
RET
FUNC_END aeabi_ulcmp
@@ -71,11 +82,18 @@
ARM_FUNC_START aeabi_ldivmod
sub sp, sp, #8
- stmfd sp!, {sp, lr}
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+ mov ip, sp
+ push {ip, lr}
+#else
+ do_push {sp, lr}
+#endif
bl SYM(__gnu_ldivmod_helper) __PLT__
ldr lr, [sp, #4]
add sp, sp, #8
- ldmfd sp!, {r2, r3}
+ do_pop {r2, r3}
+/* APPLE LOCAL end v7 support. Merge from mainline */
RET
#endif /* L_aeabi_ldivmod */
@@ -84,11 +102,18 @@
ARM_FUNC_START aeabi_uldivmod
sub sp, sp, #8
- stmfd sp!, {sp, lr}
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+ mov ip, sp
+ push {ip, lr}
+#else
+ do_push {sp, lr}
+#endif
bl SYM(__gnu_uldivmod_helper) __PLT__
ldr lr, [sp, #4]
add sp, sp, #8
- ldmfd sp!, {r2, r3}
+ do_pop {r2, r3}
+/* APPLE LOCAL end v7 support. Merge from mainline */
RET
#endif /* L_aeabi_divmod */
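
(For reference: the do_it/COND changes above wrap each conditional move in an
IT block because Thumb-2, unlike ARM state, requires one before conditionally
executed instructions. A rough C model of the unsigned 64-bit comparison
helper, based on the usual AEABI semantics and not taken from the patch, is:

  /* Returns -1, 0 or 1, mirroring the movlo/movhi/moveq sequence in
     __aeabi_ulcmp.  Hypothetical helper name; the real routine is the
     assembly in bpabi.S.  */
  int
  ulcmp_model (unsigned long long __a, unsigned long long __b)
  {
    if (__a < __b)
      return -1;
    if (__a > __b)
      return 1;
    return 0;
  }
)
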
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/cirrus.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/cirrus.md?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/cirrus.md (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/cirrus.md Wed Jul 22 15:36:27 2009
@@ -34,7 +34,8 @@
[(set (match_operand:DI 0 "cirrus_fp_register" "=v")
(plus:DI (match_operand:DI 1 "cirrus_fp_register" "v")
(match_operand:DI 2 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfadd64%?\\t%V0, %V1, %V2"
[(set_attr "type" "mav_farith")
(set_attr "cirrus" "normal")]
@@ -44,7 +45,8 @@
[(set (match_operand:SI 0 "cirrus_fp_register" "=v")
(plus:SI (match_operand:SI 1 "cirrus_fp_register" "v")
(match_operand:SI 2 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
"cfadd32%?\\t%V0, %V1, %V2"
[(set_attr "type" "mav_farith")
(set_attr "cirrus" "normal")]
@@ -54,7 +56,8 @@
[(set (match_operand:SF 0 "cirrus_fp_register" "=v")
(plus:SF (match_operand:SF 1 "cirrus_fp_register" "v")
(match_operand:SF 2 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfadds%?\\t%V0, %V1, %V2"
[(set_attr "type" "mav_farith")
(set_attr "cirrus" "normal")]
@@ -64,7 +67,8 @@
[(set (match_operand:DF 0 "cirrus_fp_register" "=v")
(plus:DF (match_operand:DF 1 "cirrus_fp_register" "v")
(match_operand:DF 2 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfaddd%?\\t%V0, %V1, %V2"
[(set_attr "type" "mav_farith")
(set_attr "cirrus" "normal")]
@@ -74,7 +78,8 @@
[(set (match_operand:DI 0 "cirrus_fp_register" "=v")
(minus:DI (match_operand:DI 1 "cirrus_fp_register" "v")
(match_operand:DI 2 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfsub64%?\\t%V0, %V1, %V2"
[(set_attr "type" "mav_farith")
(set_attr "cirrus" "normal")]
@@ -84,7 +89,8 @@
[(set (match_operand:SI 0 "cirrus_fp_register" "=v")
(minus:SI (match_operand:SI 1 "cirrus_fp_register" "v")
(match_operand:SI 2 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
"cfsub32%?\\t%V0, %V1, %V2"
[(set_attr "type" "mav_farith")
(set_attr "cirrus" "normal")]
@@ -94,7 +100,8 @@
[(set (match_operand:SF 0 "cirrus_fp_register" "=v")
(minus:SF (match_operand:SF 1 "cirrus_fp_register" "v")
(match_operand:SF 2 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfsubs%?\\t%V0, %V1, %V2"
[(set_attr "type" "mav_farith")
(set_attr "cirrus" "normal")]
@@ -104,7 +111,8 @@
[(set (match_operand:DF 0 "cirrus_fp_register" "=v")
(minus:DF (match_operand:DF 1 "cirrus_fp_register" "v")
(match_operand:DF 2 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfsubd%?\\t%V0, %V1, %V2"
[(set_attr "type" "mav_farith")
(set_attr "cirrus" "normal")]
@@ -114,7 +122,8 @@
[(set (match_operand:SI 0 "cirrus_fp_register" "=v")
(mult:SI (match_operand:SI 2 "cirrus_fp_register" "v")
(match_operand:SI 1 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
"cfmul32%?\\t%V0, %V1, %V2"
[(set_attr "type" "mav_farith")
(set_attr "cirrus" "normal")]
@@ -125,7 +134,8 @@
[(set (match_operand:DI 0 "cirrus_fp_register" "=v")
(mult:DI (match_operand:DI 2 "cirrus_fp_register" "v")
(match_operand:DI 1 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfmul64%?\\t%V0, %V1, %V2"
[(set_attr "type" "mav_dmult")
(set_attr "cirrus" "normal")]
@@ -137,7 +147,8 @@
(mult:SI (match_operand:SI 1 "cirrus_fp_register" "v")
(match_operand:SI 2 "cirrus_fp_register" "v"))
(match_operand:SI 3 "cirrus_fp_register" "0")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
"cfmac32%?\\t%V0, %V1, %V2"
[(set_attr "type" "mav_farith")
(set_attr "cirrus" "normal")]
@@ -150,7 +161,8 @@
(match_operand:SI 1 "cirrus_fp_register" "0")
(mult:SI (match_operand:SI 2 "cirrus_fp_register" "v")
(match_operand:SI 3 "cirrus_fp_register" "v"))))]
- "0 && TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "0 && TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfmsc32%?\\t%V0, %V2, %V3"
[(set_attr "type" "mav_farith")
(set_attr "cirrus" "normal")]
@@ -160,7 +172,8 @@
[(set (match_operand:SF 0 "cirrus_fp_register" "=v")
(mult:SF (match_operand:SF 1 "cirrus_fp_register" "v")
(match_operand:SF 2 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfmuls%?\\t%V0, %V1, %V2"
[(set_attr "type" "mav_farith")
(set_attr "cirrus" "normal")]
@@ -170,7 +183,8 @@
[(set (match_operand:DF 0 "cirrus_fp_register" "=v")
(mult:DF (match_operand:DF 1 "cirrus_fp_register" "v")
(match_operand:DF 2 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfmuld%?\\t%V0, %V1, %V2"
[(set_attr "type" "mav_dmult")
(set_attr "cirrus" "normal")]
@@ -180,7 +194,8 @@
[(set (match_operand:SI 0 "cirrus_fp_register" "=v")
(ashift:SI (match_operand:SI 1 "cirrus_fp_register" "v")
(match_operand:SI 2 "cirrus_shift_const" "")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
"cfsh32%?\\t%V0, %V1, #%s2"
[(set_attr "cirrus" "normal")]
)
@@ -189,7 +204,8 @@
[(set (match_operand:SI 0 "cirrus_fp_register" "=v")
(ashiftrt:SI (match_operand:SI 1 "cirrus_fp_register" "v")
(match_operand:SI 2 "cirrus_shift_const" "")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
"cfsh32%?\\t%V0, %V1, #-%s2"
[(set_attr "cirrus" "normal")]
)
@@ -198,7 +214,8 @@
[(set (match_operand:SI 0 "cirrus_fp_register" "=v")
(ashift:SI (match_operand:SI 1 "cirrus_fp_register" "v")
(match_operand:SI 2 "register_operand" "r")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
"cfrshl32%?\\t%V1, %V0, %s2"
[(set_attr "cirrus" "normal")]
)
@@ -207,7 +224,8 @@
[(set (match_operand:DI 0 "cirrus_fp_register" "=v")
(ashift:DI (match_operand:DI 1 "cirrus_fp_register" "v")
(match_operand:SI 2 "register_operand" "r")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfrshl64%?\\t%V1, %V0, %s2"
[(set_attr "cirrus" "normal")]
)
@@ -216,7 +234,8 @@
[(set (match_operand:DI 0 "cirrus_fp_register" "=v")
(ashift:DI (match_operand:DI 1 "cirrus_fp_register" "v")
(match_operand:SI 2 "cirrus_shift_const" "")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfsh64%?\\t%V0, %V1, #%s2"
[(set_attr "cirrus" "normal")]
)
@@ -225,7 +244,8 @@
[(set (match_operand:DI 0 "cirrus_fp_register" "=v")
(ashiftrt:DI (match_operand:DI 1 "cirrus_fp_register" "v")
(match_operand:SI 2 "cirrus_shift_const" "")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfsh64%?\\t%V0, %V1, #-%s2"
[(set_attr "cirrus" "normal")]
)
@@ -233,7 +253,8 @@
(define_insn "*cirrus_absdi2"
[(set (match_operand:DI 0 "cirrus_fp_register" "=v")
(abs:DI (match_operand:DI 1 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfabs64%?\\t%V0, %V1"
[(set_attr "cirrus" "normal")]
)
@@ -243,7 +264,8 @@
[(set (match_operand:DI 0 "cirrus_fp_register" "=v")
(neg:DI (match_operand:DI 1 "cirrus_fp_register" "v")))
(clobber (reg:CC CC_REGNUM))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfneg64%?\\t%V0, %V1"
[(set_attr "cirrus" "normal")]
)
@@ -251,7 +273,8 @@
(define_insn "*cirrus_negsi2"
[(set (match_operand:SI 0 "cirrus_fp_register" "=v")
(neg:SI (match_operand:SI 1 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
"cfneg32%?\\t%V0, %V1"
[(set_attr "cirrus" "normal")]
)
@@ -259,7 +282,8 @@
(define_insn "*cirrus_negsf2"
[(set (match_operand:SF 0 "cirrus_fp_register" "=v")
(neg:SF (match_operand:SF 1 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfnegs%?\\t%V0, %V1"
[(set_attr "cirrus" "normal")]
)
@@ -267,7 +291,8 @@
(define_insn "*cirrus_negdf2"
[(set (match_operand:DF 0 "cirrus_fp_register" "=v")
(neg:DF (match_operand:DF 1 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfnegd%?\\t%V0, %V1"
[(set_attr "cirrus" "normal")]
)
@@ -277,7 +302,8 @@
[(set (match_operand:SI 0 "cirrus_fp_register" "=v")
(abs:SI (match_operand:SI 1 "cirrus_fp_register" "v")))
(clobber (reg:CC CC_REGNUM))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0"
"cfabs32%?\\t%V0, %V1"
[(set_attr "cirrus" "normal")]
)
@@ -285,7 +311,8 @@
(define_insn "*cirrus_abssf2"
[(set (match_operand:SF 0 "cirrus_fp_register" "=v")
(abs:SF (match_operand:SF 1 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfabss%?\\t%V0, %V1"
[(set_attr "cirrus" "normal")]
)
@@ -293,7 +320,8 @@
(define_insn "*cirrus_absdf2"
[(set (match_operand:DF 0 "cirrus_fp_register" "=v")
(abs:DF (match_operand:DF 1 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfabsd%?\\t%V0, %V1"
[(set_attr "cirrus" "normal")]
)
@@ -303,7 +331,8 @@
[(set (match_operand:SF 0 "cirrus_fp_register" "=v")
(float:SF (match_operand:SI 1 "s_register_operand" "r")))
(clobber (match_scratch:DF 2 "=v"))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfmv64lr%?\\t%Z2, %1\;cfcvt32s%?\\t%V0, %Y2"
[(set_attr "length" "8")
(set_attr "cirrus" "move")]
@@ -313,7 +342,8 @@
[(set (match_operand:DF 0 "cirrus_fp_register" "=v")
(float:DF (match_operand:SI 1 "s_register_operand" "r")))
(clobber (match_scratch:DF 2 "=v"))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfmv64lr%?\\t%Z2, %1\;cfcvt32d%?\\t%V0, %Y2"
[(set_attr "length" "8")
(set_attr "cirrus" "move")]
@@ -322,14 +352,16 @@
(define_insn "floatdisf2"
[(set (match_operand:SF 0 "cirrus_fp_register" "=v")
(float:SF (match_operand:DI 1 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfcvt64s%?\\t%V0, %V1"
[(set_attr "cirrus" "normal")])
(define_insn "floatdidf2"
[(set (match_operand:DF 0 "cirrus_fp_register" "=v")
(float:DF (match_operand:DI 1 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfcvt64d%?\\t%V0, %V1"
[(set_attr "cirrus" "normal")])
@@ -337,7 +369,8 @@
[(set (match_operand:SI 0 "s_register_operand" "=r")
(fix:SI (fix:SF (match_operand:SF 1 "cirrus_fp_register" "v"))))
(clobber (match_scratch:DF 2 "=v"))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cftruncs32%?\\t%Y2, %V1\;cfmvr64l%?\\t%0, %Z2"
[(set_attr "length" "8")
(set_attr "cirrus" "normal")]
@@ -347,7 +380,8 @@
[(set (match_operand:SI 0 "s_register_operand" "=r")
(fix:SI (fix:DF (match_operand:DF 1 "cirrus_fp_register" "v"))))
(clobber (match_scratch:DF 2 "=v"))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cftruncd32%?\\t%Y2, %V1\;cfmvr64l%?\\t%0, %Z2"
[(set_attr "length" "8")]
)
@@ -356,7 +390,8 @@
[(set (match_operand:SF 0 "cirrus_fp_register" "=v")
(float_truncate:SF
(match_operand:DF 1 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfcvtds%?\\t%V0, %V1"
[(set_attr "cirrus" "normal")]
)
@@ -364,7 +399,8 @@
(define_insn "*cirrus_extendsfdf2"
[(set (match_operand:DF 0 "cirrus_fp_register" "=v")
(float_extend:DF (match_operand:SF 1 "cirrus_fp_register" "v")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_MAVERICK"
"cfcvtsd%?\\t%V0, %V1"
[(set_attr "cirrus" "normal")]
)
@@ -457,3 +493,112 @@
(set_attr "cirrus" " not, not,not, not, not,normal,double,move,normal,double")]
)
+;; APPLE LOCAL begin v7 support. Merge from mainline
+(define_insn "*cirrus_thumb2_movdi"
+ [(set (match_operand:DI 0 "nonimmediate_di_operand" "=r,r,o<>,v,r,v,m,v")
+ (match_operand:DI 1 "di_operand" "rIK,mi,r,r,v,mi,v,v"))]
+ "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_MAVERICK"
+ "*
+ {
+ switch (which_alternative)
+ {
+ case 0:
+ case 1:
+ case 2:
+ return (output_move_double (operands));
+
+ case 3: return \"cfmv64lr%?\\t%V0, %Q1\;cfmv64hr%?\\t%V0, %R1\";
+ case 4: return \"cfmvr64l%?\\t%Q0, %V1\;cfmvr64h%?\\t%R0, %V1\";
+
+ case 5: return \"cfldr64%?\\t%V0, %1\";
+ case 6: return \"cfstr64%?\\t%V1, %0\";
+
+ /* Shifting by 0 will just copy %1 into %0. */
+ case 7: return \"cfsh64%?\\t%V0, %V1, #0\";
+
+ default: abort ();
+ }
+ }"
+ [(set_attr "length" " 8, 8, 8, 8, 8, 4, 4, 4")
+ (set_attr "type" " *,load2,store2, *, *, load2,store2, *")
+ (set_attr "pool_range" " *,4096, *, *, *, 1020, *, *")
+ (set_attr "neg_pool_range" " *, 0, *, *, *, 1008, *, *")
+ (set_attr "cirrus" "not, not, not,move,normal,double,double,normal")]
+)
+
+;; Cirrus SI values have been outlawed. Look in arm.h for the comment
+;; on HARD_REGNO_MODE_OK.
+
+(define_insn "*cirrus_thumb2_movsi_insn"
+ [(set (match_operand:SI 0 "general_operand" "=r,r,r,m,*v,r,*v,T,*v")
+ (match_operand:SI 1 "general_operand" "rI,K,mi,r,r,*v,T,*v,*v"))]
+ "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_MAVERICK && 0
+ && (register_operand (operands[0], SImode)
+ || register_operand (operands[1], SImode))"
+ "@
+ mov%?\\t%0, %1
+ mvn%?\\t%0, #%B1
+ ldr%?\\t%0, %1
+ str%?\\t%1, %0
+ cfmv64lr%?\\t%Z0, %1
+ cfmvr64l%?\\t%0, %Z1
+ cfldr32%?\\t%V0, %1
+ cfstr32%?\\t%V1, %0
+ cfsh32%?\\t%V0, %V1, #0"
+ [(set_attr "type" "*, *, load1,store1, *, *, load1,store1, *")
+ (set_attr "pool_range" "*, *, 4096, *, *, *, 1024, *, *")
+ (set_attr "neg_pool_range" "*, *, 0, *, *, *, 1012, *, *")
+ (set_attr "cirrus" "not,not, not, not,move,normal,normal,normal,normal")]
+)
+
+(define_insn "*thumb2_cirrus_movsf_hard_insn"
+ [(set (match_operand:SF 0 "nonimmediate_operand" "=v,v,v,r,m,r,r,m")
+ (match_operand:SF 1 "general_operand" "v,mE,r,v,v,r,mE,r"))]
+ "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_MAVERICK
+ && (GET_CODE (operands[0]) != MEM
+ || register_operand (operands[1], SFmode))"
+ "@
+ cfcpys%?\\t%V0, %V1
+ cfldrs%?\\t%V0, %1
+ cfmvsr%?\\t%V0, %1
+ cfmvrs%?\\t%0, %V1
+ cfstrs%?\\t%V1, %0
+ mov%?\\t%0, %1
+ ldr%?\\t%0, %1\\t%@ float
+ str%?\\t%1, %0\\t%@ float"
+ [(set_attr "length" " *, *, *, *, *, 4, 4, 4")
+ (set_attr "type" " *, load1, *, *,store1, *,load1,store1")
+ (set_attr "pool_range" " *, 1020, *, *, *, *,4096, *")
+ (set_attr "neg_pool_range" " *, 1008, *, *, *, *, 0, *")
+ (set_attr "cirrus" "normal,normal,move,normal,normal,not, not, not")]
+)
+
+(define_insn "*thumb2_cirrus_movdf_hard_insn"
+ [(set (match_operand:DF 0 "nonimmediate_operand" "=r,Q,r,m,r,v,v,v,r,m")
+ (match_operand:DF 1 "general_operand" "Q,r,r,r,mF,v,mF,r,v,v"))]
+ "TARGET_THUMB2
+ && TARGET_HARD_FLOAT && TARGET_MAVERICK
+ && (GET_CODE (operands[0]) != MEM
+ || register_operand (operands[1], DFmode))"
+ "*
+ {
+ switch (which_alternative)
+ {
+ case 0: return \"ldm%?ia\\t%m1, %M0\\t%@ double\";
+ case 1: return \"stm%?ia\\t%m0, %M1\\t%@ double\";
+ case 2: case 3: case 4: return output_move_double (operands);
+ case 5: return \"cfcpyd%?\\t%V0, %V1\";
+ case 6: return \"cfldrd%?\\t%V0, %1\";
+ case 7: return \"cfmvdlr\\t%V0, %Q1\;cfmvdhr%?\\t%V0, %R1\";
+ case 8: return \"cfmvrdl%?\\t%Q0, %V1\;cfmvrdh%?\\t%R0, %V1\";
+ case 9: return \"cfstrd%?\\t%V1, %0\";
+ default: abort ();
+ }
+ }"
+ [(set_attr "type" "load1,store2, *,store2,load1, *, load1, *, *,store2")
+ (set_attr "length" " 4, 4, 8, 8, 8, 4, 4, 8, 8, 4")
+ (set_attr "pool_range" " *, *, *, *,4092, *, 1020, *, *, *")
+ (set_attr "neg_pool_range" " *, *, *, *, 0, *, 1008, *, *, *")
+ (set_attr "cirrus" " not, not,not, not, not,normal,double,move,normal,double")]
+)
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/coff.h
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/coff.h?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/coff.h (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/coff.h Wed Jul 22 15:36:27 2009
@@ -59,9 +59,12 @@
/* Define this macro if jump tables (for `tablejump' insns) should be
output in the text section, along with the assembler instructions.
Otherwise, the readonly data section is used. */
-/* We put ARM jump tables in the text section, because it makes the code
- more efficient, but for Thumb it's better to put them out of band. */
-#define JUMP_TABLES_IN_TEXT_SECTION (TARGET_ARM)
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+/* We put ARM and Thumb-2 jump tables in the text section, because it makes
+ the code more efficient, but for Thumb-1 it's better to put them out of
+ band. */
+#define JUMP_TABLES_IN_TEXT_SECTION (TARGET_32BIT)
+/* APPLE LOCAL end v7 support. Merge from mainline */
#undef READONLY_DATA_SECTION_ASM_OP
#define READONLY_DATA_SECTION_ASM_OP "\t.section .rdata"
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/constraints.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/constraints.md?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/constraints.md (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/constraints.md Wed Jul 22 15:36:27 2009
@@ -19,33 +19,49 @@
;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.
+;; APPLE LOCAL begin v7 support. Merge from mainline
;; The following register constraints have been used:
-;; - in ARM state: f, v, w, y, z
+;; - in ARM/Thumb-2 state: f, t, v, w, x, y, z
;; - in Thumb state: h, k, b
;; - in both states: l, c
;; In ARM state, 'l' is an alias for 'r'
;; The following normal constraints have been used:
-;; in ARM state: G, H, I, J, K, L, M
-;; in Thumb state: I, J, K, L, M, N, O
+;; in ARM/Thumb-2 state: G, H, I, J, K, L, M
+;; in Thumb-1 state: I, J, K, L, M, N, O
+;; APPLE LOCAL end v7 support. Merge from mainline
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
;; The following multi-letter normal constraints have been used:
;; APPLE LOCAL 5831562 long long constants
-;; in ARM state: Da, Db, Dc, Dd
+;; in ARM/Thumb-2 state: Da, Db, Dc, Dd, Dn, Dl, DL, Dv
;; The following memory constraints have been used:
-;; in ARM state: Q, Uq, Uv, Uy
+;; in ARM/Thumb-2 state: Q, Ut, Uv, Uy, Un, Us
+;; in ARM state: Uq
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
(define_register_constraint "f" "TARGET_ARM ? FPA_REGS : NO_REGS"
"Legacy FPA registers @code{f0}- at code{f7}.")
+;; APPLE LOCAL begin v7 support. Merge from mainline
+(define_register_constraint "t" "TARGET_32BIT ? VFP_LO_REGS : NO_REGS"
+ "The VFP registers @code{s0}- at code{s31}.")
+
+;; APPLE LOCAL end v7 support. Merge from mainline
(define_register_constraint "v" "TARGET_ARM ? CIRRUS_REGS : NO_REGS"
"The Cirrus Maverick co-processor registers.")
-(define_register_constraint "w" "TARGET_ARM ? VFP_REGS : NO_REGS"
- "The VFP registers @code{s0}- at code{s31}.")
+;; APPLE LOCAL begin v7 support. Merge from mainline
+(define_register_constraint "w"
+ "TARGET_32BIT ? (TARGET_VFP3 ? VFP_REGS : VFP_LO_REGS) : NO_REGS"
+ "The VFP registers @code{d0}- at code{d15}, or @code{d0}- at code{d31} for VFPv3.")
+
+(define_register_constraint "x" "TARGET_32BIT ? VFP_D0_D7_REGS : NO_REGS"
+ "The VFP registers @code{d0}- at code{d7}.")
+;; APPLE LOCAL end v7 support. Merge from mainline
(define_register_constraint "y" "TARGET_REALLY_IWMMXT ? IWMMXT_REGS : NO_REGS"
"The Intel iWMMX co-processor registers.")
@@ -70,109 +86,176 @@
(define_register_constraint "c" "CC_REG"
"@internal The condition code register.")
+;; APPLE LOCAL begin v7 support. Merge from mainline
(define_constraint "I"
- "In ARM state a constant that can be used as an immediate value in a Data
- Processing instruction. In Thumb state a constant in the range 0-255."
+ "In ARM/Thumb-2 state a constant that can be used as an immediate value in a
+ Data Processing instruction. In Thumb-1 state a constant in the range
+ 0-255."
(and (match_code "const_int")
- (match_test "TARGET_ARM ? const_ok_for_arm (ival)
+ (match_test "TARGET_32BIT ? const_ok_for_arm (ival)
: ival >= 0 && ival <= 255")))
(define_constraint "J"
- "In ARM state a constant in the range @minus{}4095-4095. In Thumb state
- a constant in the range @minus{}255-@minus{}1."
+ "In ARM/Thumb-2 state a constant in the range @minus{}4095-4095. In Thumb-1
+ state a constant in the range @minus{}255-@minus{}1."
(and (match_code "const_int")
- (match_test "TARGET_ARM ? (ival >= -4095 && ival <= 4095)
+ (match_test "TARGET_32BIT ? (ival >= -4095 && ival <= 4095)
: (ival >= -255 && ival <= -1)")))
(define_constraint "K"
- "In ARM state a constant that satisfies the @code{I} constraint if inverted.
- In Thumb state a constant that satisfies the @code{I} constraint multiplied
- by any power of 2."
+ "In ARM/Thumb-2 state a constant that satisfies the @code{I} constraint if
+ inverted. In Thumb-1 state a constant that satisfies the @code{I}
+ constraint multiplied by any power of 2."
(and (match_code "const_int")
- (match_test "TARGET_ARM ? const_ok_for_arm (~ival)
+ (match_test "TARGET_32BIT ? const_ok_for_arm (~ival)
: thumb_shiftable_const (ival)")))
(define_constraint "L"
- "In ARM state a constant that satisfies the @code{I} constraint if negated.
- In Thumb state a constant in the range @minus{}7-7."
+ "In ARM/Thumb-2 state a constant that satisfies the @code{I} constraint if
+ negated. In Thumb-1 state a constant in the range @minus{}7-7."
(and (match_code "const_int")
- (match_test "TARGET_ARM ? const_ok_for_arm (-ival)
+ (match_test "TARGET_32BIT ? const_ok_for_arm (-ival)
: (ival >= -7 && ival <= 7)")))
;; The ARM state version is internal...
-;; @internal In ARM state a constant in the range 0-32 or any power of 2.
+;; @internal In ARM/Thumb-2 state a constant in the range 0-32 or any
+;; power of 2.
(define_constraint "M"
- "In Thumb state a constant that is a multiple of 4 in the range 0-1020."
+ "In Thumb-1 state a constant that is a multiple of 4 in the range 0-1020."
(and (match_code "const_int")
- (match_test "TARGET_ARM ? ((ival >= 0 && ival <= 32)
+ (match_test "TARGET_32BIT ? ((ival >= 0 && ival <= 32)
|| ((ival & (ival - 1)) == 0))
: ((ival >= 0 && ival <= 1020) && ((ival & 3) == 0))")))
(define_constraint "N"
- "In Thumb state a constant in the range 0-31."
+ "In ARM/Thumb-2 state a constant suitable for a MOVW instruction.
+ In Thumb-1 state a constant in the range 0-31."
(and (match_code "const_int")
- (match_test "TARGET_THUMB && ival >= 0 && ival <= 31")))
+ (match_test "TARGET_32BIT ? arm_arch_thumb2 && ((ival & 0xffff0000) == 0)
+ : (ival >= 0 && ival <= 31)")))
(define_constraint "O"
- "In Thumb state a constant that is a multiple of 4 in the range
+ "In Thumb-1 state a constant that is a multiple of 4 in the range
@minus{}508-508."
(and (match_code "const_int")
- (match_test "TARGET_THUMB && ival >= -508 && ival <= 508
+ (match_test "TARGET_THUMB1 && ival >= -508 && ival <= 508
&& ((ival & 3) == 0)")))
(define_constraint "G"
- "In ARM state a valid FPA immediate constant."
+ "In ARM/Thumb-2 state a valid FPA immediate constant."
(and (match_code "const_double")
- (match_test "TARGET_ARM && arm_const_double_rtx (op)")))
+ (match_test "TARGET_32BIT && arm_const_double_rtx (op)")))
(define_constraint "H"
- "In ARM state a valid FPA immediate constant when negated."
+ "In ARM/Thumb-2 state a valid FPA immediate constant when negated."
(and (match_code "const_double")
- (match_test "TARGET_ARM && neg_const_double_rtx_ok_for_fpa (op)")))
+ (match_test "TARGET_32BIT && neg_const_double_rtx_ok_for_fpa (op)")))
(define_constraint "Da"
"@internal
- In ARM state a const_int, const_double or const_vector that can
+ In ARM/Thumb-2 state a const_int, const_double or const_vector that can
be generated with two Data Processing insns."
(and (match_code "const_double,const_int,const_vector")
- (match_test "TARGET_ARM && arm_const_double_inline_cost (op) == 2")))
+ (match_test "TARGET_32BIT && arm_const_double_inline_cost (op) == 2")))
(define_constraint "Db"
"@internal
- In ARM state a const_int, const_double or const_vector that can
+ In ARM/Thumb-2 state a const_int, const_double or const_vector that can
be generated with three Data Processing insns."
(and (match_code "const_double,const_int,const_vector")
- (match_test "TARGET_ARM && arm_const_double_inline_cost (op) == 3")))
+ (match_test "TARGET_32BIT && arm_const_double_inline_cost (op) == 3")))
(define_constraint "Dc"
"@internal
- In ARM state a const_int, const_double or const_vector that can
+ In ARM/Thumb-2 state a const_int, const_double or const_vector that can
be generated with four Data Processing insns. This pattern is disabled
if optimizing for space or when we have load-delay slots to fill."
(and (match_code "const_double,const_int,const_vector")
- (match_test "TARGET_ARM && arm_const_double_inline_cost (op) == 4
+ (match_test "TARGET_32BIT && arm_const_double_inline_cost (op) == 4
&& !(optimize_size || arm_ld_sched)")))
-
;; APPLE LOCAL begin 5831562 long long constants
(define_constraint "Dd"
"@internal
In ARM state a const_int, const_double or const_vector that can
used directly in arithmetic instructions as two 32-bit immediates."
(and (match_code "const_double,const_int,const_vector")
- (match_test "TARGET_ARM && const64_ok_for_arm_immediate (op)")))
+ (match_test "TARGET_32BIT && const64_ok_for_arm_immediate (op)")))
;; APPLE LOCAL end 5831562 long long constants
+;; APPLE LOCAL end v7 support. Merge from mainline
+
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+(define_constraint "Dn"
+ "@internal
+ In ARM/Thumb-2 state a const_vector which can be loaded with a Neon vmov
+ immediate instruction."
+ (and (match_code "const_vector")
+ (match_test "TARGET_32BIT
+ && imm_for_neon_mov_operand (op, GET_MODE (op))")))
+
+(define_constraint "Dl"
+ "@internal
+ In ARM/Thumb-2 state a const_vector which can be used with a Neon vorr or
+ vbic instruction."
+ (and (match_code "const_vector")
+ (match_test "TARGET_32BIT
+ && imm_for_neon_logic_operand (op, GET_MODE (op))")))
+
+(define_constraint "DL"
+ "@internal
+ In ARM/Thumb-2 state a const_vector which can be used with a Neon vorn or
+ vand instruction."
+ (and (match_code "const_vector")
+ (match_test "TARGET_32BIT
+ && imm_for_neon_inv_logic_operand (op, GET_MODE (op))")))
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
+;; APPLE LOCAL begin v7 support. Merge from mainline
+
+(define_constraint "Dv"
+ "@internal
+ In ARM/Thumb-2 state a const_double which can be used with a VFP fconsts
+ or fconstd instruction."
+ (and (match_code "const_double")
+ (match_test "TARGET_32BIT && vfp3_const_double_rtx (op)")))
+;; APPLE LOCAL end v7 support. Merge from mainline
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+
+(define_memory_constraint "Ut"
+ "@internal
+ In ARM/Thumb-2 state an address valid for loading/storing opaque structure
+ types wider than TImode."
+ (and (match_code "mem")
+ (match_test "TARGET_32BIT && neon_struct_mem_operand (op)")))
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
+;; APPLE LOCAL begin v7 support. Merge from mainline
(define_memory_constraint "Uv"
"@internal
- In ARM state a valid VFP load/store address."
+ In ARM/Thumb-2 state a valid VFP load/store address."
(and (match_code "mem")
- (match_test "TARGET_ARM && arm_coproc_mem_operand (op, FALSE)")))
+ (match_test "TARGET_32BIT && arm_coproc_mem_operand (op, FALSE)")))
(define_memory_constraint "Uy"
"@internal
- In ARM state a valid iWMMX load/store address."
+ In ARM/Thumb-2 state a valid iWMMX load/store address."
+ (and (match_code "mem")
+ (match_test "TARGET_32BIT && arm_coproc_mem_operand (op, TRUE)")))
+;; APPLE LOCAL end v7 support. Merge from mainline
+
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+(define_memory_constraint "Un"
+ "@internal
+ In ARM/Thumb-2 state a valid address for Neon element and structure
+ load/store instructions."
+ (and (match_code "mem")
+ (match_test "TARGET_32BIT && neon_vector_mem_operand (op, FALSE)")))
+
+(define_memory_constraint "Us"
+ "@internal
+ In ARM/Thumb-2 state a valid address for non-offset loads/stores of
+ quad-word values in four ARM registers."
(and (match_code "mem")
- (match_test "TARGET_ARM && arm_coproc_mem_operand (op, TRUE)")))
+ (match_test "TARGET_32BIT && neon_vector_mem_operand (op, TRUE)")))
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
(define_memory_constraint "Uq"
"@internal
@@ -182,11 +265,13 @@
&& arm_legitimate_address_p (GET_MODE (op), XEXP (op, 0),
SIGN_EXTEND, 0)")))
+;; APPLE LOCAL begin v7 support. Merge from mainline
(define_memory_constraint "Q"
"@internal
- In ARM state an address that is a single base register."
+ In ARM/Thumb-2 state an address that is a single base register."
(and (match_code "mem")
(match_test "REG_P (XEXP (op, 0))")))
+;; APPLE LOCAL end v7 support. Merge from mainline
;; We used to have constraint letters for S and R in ARM state, but
;; all uses of these now appear to have been removed.
Added: llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8-neon.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8-neon.md?rev=76781&view=auto
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8-neon.md (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8-neon.md Wed Jul 22 15:36:27 2009
@@ -0,0 +1,1308 @@
+;; APPLE LOCAL file v7 support. Merge from Codesourcery
+;; ARM Cortex-A8 NEON scheduling description.
+;; Copyright (C) 2007 Free Software Foundation, Inc.
+;; Contributed by CodeSourcery.
+
+;; This file is part of GCC.
+
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+;; License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING. If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+
+(define_automaton "cortex_a8_neon")
+
+;; Only one load, store, permute, MCR or MRC instruction can be issued
+;; per cycle.
+(define_cpu_unit "cortex_a8_neon_issue_perm" "cortex_a8_neon")
+
+;; Only one data-processing instruction can be issued per cycle.
+(define_cpu_unit "cortex_a8_neon_issue_dp" "cortex_a8_neon")
+
+;; The VFPLite unit (non-pipelined).
+(define_cpu_unit "cortex_a8_vfplite" "cortex_a8_neon")
+
+;; We need a special mutual exclusion (to be used in addition to
+;; cortex_a8_neon_issue_dp) for the case when an instruction such as
+;; vmla.f is forwarded from E5 of the floating-point multiply pipeline to
+;; E2 of the floating-point add pipeline. On the cycle previous to that
+;; forward we must prevent issue of any instruction to the floating-point
+;; add pipeline, but still allow issue of a data-processing instruction
+;; to any of the other pipelines.
+(define_cpu_unit "cortex_a8_neon_issue_fadd" "cortex_a8_neon")
+
+;; Patterns of reservation.
+;; We model the NEON issue units as running in parallel with the core ones.
+;; We assume that multi-cycle NEON instructions get decomposed into
+;; micro-ops as they are issued into the NEON pipeline, and not as they
+;; are issued into the ARM pipeline. Dual issue may not occur except
+;; upon the first and last cycles of a multi-cycle instruction, but it
+;; is unclear whether two multi-cycle instructions can issue together (in
+;; this model they cannot). It is also unclear whether a pair of
+;; a multi-cycle and single-cycle instructions, that could potentially
+;; issue together, only do so if (say) the single-cycle one precedes
+;; the other.
+
+(define_reservation "cortex_a8_neon_dp"
+ "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp")
+(define_reservation "cortex_a8_neon_dp_2"
+ "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp,\
+ cortex_a8_neon_issue_dp")
+(define_reservation "cortex_a8_neon_dp_4"
+ "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp,\
+ cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+ cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+ cortex_a8_neon_issue_dp")
+
+(define_reservation "cortex_a8_neon_fadd"
+ "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp+\
+ cortex_a8_neon_issue_fadd")
+(define_reservation "cortex_a8_neon_fadd_2"
+ "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp+\
+ cortex_a8_neon_issue_fadd,\
+ cortex_a8_neon_issue_dp+cortex_a8_neon_issue_fadd")
+
+(define_reservation "cortex_a8_neon_perm"
+ "(cortex_a8_alu0|cortex_a8_alu1)+\
+ cortex_a8_neon_issue_perm")
+(define_reservation "cortex_a8_neon_perm_2"
+ "(cortex_a8_alu0|cortex_a8_alu1)+\
+ cortex_a8_neon_issue_perm,\
+ cortex_a8_neon_issue_perm")
+(define_reservation "cortex_a8_neon_perm_3"
+ "(cortex_a8_alu0|cortex_a8_alu1)+\
+ cortex_a8_neon_issue_perm,\
+ cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+ cortex_a8_neon_issue_perm")
+
+(define_reservation "cortex_a8_neon_ls"
+ "cortex_a8_issue_ls+cortex_a8_neon_issue_perm")
+(define_reservation "cortex_a8_neon_ls_2"
+ "cortex_a8_issue_ls+cortex_a8_neon_issue_perm,\
+ cortex_a8_neon_issue_perm")
+(define_reservation "cortex_a8_neon_ls_3"
+ "cortex_a8_issue_ls+cortex_a8_neon_issue_perm,\
+ cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+ cortex_a8_neon_issue_perm")
+(define_reservation "cortex_a8_neon_ls_4"
+ "cortex_a8_issue_ls+cortex_a8_neon_issue_perm,\
+ cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+ cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+ cortex_a8_neon_issue_perm")
+(define_reservation "cortex_a8_neon_ls_5"
+ "cortex_a8_issue_ls+cortex_a8_neon_issue_perm,\
+ cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+ cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+ cortex_a8_neon_issue_dp+cortex_a8_neon_issue_perm,\
+ cortex_a8_neon_issue_perm")
+
+(define_reservation "cortex_a8_neon_fmul_then_fadd"
+ "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp,\
+ nothing*3,\
+ cortex_a8_neon_issue_fadd")
+(define_reservation "cortex_a8_neon_fmul_then_fadd_2"
+ "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp,\
+ cortex_a8_neon_issue_dp,\
+ nothing*2,\
+ cortex_a8_neon_issue_fadd,\
+ cortex_a8_neon_issue_fadd")
+
+;; VFP instructions can only be single-issued into the NEON pipeline.
+(define_reservation "cortex_a8_vfp"
+ "(cortex_a8_alu0|cortex_a8_alu1)+cortex_a8_neon_issue_dp+\
+ cortex_a8_neon_issue_perm+cortex_a8_vfplite")
+
+;; VFP instructions.
+;; The VFPLite unit that executes these isn't pipelined; we give the
+;; worst-case latencies (and choose the double-precision ones where we
+;; do not distinguish on precision). We assume RunFast mode is not
+;; enabled and therefore do not model the possible VFP instruction
+;; execution in the NEON floating point pipelines, nor additional
+;; latencies for the processing of subnormals.
+;;
+;; TODO: RunFast mode could potentially be enabled when -ffast-math
+;; is specified.
+
+(define_insn_reservation "cortex_a8_vfp_add_sub" 10
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "type" "farith"))
+ "cortex_a8_vfp,cortex_a8_vfplite*9")
+
+(define_insn_reservation "cortex_a8_vfp_muls" 12
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "type" "fmuls"))
+ "cortex_a8_vfp,cortex_a8_vfplite*11")
+
+(define_insn_reservation "cortex_a8_vfp_muld" 17
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "type" "fmuld"))
+ "cortex_a8_vfp,cortex_a8_vfplite*16")
+
+(define_insn_reservation "cortex_a8_vfp_macs" 21
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "type" "fmacs"))
+ "cortex_a8_vfp,cortex_a8_vfplite*20")
+
+(define_insn_reservation "cortex_a8_vfp_macd" 26
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "type" "fmacd"))
+ "cortex_a8_vfp,cortex_a8_vfplite*25")
+
+(define_insn_reservation "cortex_a8_vfp_divs" 37
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "type" "fdivs"))
+ "cortex_a8_vfp,cortex_a8_vfplite*36")
+
+(define_insn_reservation "cortex_a8_vfp_divd" 65
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "type" "fdivd"))
+ "cortex_a8_vfp,cortex_a8_vfplite*64")
+
+;; Comparisons can actually take 7 cycles sometimes instead of four,
+;; but given all the other instructions lumped into type=ffarith that
+;; take four cycles, we pick that latency.
+(define_insn_reservation "cortex_a8_vfp_farith" 4
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "type" "ffarith"))
+ "cortex_a8_vfp,cortex_a8_vfplite*3")
+
+(define_insn_reservation "cortex_a8_vfp_cvt" 7
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "type" "f_cvt"))
+ "cortex_a8_vfp,cortex_a8_vfplite*6")
+
+;; NEON -> core transfers.
+
+(define_insn_reservation "neon_mrc" 20
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_mrc"))
+ "cortex_a8_neon_ls")
+
+(define_insn_reservation "neon_mrrc" 21
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_mrrc"))
+ "cortex_a8_neon_ls_2")
+
+;; The remainder of this file is auto-generated by neon-schedgen.
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N3.
+(define_insn_reservation "neon_int_1" 3
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_int_1"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their (D|Q)m operands at N1,
+;; their (D|Q)n operands at N2, and produce a result at N3.
+(define_insn_reservation "neon_int_2" 3
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_int_2"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N3.
+(define_insn_reservation "neon_int_3" 3
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_int_3"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N4.
+(define_insn_reservation "neon_int_4" 4
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_int_4"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their (D|Q)m operands at N1,
+;; their (D|Q)n operands at N2, and produce a result at N4.
+(define_insn_reservation "neon_int_5" 4
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_int_5"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N4.
+(define_insn_reservation "neon_vqneg_vqabs" 4
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vqneg_vqabs"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation produce a result at N3.
+(define_insn_reservation "neon_vmov" 3
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vmov"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and
+;; produce a result at N6.
+(define_insn_reservation "neon_vaba" 6
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vaba"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and
+;; produce a result at N6 on cycle 2.
+(define_insn_reservation "neon_vaba_qqq" 7
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vaba_qqq"))
+ "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their (D|Q)m operands at N1,
+;; their (D|Q)d operands at N3, and produce a result at N6.
+(define_insn_reservation "neon_vsma" 6
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vsma"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N6.
+(define_insn_reservation "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long" 6
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N6 on cycle 2.
+(define_insn_reservation "neon_mul_qqq_8_16_32_ddd_32" 7
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_mul_qqq_8_16_32_ddd_32"))
+ "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, and produce a result at N6 on cycle 2.
+(define_insn_reservation "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar" 7
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar"))
+ "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and
+;; produce a result at N6.
+(define_insn_reservation "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long" 6
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and
+;; produce a result at N6 on cycle 2.
+(define_insn_reservation "neon_mla_qqq_8_16" 7
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_mla_qqq_8_16"))
+ "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and
+;; produce a result at N6 on cycle 2.
+(define_insn_reservation "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long" 7
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long"))
+ "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and
+;; produce a result at N6 on cycle 4.
+(define_insn_reservation "neon_mla_qqq_32_qqd_32_scalar" 9
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_mla_qqq_32_qqd_32_scalar"))
+ "cortex_a8_neon_dp_4")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, and produce a result at N6.
+(define_insn_reservation "neon_mul_ddd_16_scalar_32_16_long_scalar" 6
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_mul_ddd_16_scalar_32_16_long_scalar"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, and produce a result at N6 on cycle 4.
+(define_insn_reservation "neon_mul_qqd_32_scalar" 9
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_mul_qqd_32_scalar"))
+ "cortex_a8_neon_dp_4")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and
+;; produce a result at N6.
+(define_insn_reservation "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar" 6
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N3.
+(define_insn_reservation "neon_shift_1" 3
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_shift_1"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N4.
+(define_insn_reservation "neon_shift_2" 4
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_shift_2"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N3 on cycle 2.
+(define_insn_reservation "neon_shift_3" 4
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_shift_3"))
+ "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N1.
+(define_insn_reservation "neon_vshl_ddd" 1
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vshl_ddd"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N4 on cycle 2.
+(define_insn_reservation "neon_vqshl_vrshl_vqrshl_qqq" 5
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vqshl_vrshl_vqrshl_qqq"))
+ "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their (D|Q)m operands at N1,
+;; their (D|Q)d operands at N3, and produce a result at N6.
+(define_insn_reservation "neon_vsra_vrsra" 6
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vsra_vrsra"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N5.
+(define_insn_reservation "neon_fp_vadd_ddd_vabs_dd" 5
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_fp_vadd_ddd_vabs_dd"))
+ "cortex_a8_neon_fadd")
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N5 on cycle 2.
+(define_insn_reservation "neon_fp_vadd_qqq_vabs_qq" 6
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_fp_vadd_qqq_vabs_qq"))
+ "cortex_a8_neon_fadd_2")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N5.
+(define_insn_reservation "neon_fp_vsum" 5
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_fp_vsum"))
+ "cortex_a8_neon_fadd")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, and produce a result at N5.
+(define_insn_reservation "neon_fp_vmul_ddd" 5
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_fp_vmul_ddd"))
+ "cortex_a8_neon_dp")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, and produce a result at N5 on cycle 2.
+(define_insn_reservation "neon_fp_vmul_qqd" 6
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_fp_vmul_qqd"))
+ "cortex_a8_neon_dp_2")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and
+;; produce a result at N9.
+(define_insn_reservation "neon_fp_vmla_ddd" 9
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_fp_vmla_ddd"))
+ "cortex_a8_neon_fmul_then_fadd")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N2, their (D|Q)d operands at N3, and
+;; produce a result at N9 on cycle 2.
+(define_insn_reservation "neon_fp_vmla_qqq" 10
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_fp_vmla_qqq"))
+ "cortex_a8_neon_fmul_then_fadd_2")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and
+;; produce a result at N9.
+(define_insn_reservation "neon_fp_vmla_ddd_scalar" 9
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_fp_vmla_ddd_scalar"))
+ "cortex_a8_neon_fmul_then_fadd")
+
+;; Instructions using this reservation read their (D|Q)n operands at N2,
+;; their (D|Q)m operands at N1, their (D|Q)d operands at N3, and
+;; produce a result at N9 on cycle 2.
+(define_insn_reservation "neon_fp_vmla_qqq_scalar" 10
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_fp_vmla_qqq_scalar"))
+ "cortex_a8_neon_fmul_then_fadd_2")
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N9.
+(define_insn_reservation "neon_fp_vrecps_vrsqrts_ddd" 9
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_fp_vrecps_vrsqrts_ddd"))
+ "cortex_a8_neon_fmul_then_fadd")
+
+;; Instructions using this reservation read their source operands at N2, and
+;; produce a result at N9 on cycle 2.
+(define_insn_reservation "neon_fp_vrecps_vrsqrts_qqq" 10
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_fp_vrecps_vrsqrts_qqq"))
+ "cortex_a8_neon_fmul_then_fadd_2")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N2.
+(define_insn_reservation "neon_bp_simple" 2
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_bp_simple"))
+ "cortex_a8_neon_perm")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N2 on cycle 2.
+(define_insn_reservation "neon_bp_2cycle" 3
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_bp_2cycle"))
+ "cortex_a8_neon_perm_2")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N2 on cycle 3.
+(define_insn_reservation "neon_bp_3cycle" 4
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_bp_3cycle"))
+ "cortex_a8_neon_perm_3")
+
+;; Instructions using this reservation produce a result at N1.
+(define_insn_reservation "neon_ldr" 1
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_ldr"))
+ "cortex_a8_neon_ls")
+
+;; Instructions using this reservation read their source operands at N1.
+(define_insn_reservation "neon_str" 0
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_str"))
+ "cortex_a8_neon_ls")
+
+;; Instructions using this reservation produce a result at N1 on cycle 2.
+(define_insn_reservation "neon_vld1_1_2_regs" 2
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vld1_1_2_regs"))
+ "cortex_a8_neon_ls_2")
+
+;; Instructions using this reservation produce a result at N1 on cycle 3.
+(define_insn_reservation "neon_vld1_3_4_regs" 3
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vld1_3_4_regs"))
+ "cortex_a8_neon_ls_3")
+
+;; Instructions using this reservation produce a result at N2 on cycle 2.
+(define_insn_reservation "neon_vld2_2_regs_vld1_vld2_all_lanes" 3
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vld2_2_regs_vld1_vld2_all_lanes"))
+ "cortex_a8_neon_ls_2")
+
+;; Instructions using this reservation produce a result at N2 on cycle 3.
+(define_insn_reservation "neon_vld2_4_regs" 4
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vld2_4_regs"))
+ "cortex_a8_neon_ls_3")
+
+;; Instructions using this reservation produce a result at N2 on cycle 4.
+(define_insn_reservation "neon_vld3_vld4" 5
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vld3_vld4"))
+ "cortex_a8_neon_ls_4")
+
+;; Instructions using this reservation read their source operands at N1.
+(define_insn_reservation "neon_vst1_1_2_regs_vst2_2_regs" 0
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs"))
+ "cortex_a8_neon_ls_2")
+
+;; Instructions using this reservation read their source operands at N1.
+(define_insn_reservation "neon_vst1_3_4_regs" 0
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vst1_3_4_regs"))
+ "cortex_a8_neon_ls_3")
+
+;; Instructions using this reservation read their source operands at N1.
+(define_insn_reservation "neon_vst2_4_regs_vst3_vst4" 0
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vst2_4_regs_vst3_vst4"))
+ "cortex_a8_neon_ls_4")
+
+;; Instructions using this reservation read their source operands at N1.
+(define_insn_reservation "neon_vst3_vst4" 0
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vst3_vst4"))
+ "cortex_a8_neon_ls_4")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N2 on cycle 3.
+(define_insn_reservation "neon_vld1_vld2_lane" 4
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vld1_vld2_lane"))
+ "cortex_a8_neon_ls_3")
+
+;; Instructions using this reservation read their source operands at N1, and
+;; produce a result at N2 on cycle 5.
+(define_insn_reservation "neon_vld3_vld4_lane" 6
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vld3_vld4_lane"))
+ "cortex_a8_neon_ls_5")
+
+;; Instructions using this reservation read their source operands at N1.
+(define_insn_reservation "neon_vst1_vst2_lane" 0
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vst1_vst2_lane"))
+ "cortex_a8_neon_ls_2")
+
+;; Instructions using this reservation read their source operands at N1.
+(define_insn_reservation "neon_vst3_vst4_lane" 0
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vst3_vst4_lane"))
+ "cortex_a8_neon_ls_3")
+
+;; Instructions using this reservation produce a result at N2 on cycle 2.
+(define_insn_reservation "neon_vld3_vld4_all_lanes" 3
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_vld3_vld4_all_lanes"))
+ "cortex_a8_neon_ls_3")
+
+;; Instructions using this reservation produce a result at N2.
+(define_insn_reservation "neon_mcr" 2
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_mcr"))
+ "cortex_a8_neon_perm")
+
+;; Instructions using this reservation produce a result at N2.
+(define_insn_reservation "neon_mcr_2_mcrr" 2
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "neon_type" "neon_mcr_2_mcrr"))
+ "cortex_a8_neon_perm_2")
+
+;; Exceptions to the default latencies.
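+;;
+;; Each (define_bypass latency producers consumers) form below overrides the
+;; producer's default latency with the given value whenever its result feeds
+;; one of the listed consumer classes.  For example, neon_int_1 is declared
+;; above with a default latency of 3, but the (define_bypass 2 "neon_int_1" ...)
+;; entry near the end of this list lowers that to 2 for the consumers it names.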
+
+(define_bypass 1 "neon_mcr_2_mcrr"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 1 "neon_mcr"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_vld3_vld4_all_lanes"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_vld3_vld4_lane"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_vld1_vld2_lane"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 4 "neon_vld3_vld4"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_vld2_4_regs"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_vld2_2_regs_vld1_vld2_all_lanes"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_vld1_3_4_regs"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 1 "neon_vld1_1_2_regs"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 0 "neon_ldr"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_bp_3cycle"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_bp_2cycle"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 1 "neon_bp_simple"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 9 "neon_fp_vrecps_vrsqrts_qqq"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 8 "neon_fp_vrecps_vrsqrts_ddd"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 9 "neon_fp_vmla_qqq_scalar"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 8 "neon_fp_vmla_ddd_scalar"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 9 "neon_fp_vmla_qqq"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 8 "neon_fp_vmla_ddd"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_fp_vmul_qqd"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 4 "neon_fp_vmul_ddd"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 4 "neon_fp_vsum"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_fp_vadd_qqq_vabs_qq"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 4 "neon_fp_vadd_ddd_vabs_dd"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_vsra_vrsra"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 4 "neon_vqshl_vrshl_vqrshl_qqq"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 0 "neon_vshl_ddd"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_shift_3"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_shift_2"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_shift_1"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 8 "neon_mul_qqd_32_scalar"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_mul_ddd_16_scalar_32_16_long_scalar"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 8 "neon_mla_qqq_32_qqd_32_scalar"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 6 "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 6 "neon_mla_qqq_8_16"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 6 "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 6 "neon_mul_qqq_8_16_32_ddd_32"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_vsma"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 6 "neon_vaba_qqq"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 5 "neon_vaba"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_vmov"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_vqneg_vqabs"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_int_5"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 3 "neon_int_4"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_int_3"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_int_2"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
+(define_bypass 2 "neon_int_1"
+ "neon_int_1,\
+ neon_int_4,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq")
+
Added: llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8.md?rev=76781&view=auto
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8.md (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/cortex-a8.md Wed Jul 22 15:36:27 2009
@@ -0,0 +1,273 @@
+;; APPLE LOCAL file v7 support. Merge from Codesourcery
+;; ARM Cortex-A8 scheduling description.
+;; Copyright (C) 2007 Free Software Foundation, Inc.
+;; Contributed by CodeSourcery.
+
+;; This file is part of GCC.
+
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+;; License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING. If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+
+(define_automaton "cortex_a8")
+
+;; Only one load/store instruction can be issued per cycle
+;; (although reservation of this unit is only required for single
+;; loads and stores -- see below).
+(define_cpu_unit "cortex_a8_issue_ls" "cortex_a8")
+
+;; Only one branch instruction can be issued per cycle.
+(define_cpu_unit "cortex_a8_issue_branch" "cortex_a8")
+
+;; The two ALU pipelines.
+(define_cpu_unit "cortex_a8_alu0" "cortex_a8")
+(define_cpu_unit "cortex_a8_alu1" "cortex_a8")
+
+;; The usual flow of an instruction through the pipelines.
+(define_reservation "cortex_a8_default"
+ "cortex_a8_alu0|cortex_a8_alu1")
+
+;; The flow of a branch instruction through the pipelines.
+(define_reservation "cortex_a8_branch"
+ "(cortex_a8_alu0+cortex_a8_issue_branch)|\
+ (cortex_a8_alu1+cortex_a8_issue_branch)")
+
+;; The flow of a load or store instruction through the pipeline in
+;; the case where that instruction consists of only one micro-op...
+(define_reservation "cortex_a8_load_store_1"
+ "(cortex_a8_alu0+cortex_a8_issue_ls)|\
+ (cortex_a8_alu1+cortex_a8_issue_ls)")
+
+;; ...and in the case of two micro-ops. We don't need to reserve
+;; cortex_a8_issue_ls here because dual issue is altogether forbidden
+;; during the issue cycle of the first micro-op.  (Rather than model a
+;; separate issue unit, we reserve alu0 and alu1 to prevent any other
+;; instructions from being issued on that first cycle.)
+;; Even though the load/store pipeline is usually available in either
+;; ALU pipe, multi-cycle instructions always issue in pipeline 0. This
+;; reservation is therefore the same as cortex_a8_multiply_2 below.
+(define_reservation "cortex_a8_load_store_2"
+ "cortex_a8_alu0+cortex_a8_alu1,\
+ cortex_a8_alu0")
+
+;; The flow of a single-cycle multiplication.
+(define_reservation "cortex_a8_multiply"
+ "cortex_a8_alu0")
+
+;; The flow of a multiplication instruction that gets decomposed into
+;; two micro-ops. The two micro-ops will be issued to pipeline 0 on
+;; successive cycles. Dual issue cannot happen at the same time as the
+;; first of the micro-ops.
+(define_reservation "cortex_a8_multiply_2"
+ "cortex_a8_alu0+cortex_a8_alu1,\
+ cortex_a8_alu0")
+
+;; Similarly, the flow of a multiplication instruction that gets
+;; decomposed into three micro-ops. Dual issue cannot occur except on
+;; the cycle upon which the third micro-op is issued.
+(define_reservation "cortex_a8_multiply_3"
+ "cortex_a8_alu0+cortex_a8_alu1,\
+ cortex_a8_alu0+cortex_a8_alu1,\
+ cortex_a8_alu0")
+
+;; The model given here assumes that all instructions are unconditional.
+
+;; Data processing instructions, but not move instructions.
+
+;; We include CLZ with these since it has the same execution pattern
+;; (source read in E2 and destination available at the end of that cycle).
+(define_insn_reservation "cortex_a8_alu" 2
+ (and (eq_attr "tune" "cortexa8")
+ (ior (and (eq_attr "type" "alu")
+ (not (eq_attr "insn" "mov,mvn")))
+ (eq_attr "insn" "clz")))
+ "cortex_a8_default")
+
+(define_insn_reservation "cortex_a8_alu_shift" 2
+ (and (eq_attr "tune" "cortexa8")
+ (and (eq_attr "type" "alu_shift")
+ (not (eq_attr "insn" "mov,mvn"))))
+ "cortex_a8_default")
+
+(define_insn_reservation "cortex_a8_alu_shift_reg" 2
+ (and (eq_attr "tune" "cortexa8")
+ (and (eq_attr "type" "alu_shift_reg")
+ (not (eq_attr "insn" "mov,mvn"))))
+ "cortex_a8_default")
+
+;; Move instructions.
+
+(define_insn_reservation "cortex_a8_mov" 1
+ (and (eq_attr "tune" "cortexa8")
+ (and (eq_attr "type" "alu,alu_shift,alu_shift_reg")
+ (eq_attr "insn" "mov,mvn")))
+ "cortex_a8_default")
+
+;; Exceptions to the default latencies for data processing instructions.
+
+;; A move followed by an ALU instruction with no early dep.
+;; (Such a pair can be issued in parallel, hence latency zero.)
+(define_bypass 0 "cortex_a8_mov" "cortex_a8_alu")
+(define_bypass 0 "cortex_a8_mov" "cortex_a8_alu_shift"
+ "arm_no_early_alu_shift_dep")
+(define_bypass 0 "cortex_a8_mov" "cortex_a8_alu_shift_reg"
+ "arm_no_early_alu_shift_value_dep")
+
+;; An ALU instruction followed by an ALU instruction with no early dep.
+(define_bypass 1 "cortex_a8_alu,cortex_a8_alu_shift,cortex_a8_alu_shift_reg"
+ "cortex_a8_alu")
+(define_bypass 1 "cortex_a8_alu,cortex_a8_alu_shift,cortex_a8_alu_shift_reg"
+ "cortex_a8_alu_shift"
+ "arm_no_early_alu_shift_dep")
+(define_bypass 1 "cortex_a8_alu,cortex_a8_alu_shift,cortex_a8_alu_shift_reg"
+ "cortex_a8_alu_shift_reg"
+ "arm_no_early_alu_shift_value_dep")
+
+;; Multiplication instructions. These are categorized according to their
+;; reservation behaviour and the need below to distinguish certain
+;; varieties for bypasses. Results are available at the E5 stage
+;; (but some of these are multi-cycle instructions which explains the
+;; latencies below).
+
+(define_insn_reservation "cortex_a8_mul" 6
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "insn" "mul,smulxy,smmul"))
+ "cortex_a8_multiply_2")
+
+(define_insn_reservation "cortex_a8_mla" 6
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "insn" "mla,smlaxy,smlawy,smmla,smlad,smlsd"))
+ "cortex_a8_multiply_2")
+
+(define_insn_reservation "cortex_a8_mull" 7
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "insn" "smull,umull,smlal,umlal,umaal,smlalxy"))
+ "cortex_a8_multiply_3")
+
+(define_insn_reservation "cortex_a8_smulwy" 5
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "insn" "smulwy,smuad,smusd"))
+ "cortex_a8_multiply")
+
+;; smlald and smlsld are multiply-accumulate instructions but do not
+;; receive bypassed data from other multiplication results; thus, they
+;; cannot go in cortex_a8_mla above. (See below for bypass details.)
+(define_insn_reservation "cortex_a8_smlald" 6
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "insn" "smlald,smlsld"))
+ "cortex_a8_multiply_2")
+
+;; A multiply with a single-register result or an MLA, followed by an
+;; MLA with an accumulator dependency, has its result forwarded so two
+;; such instructions can issue back-to-back.
+(define_bypass 1 "cortex_a8_mul,cortex_a8_mla,cortex_a8_smulwy"
+ "cortex_a8_mla"
+ "arm_mac_accumulator_is_mul_result")
+
+;; A multiply followed by an ALU instruction needing the multiply
+;; result only at E2 has lower latency than one needing it at E1.
+(define_bypass 4 "cortex_a8_mul,cortex_a8_mla,cortex_a8_mull,\
+ cortex_a8_smulwy,cortex_a8_smlald"
+ "cortex_a8_alu")
+(define_bypass 4 "cortex_a8_mul,cortex_a8_mla,cortex_a8_mull,\
+ cortex_a8_smulwy,cortex_a8_smlald"
+ "cortex_a8_alu_shift"
+ "arm_no_early_alu_shift_dep")
+(define_bypass 4 "cortex_a8_mul,cortex_a8_mla,cortex_a8_mull,\
+ cortex_a8_smulwy,cortex_a8_smlald"
+ "cortex_a8_alu_shift_reg"
+ "arm_no_early_alu_shift_value_dep")
+
+;; Load instructions.
+;; The presence of any register writeback is ignored here.
+
+;; A load result has latency 3 unless the dependent instruction has
+;; no early dep, in which case it is only latency two.
+;; We assume 64-bit alignment for doubleword loads.
+(define_insn_reservation "cortex_a8_load1_2" 3
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "type" "load1,load2,load_byte"))
+ "cortex_a8_load_store_1")
+
+(define_bypass 2 "cortex_a8_load1_2"
+ "cortex_a8_alu")
+(define_bypass 2 "cortex_a8_load1_2"
+ "cortex_a8_alu_shift"
+ "arm_no_early_alu_shift_dep")
+(define_bypass 2 "cortex_a8_load1_2"
+ "cortex_a8_alu_shift_reg"
+ "arm_no_early_alu_shift_value_dep")
+
+;; We do not currently model the fact that loads with scaled register
+;; offsets that are not LSL #2 have an extra cycle latency (they issue
+;; as two micro-ops).
+
+;; A load multiple of three registers is usually issued as two micro-ops.
+;; The first register will be available at E3 of the first iteration,
+;; the second at E3 of the second iteration, and the third at E4 of
+;; the second iteration. A load multiple of four registers is usually
+;; issued as two micro-ops.
+(define_insn_reservation "cortex_a8_load3_4" 5
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "type" "load3,load4"))
+ "cortex_a8_load_store_2")
+
+(define_bypass 4 "cortex_a8_load3_4"
+ "cortex_a8_alu")
+(define_bypass 4 "cortex_a8_load3_4"
+ "cortex_a8_alu_shift"
+ "arm_no_early_alu_shift_dep")
+(define_bypass 4 "cortex_a8_load3_4"
+ "cortex_a8_alu_shift_reg"
+ "arm_no_early_alu_shift_value_dep")
+
+;; Store instructions.
+;; Writeback is again ignored.
+
+(define_insn_reservation "cortex_a8_store1_2" 0
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "type" "store1,store2"))
+ "cortex_a8_load_store_1")
+
+(define_insn_reservation "cortex_a8_store3_4" 0
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "type" "store3,store4"))
+ "cortex_a8_load_store_2")
+
+;; An ALU instruction acting as a producer for a store instruction
+;; that only uses the result as the value to be stored (as opposed to
+;; using it to calculate the address) has latency zero; the store
+;; reads the value to be stored at the start of E3 and the ALU insn
+;; writes it at the end of E2. Move instructions actually produce the
+;; result at the end of E1, but since we don't have delay slots, the
+;; scheduling behaviour will be the same.
+(define_bypass 0 "cortex_a8_alu,cortex_a8_alu_shift,\
+ cortex_a8_alu_shift_reg,cortex_a8_mov"
+ "cortex_a8_store1_2,cortex_a8_store3_4"
+ "arm_no_early_store_addr_dep")
+
+;; Branch instructions
+
+(define_insn_reservation "cortex_a8_branch" 0
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "type" "branch"))
+ "cortex_a8_branch")
+
+;; Call latencies are not predictable. A semi-arbitrary very large
+;; number is used as "positive infinity" so that everything should be
+;; finished by the time of return.
+(define_insn_reservation "cortex_a8_call" 32
+ (and (eq_attr "tune" "cortexa8")
+ (eq_attr "type" "call"))
+ "cortex_a8_issue_branch")
+
+;; NEON (including VFP) instructions.
+
+(include "cortex-a8-neon.md")
+
Added: llvm-gcc-4.2/trunk/gcc/config/arm/cortex-r4.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/cortex-r4.md?rev=76781&view=auto
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/cortex-r4.md (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/cortex-r4.md Wed Jul 22 15:36:27 2009
@@ -0,0 +1,289 @@
+;; APPLE LOCAL file v7 support. Merge from Codesourcery
+;; ARM Cortex-R4 scheduling description.
+;; Copyright (C) 2007 Free Software Foundation, Inc.
+;; Contributed by CodeSourcery.
+
+;; This file is part of GCC.
+
+;; GCC is distributed in the hope that it will be useful, but WITHOUT
+;; ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+;; or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
+;; License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING. If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+
+(define_automaton "cortex_r4")
+
+;; We approximate the dual-issue constraints of this core using four
+;; "issue units" and a reservation matrix as follows. The numbers indicate
+;; the instruction groups' preferences in order. Multiple entries for
+;; the same numbered preference indicate units that must be reserved
+;; together.
+;;
+;; Issue unit:          A    B    C    ALU
+;;
+;; ALU w/o reg shift    1st  2nd       1st and 2nd
+;; ALU w/ reg shift     1st  2nd  2nd  1st and 2nd
+;; Moves                1st  2nd       2nd
+;; Multiplication       1st            1st
+;; Division             1st            1st
+;; Load/store single    1st       1st
+;; Other load/store     1st  1st
+;; Branches                  1st
+
+(define_cpu_unit "cortex_r4_issue_a" "cortex_r4")
+(define_cpu_unit "cortex_r4_issue_b" "cortex_r4")
+(define_cpu_unit "cortex_r4_issue_c" "cortex_r4")
+(define_cpu_unit "cortex_r4_issue_alu" "cortex_r4")
+
+(define_reservation "cortex_r4_alu"
+ "(cortex_r4_issue_a+cortex_r4_issue_alu)|\
+ (cortex_r4_issue_b+cortex_r4_issue_alu)")
+(define_reservation "cortex_r4_alu_shift_reg"
+ "(cortex_r4_issue_a+cortex_r4_issue_alu)|\
+ (cortex_r4_issue_b+cortex_r4_issue_c+\
+ cortex_r4_issue_alu)")
+(define_reservation "cortex_r4_mov"
+ "cortex_r4_issue_a|(cortex_r4_issue_b+\
+ cortex_r4_issue_alu)")
+(define_reservation "cortex_r4_mul" "cortex_r4_issue_a+cortex_r4_issue_alu")
+(define_reservation "cortex_r4_mul_2"
+ "(cortex_r4_issue_a+cortex_r4_issue_alu)*2")
+;; Division instructions execute out-of-order with respect to the
+;; rest of the pipeline and only require reservations on their first and
+;; final cycles.
+(define_reservation "cortex_r4_div_9"
+ "cortex_r4_issue_a+cortex_r4_issue_alu,\
+ nothing*7,\
+ cortex_r4_issue_a+cortex_r4_issue_alu")
+(define_reservation "cortex_r4_div_10"
+ "cortex_r4_issue_a+cortex_r4_issue_alu,\
+ nothing*8,\
+ cortex_r4_issue_a+cortex_r4_issue_alu")
+(define_reservation "cortex_r4_load_store"
+ "cortex_r4_issue_a+cortex_r4_issue_c")
+(define_reservation "cortex_r4_load_store_2"
+ "(cortex_r4_issue_a+cortex_r4_issue_b)*2")
+(define_reservation "cortex_r4_branch" "cortex_r4_issue_b")
+
+;; We assume that all instructions are unconditional.
+
+;; Data processing instructions. Moves without shifts are kept separate
+;; for the purposes of the dual-issue constraints above.
+(define_insn_reservation "cortex_r4_alu" 2
+ (and (eq_attr "tune" "cortexr4")
+ (and (eq_attr "type" "alu")
+ (not (eq_attr "insn" "mov"))))
+ "cortex_r4_alu")
+
+(define_insn_reservation "cortex_r4_mov" 2
+ (and (eq_attr "tune" "cortexr4")
+ (and (eq_attr "type" "alu")
+ (eq_attr "insn" "mov")))
+ "cortex_r4_mov")
+
+(define_insn_reservation "cortex_r4_alu_shift" 2
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "type" "alu_shift"))
+ "cortex_r4_alu")
+
+(define_insn_reservation "cortex_r4_alu_shift_reg" 2
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "type" "alu_shift_reg"))
+ "cortex_r4_alu_shift_reg")
+
+;; An ALU instruction followed by an ALU instruction with no early dep.
+(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
+ cortex_r4_mov"
+ "cortex_r4_alu")
+(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
+ cortex_r4_mov"
+ "cortex_r4_alu_shift"
+ "arm_no_early_alu_shift_dep")
+(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
+ cortex_r4_mov"
+ "cortex_r4_alu_shift_reg"
+ "arm_no_early_alu_shift_value_dep")
+
+;; In terms of availabilities, a consumer mov could theoretically be
+;; issued together with a producer ALU instruction, without stalls.
+;; In practice this cannot happen because mov;add (in that order) is not
+;; eligible for dual issue and furthermore dual issue is not permitted
+;; when a dependency is involved. We therefore note it as latency one.
+;; A mov followed by another of the same is also latency one.
+(define_bypass 1 "cortex_r4_alu,cortex_r4_alu_shift,cortex_r4_alu_shift_reg,\
+ cortex_r4_mov"
+ "cortex_r4_mov")
+
+;; qadd, qdadd, qsub and qdsub are not currently emitted, and neither are
+;; media data processing instructions nor sad instructions.
+
+;; Multiplication instructions.
+
+(define_insn_reservation "cortex_r4_mul_4" 4
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "insn" "mul,smmul"))
+ "cortex_r4_mul_2")
+
+(define_insn_reservation "cortex_r4_mul_3" 3
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "insn" "smulxy,smulwy,smuad,smusd"))
+ "cortex_r4_mul")
+
+(define_insn_reservation "cortex_r4_mla_4" 4
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "insn" "mla,smmla,smmls"))
+ "cortex_r4_mul_2")
+
+(define_insn_reservation "cortex_r4_mla_3" 3
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "insn" "smlaxy,smlawy,smlad,smlsd"))
+ "cortex_r4_mul")
+
+(define_insn_reservation "cortex_r4_smlald" 3
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "insn" "smlald,smlsld"))
+ "cortex_r4_mul")
+
+(define_insn_reservation "cortex_r4_mull" 4
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "insn" "smull,umull,umlal,umaal"))
+ "cortex_r4_mul_2")
+
+;; A multiply or an MLA with a single-register result, followed by an
+;; MLA with an accumulator dependency, has its result forwarded.
+(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3"
+ "cortex_r4_mla_3,cortex_r4_mla_4"
+ "arm_mac_accumulator_is_mul_result")
+
+(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4"
+ "cortex_r4_mla_3,cortex_r4_mla_4"
+ "arm_mac_accumulator_is_mul_result")
+
+;; A multiply followed by an ALU instruction needing the multiply
+;; result only at ALU has lower latency than one needing it at Shift.
+(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
+ "cortex_r4_alu")
+(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
+ "cortex_r4_alu_shift"
+ "arm_no_early_alu_shift_dep")
+(define_bypass 2 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
+ "cortex_r4_alu_shift_reg"
+ "arm_no_early_alu_shift_value_dep")
+(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
+ "cortex_r4_alu")
+(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
+ "cortex_r4_alu_shift"
+ "arm_no_early_alu_shift_dep")
+(define_bypass 3 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
+ "cortex_r4_alu_shift_reg"
+ "arm_no_early_alu_shift_value_dep")
+
+;; A multiply followed by a mov has one cycle lower latency again.
+(define_bypass 1 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
+ "cortex_r4_mov")
+(define_bypass 2 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
+ "cortex_r4_mov")
+
+;; We guess that division of A/B using sdiv or udiv, on average,
+;; is performed with B having ten more leading zeros than A.
+;; This gives a latency of nine for udiv and ten for sdiv.
+(define_insn_reservation "cortex_r4_udiv" 9
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "insn" "udiv"))
+ "cortex_r4_div_9")
+
+(define_insn_reservation "cortex_r4_sdiv" 10
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "insn" "sdiv"))
+ "cortex_r4_div_10")
+
+;; Branches. We assume correct prediction.
+
+(define_insn_reservation "cortex_r4_branch" 0
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "type" "branch"))
+ "cortex_r4_branch")
+
+;; Call latencies are not predictable. A semi-arbitrary very large
+;; number is used as "positive infinity" so that everything should be
+;; finished by the time of return.
+(define_insn_reservation "cortex_r4_call" 32
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "type" "call"))
+ "nothing")
+
+;; Status register access instructions are not currently emitted.
+
+;; Load instructions.
+;; We do not model the "addr_md_3cycle" cases and assume that
+;; accesses following are correctly aligned.
+
+(define_insn_reservation "cortex_r4_load_1_2" 3
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "type" "load1,load2"))
+ "cortex_r4_load_store")
+
+(define_insn_reservation "cortex_r4_load_3_4" 4
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "type" "load3,load4"))
+ "cortex_r4_load_store_2")
+
+;; If a producing load is followed by an instruction consuming only
+;; as a Normal Reg, there is one fewer cycle of latency.
+
+(define_bypass 2 "cortex_r4_load_1_2"
+ "cortex_r4_alu")
+(define_bypass 2 "cortex_r4_load_1_2"
+ "cortex_r4_alu_shift"
+ "arm_no_early_alu_shift_dep")
+(define_bypass 2 "cortex_r4_load_1_2"
+ "cortex_r4_alu_shift_reg"
+ "arm_no_early_alu_shift_value_dep")
+
+(define_bypass 3 "cortex_r4_load_3_4"
+ "cortex_r4_alu")
+(define_bypass 3 "cortex_r4_load_3_4"
+ "cortex_r4_alu_shift"
+ "arm_no_early_alu_shift_dep")
+(define_bypass 3 "cortex_r4_load_3_4"
+ "cortex_r4_alu_shift_reg"
+ "arm_no_early_alu_shift_value_dep")
+
+;; If a producing load is followed by an instruction consuming only
+;; as a Late Reg, there are two fewer cycles of latency. Such consumer
+;; instructions are moves and stores.
+
+(define_bypass 1 "cortex_r4_load_1_2"
+ "cortex_r4_mov,cortex_r4_store_1_2,cortex_r4_store_3_4")
+(define_bypass 2 "cortex_r4_load_3_4"
+ "cortex_r4_mov,cortex_r4_store_1_2,cortex_r4_store_3_4")
+
+;; If a producer's result is required as the base or offset of a load,
+;; there is an extra cycle latency.
+
+(define_bypass 3 "cortex_r4_alu,cortex_r4_mov,cortex_r4_alu_shift,\
+ cortex_r4_alu_shift_reg"
+ "cortex_r4_load_1_2,cortex_r4_load_3_4")
+
+(define_bypass 4 "cortex_r4_mul_3,cortex_r4_mla_3,cortex_r4_smlald"
+ "cortex_r4_load_1_2,cortex_r4_load_3_4")
+
+(define_bypass 5 "cortex_r4_mul_4,cortex_r4_mla_4,cortex_r4_mull"
+ "cortex_r4_load_1_2,cortex_r4_load_3_4")
+
+;; Store instructions.
+
+(define_insn_reservation "cortex_r4_store_1_2" 0
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "type" "store1,store2"))
+ "cortex_r4_load_store")
+
+(define_insn_reservation "cortex_r4_store_3_4" 0
+ (and (eq_attr "tune" "cortexr4")
+ (eq_attr "type" "store3,store4"))
+ "cortex_r4_load_store_2")
+
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/darwin.h
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/darwin.h?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/darwin.h (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/darwin.h Wed Jul 22 15:36:27 2009
@@ -52,7 +52,9 @@
#define REGISTER_PREFIX ""
-/* The assembler's names for the registers. */
+/* The assembler's names for the registers.  Note that the ?xx registers are
+ * there so that VFPv3/NEON registers D16-D31 have the same spacing as D0-D15
+ * (each of which is overlaid on two S registers), although there are no
+ * actual single-precision registers which correspond to D16-D31. */
#ifndef REGISTER_NAMES
#define REGISTER_NAMES \
{ \
@@ -73,6 +75,10 @@
"s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", \
"s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", \
"s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31", \
+ "d16", "?16", "d17", "?17", "d18", "?18", "d19", "?19", \
+ "d20", "?20", "d21", "?21", "d22", "?22", "d23", "?23", \
+ "d24", "?24", "d25", "?25", "d26", "?26", "d27", "?27", \
+ "d28", "?28", "d29", "?29", "d30", "?30", "d31", "?31", \
"vfpcc" \
}
#endif
@@ -162,22 +168,30 @@
{"mvdx13", 40}, \
{"mvdx14", 41}, \
{"mvdx15", 42}, \
- {"d0", 63}, \
- {"d1", 65}, \
- {"d2", 67}, \
- {"d3", 69}, \
- {"d4", 71}, \
- {"d5", 73}, \
- {"d6", 75}, \
- {"d7", 77}, \
- {"d8", 79}, \
- {"d9", 81}, \
- {"d10", 83}, \
- {"d11", 85}, \
- {"d12", 87}, \
- {"d13", 89}, \
- {"d14", 91}, \
- {"d15", 93}, \
+ {"d0", 63}, {"q0", 63}, \
+ {"d1", 65}, \
+ {"d2", 67}, {"q1", 67}, \
+ {"d3", 69}, \
+ {"d4", 71}, {"q2", 71}, \
+ {"d5", 73}, \
+ {"d6", 75}, {"q3", 75}, \
+ {"d7", 77}, \
+ {"d8", 79}, {"q4", 79}, \
+ {"d9", 81}, \
+ {"d10", 83}, {"q5", 83}, \
+ {"d11", 85}, \
+ {"d12", 87}, {"q6", 87}, \
+ {"d13", 89}, \
+ {"d14", 91}, {"q7", 91}, \
+ {"d15", 93}, \
+ {"q8", 95}, \
+ {"q9", 99}, \
+ {"q10", 103}, \
+ {"q11", 107}, \
+ {"q12", 111}, \
+ {"q13", 115}, \
+ {"q14", 119}, \
+ {"q15", 123} \
}
#endif
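
For illustration (not part of the patch): reading the alias table, d0 starts at hard register 63 and each D register occupies two slots, so dN is 63 + 2*N and qN (a pair of D registers) is 63 + 4*N. That is exactly why the dummy "?16".."?31" names were added to REGISTER_NAMES above: they keep D16-D31 on the same two-slot stride as D0-D15 even though no S registers overlay them. A sketch of the arithmetic, with the base constant inferred from the {"d0", 63} entry rather than taken from arm.h:

    /* Sketch only; constant and formulas inferred from the alias table.  */
    #define DARWIN_FIRST_VFP_HARD_REG 63   /* from {"d0", 63} */

    static int d_hard_regno (int n) { return DARWIN_FIRST_VFP_HARD_REG + 2 * n; } /* d15 -> 93, d16 -> 95 */
    static int q_hard_regno (int n) { return DARWIN_FIRST_VFP_HARD_REG + 4 * n; } /* q8 -> 95, q15 -> 123 */
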
@@ -200,6 +214,13 @@
march=armv5tej:armv5; \
march=xscale:xscale; \
march=armv4t:armv4t; \
+ march=armv7:armv7; \
+ march=armv7-a:armv7; \
+ march=armv7-r:armv7; \
+ march=armv7-m:armv7; \
+ march=armv7a:armv7; \
+ march=armv7r:armv7; \
+ march=armv7m:armv7; \
mcpu=arm10tdmi:armv5; \
mcpu=arm1020t:armv5; \
mcpu=arm9e:armv5; \
@@ -216,9 +237,12 @@
mcpu=arm1136jf-s:armv6; \
mcpu=arm1176jz-s:armv6; \
mcpu=arm1176jzf-s:armv6; \
+ mcpu=cortex-a8:armv7; \
+ mcpu=cortex-r4:armv7; \
+ mcpu=cortex-m3:armv7; \
:arm -force_cpusubtype_ALL}"
-#define DARWIN_MINVERSION_SPEC "2.0"
+#define DARWIN_MINVERSION_SPEC "3.0"
/* Default cc1 option for specifying minimum version number. */
#define DARWIN_CC1_MINVERSION_SPEC "-miphoneos-version-min=%(darwin_minversion)"
@@ -235,7 +259,7 @@
#define SUBTARGET_EXTRA_SPECS \
DARWIN_EXTRA_SPECS \
{ "darwin_arch", DARWIN_SUBARCH_SPEC }, \
- { "darwin_subarch", DARWIN_SUBARCH_SPEC },
+ { "darwin_subarch", DARWIN_SUBARCH_SPEC }
/* This can go away once we can feature test the assembler correctly. */
#define ASM_DEBUG_SPEC ""
@@ -245,7 +269,7 @@
if (1) \
{ \
if (!darwin_macosx_version_min && !darwin_iphoneos_version_min) \
- darwin_iphoneos_version_min = "2.0"; \
+ darwin_iphoneos_version_min = "3.0"; \
if (MACHO_DYNAMIC_NO_PIC_P) \
{ \
if (flag_pic) \
@@ -265,23 +289,18 @@
/* Use -mlongcalls for kexts */ \
if (flag_mkernel || flag_apple_kext) \
target_flags |= MASK_LONG_CALLS; \
+ /* GCC 4.2+ only works with SDK 3.0+ */ \
+ if (darwin_iphoneos_version_min && \
+ strverscmp (darwin_iphoneos_version_min, "3.0") < 0) \
+ darwin_reserve_r9_on_v6 = 1; \
} \
} while(0)
-/* We reserve r9 on darwin for thread local data. */
+/* APPLE LOCAL begin 5571707 Allow R9 as caller-saved register */
#undef SUBTARGET_CONDITIONAL_REGISTER_USAGE
#define SUBTARGET_CONDITIONAL_REGISTER_USAGE \
- if (1) \
- { \
- fixed_regs[9] = 1; \
- call_used_regs[9] = 1; \
- } \
- if (TARGET_THUMB) \
- { \
- fixed_regs[THUMB_HARD_FRAME_POINTER_REGNUM] = 1; \
- call_used_regs[THUMB_HARD_FRAME_POINTER_REGNUM] = 1; \
- global_regs[THUMB_HARD_FRAME_POINTER_REGNUM] = 1; \
- }
+ arm_darwin_subtarget_conditional_register_usage();
+/* APPLE LOCAL end 5571707 Allow R9 as caller-saved register */
#undef TARGET_MACHO
#define TARGET_MACHO 1
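
The body of arm_darwin_subtarget_conditional_register_usage() is not shown in this hunk (it lives in the ARM back end proper), so the following is only a plausible sketch, reconstructed from the inline code it replaces and from the darwin_reserve_r9_on_v6 flag set in SUBTARGET_OVERRIDE_OPTIONS above; the committed function may differ:

    /* Hypothetical sketch, not the committed implementation.  */
    void
    arm_darwin_subtarget_conditional_register_usage (void)
    {
      /* r9 is reserved only when targeting an SDK older than 3.0;
         otherwise it is an ordinary caller-saved register (5571707).  */
      if (darwin_reserve_r9_on_v6)
        {
          fixed_regs[9] = 1;
          call_used_regs[9] = 1;
        }
      if (TARGET_THUMB)
        {
          fixed_regs[THUMB_HARD_FRAME_POINTER_REGNUM] = 1;
          call_used_regs[THUMB_HARD_FRAME_POINTER_REGNUM] = 1;
          global_regs[THUMB_HARD_FRAME_POINTER_REGNUM] = 1;
        }
    }
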
@@ -308,12 +327,13 @@
#undef SUBTARGET_ASM_DECLARE_FUNCTION_NAME
#define SUBTARGET_ASM_DECLARE_FUNCTION_NAME ARM_DECLARE_FUNCTION_NAME
-/* We default to VFP */
-#define FPUTYPE_DEFAULT FPUTYPE_VFP
+/* APPLE LOCAL begin 6093388 -mfpu=neon default for v7a */
+/* We default to VFP for v6, NEON for v7 */
+#define FPUTYPE_DEFAULT (arm_arch7a ? FPUTYPE_NEON : FPUTYPE_VFP)
#undef TARGET_DEFAULT_FLOAT_ABI
-#define TARGET_DEFAULT_FLOAT_ABI (arm_arch6 ? ARM_FLOAT_ABI_SOFTFP : ARM_FLOAT_ABI_SOFT)
-
+#define TARGET_DEFAULT_FLOAT_ABI ((arm_arch6 || arm_arch7a) ? ARM_FLOAT_ABI_SOFTFP : ARM_FLOAT_ABI_SOFT)
+/* APPLE LOCAL end 6093388 -mfpu=neon default for v7a */
#undef REGISTER_TARGET_PRAGMAS
#define REGISTER_TARGET_PRAGMAS DARWIN_REGISTER_TARGET_PRAGMAS
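
For illustration (not part of the patch): with -mfpu=neon now the default for v7a (still softfp), user code can simply test __ARM_NEON__ instead of wiring the FPU choice into build flags. vadd_s8 is a standard NEON intrinsic assumed to be declared by arm_neon.h; the function name here is made up:

    #ifdef __ARM_NEON__
    #include <arm_neon.h>

    int8x8_t add_bytes (int8x8_t a, int8x8_t b)
    {
      return vadd_s8 (a, b);   /* expected to become a single vadd.i8 */
    }
    #endif
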
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/elf.h
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/elf.h?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/elf.h (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/elf.h Wed Jul 22 15:36:27 2009
@@ -96,8 +96,11 @@
/* Define this macro if jump tables (for `tablejump' insns) should be
output in the text section, along with the assembler instructions.
Otherwise, the readonly data section is used. */
-/* We put ARM jump tables in the text section, because it makes the code
- more efficient, but for Thumb it's better to put them out of band. */
+/* APPLE LOCAL begin v7 support. Merge from Codesourcery */
+/* We put ARM and Thumb-2 jump tables in the text section, because it makes
+ the code more efficient, but for Thumb-1 it's better to put them out of
+ band. */
+/* APPLE LOCAL end v7 support. Merge from Codesourcery */
/* APPLE LOCAL begin ARM compact switch tables */
/* The above is no longer true. */
#define JUMP_TABLES_IN_TEXT_SECTION (TARGET_EITHER)
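
For illustration (not part of the patch): with JUMP_TABLES_IN_TEXT_SECTION defined as (TARGET_EITHER), the dispatch table for a dense switch is emitted inline in .text next to the branch for both ARM and Thumb, rather than in the read-only data section; per the comments above, that is the choice the APPLE LOCAL compact-switch-tables change makes. A made-up example of code whose jump table would land in .text under this definition:

    int classify (int k)
    {
      switch (k)
        {
        case 0:  return 10;
        case 1:  return 21;
        case 2:  return 33;
        case 3:  return 47;
        case 4:  return 58;
        default: return -1;
        }
    }
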
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/fpa.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/fpa.md?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/fpa.md (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/fpa.md Wed Jul 22 15:36:27 2009
@@ -22,6 +22,12 @@
;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.
+;; APPLE LOCAL begin v7 support. Merge from mainline
+;; Some FPA mnemonics are ambiguous between conditional infixes and
+;; conditional suffixes. All instructions use a conditional infix,
+;; even in unified assembly mode.
+
+;; APPLE LOCAL end v7 support. Merge from mainline
;; FPA automaton.
(define_automaton "armfp")
@@ -101,7 +107,8 @@
[(set (match_operand:SF 0 "s_register_operand" "=f,f")
(plus:SF (match_operand:SF 1 "s_register_operand" "%f,f")
(match_operand:SF 2 "arm_float_add_operand" "fG,H")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"@
adf%?s\\t%0, %1, %2
suf%?s\\t%0, %1, #%N2"
@@ -113,7 +120,8 @@
[(set (match_operand:DF 0 "s_register_operand" "=f,f")
(plus:DF (match_operand:DF 1 "s_register_operand" "%f,f")
(match_operand:DF 2 "arm_float_add_operand" "fG,H")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"@
adf%?d\\t%0, %1, %2
suf%?d\\t%0, %1, #%N2"
@@ -126,7 +134,8 @@
(plus:DF (float_extend:DF
(match_operand:SF 1 "s_register_operand" "f,f"))
(match_operand:DF 2 "arm_float_add_operand" "fG,H")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"@
adf%?d\\t%0, %1, %2
suf%?d\\t%0, %1, #%N2"
@@ -139,7 +148,8 @@
(plus:DF (match_operand:DF 1 "s_register_operand" "f")
(float_extend:DF
(match_operand:SF 2 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"adf%?d\\t%0, %1, %2"
[(set_attr "type" "farith")
(set_attr "predicable" "yes")]
@@ -151,7 +161,8 @@
(match_operand:SF 1 "s_register_operand" "f"))
(float_extend:DF
(match_operand:SF 2 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"adf%?d\\t%0, %1, %2"
[(set_attr "type" "farith")
(set_attr "predicable" "yes")]
@@ -161,7 +172,8 @@
[(set (match_operand:SF 0 "s_register_operand" "=f,f")
(minus:SF (match_operand:SF 1 "arm_float_rhs_operand" "f,G")
(match_operand:SF 2 "arm_float_rhs_operand" "fG,f")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"@
suf%?s\\t%0, %1, %2
rsf%?s\\t%0, %2, %1"
@@ -172,7 +184,8 @@
[(set (match_operand:DF 0 "s_register_operand" "=f,f")
(minus:DF (match_operand:DF 1 "arm_float_rhs_operand" "f,G")
(match_operand:DF 2 "arm_float_rhs_operand" "fG,f")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"@
suf%?d\\t%0, %1, %2
rsf%?d\\t%0, %2, %1"
@@ -185,7 +198,8 @@
(minus:DF (float_extend:DF
(match_operand:SF 1 "s_register_operand" "f"))
(match_operand:DF 2 "arm_float_rhs_operand" "fG")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"suf%?d\\t%0, %1, %2"
[(set_attr "type" "farith")
(set_attr "predicable" "yes")]
@@ -196,7 +210,8 @@
(minus:DF (match_operand:DF 1 "arm_float_rhs_operand" "f,G")
(float_extend:DF
(match_operand:SF 2 "s_register_operand" "f,f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"@
suf%?d\\t%0, %1, %2
rsf%?d\\t%0, %2, %1"
@@ -210,7 +225,8 @@
(match_operand:SF 1 "s_register_operand" "f"))
(float_extend:DF
(match_operand:SF 2 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"suf%?d\\t%0, %1, %2"
[(set_attr "type" "farith")
(set_attr "predicable" "yes")]
@@ -220,7 +236,8 @@
[(set (match_operand:SF 0 "s_register_operand" "=f")
(mult:SF (match_operand:SF 1 "s_register_operand" "f")
(match_operand:SF 2 "arm_float_rhs_operand" "fG")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"fml%?s\\t%0, %1, %2"
[(set_attr "type" "ffmul")
(set_attr "predicable" "yes")]
@@ -230,7 +247,8 @@
[(set (match_operand:DF 0 "s_register_operand" "=f")
(mult:DF (match_operand:DF 1 "s_register_operand" "f")
(match_operand:DF 2 "arm_float_rhs_operand" "fG")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"muf%?d\\t%0, %1, %2"
[(set_attr "type" "fmul")
(set_attr "predicable" "yes")]
@@ -241,7 +259,8 @@
(mult:DF (float_extend:DF
(match_operand:SF 1 "s_register_operand" "f"))
(match_operand:DF 2 "arm_float_rhs_operand" "fG")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"muf%?d\\t%0, %1, %2"
[(set_attr "type" "fmul")
(set_attr "predicable" "yes")]
@@ -252,7 +271,8 @@
(mult:DF (match_operand:DF 1 "s_register_operand" "f")
(float_extend:DF
(match_operand:SF 2 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"muf%?d\\t%0, %1, %2"
[(set_attr "type" "fmul")
(set_attr "predicable" "yes")]
@@ -263,7 +283,8 @@
(mult:DF
(float_extend:DF (match_operand:SF 1 "s_register_operand" "f"))
(float_extend:DF (match_operand:SF 2 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"muf%?d\\t%0, %1, %2"
[(set_attr "type" "fmul")
(set_attr "predicable" "yes")]
@@ -275,7 +296,8 @@
[(set (match_operand:SF 0 "s_register_operand" "=f,f")
(div:SF (match_operand:SF 1 "arm_float_rhs_operand" "f,G")
(match_operand:SF 2 "arm_float_rhs_operand" "fG,f")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"@
fdv%?s\\t%0, %1, %2
frd%?s\\t%0, %2, %1"
@@ -287,7 +309,8 @@
[(set (match_operand:DF 0 "s_register_operand" "=f,f")
(div:DF (match_operand:DF 1 "arm_float_rhs_operand" "f,G")
(match_operand:DF 2 "arm_float_rhs_operand" "fG,f")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"@
dvf%?d\\t%0, %1, %2
rdf%?d\\t%0, %2, %1"
@@ -300,7 +323,8 @@
(div:DF (float_extend:DF
(match_operand:SF 1 "s_register_operand" "f"))
(match_operand:DF 2 "arm_float_rhs_operand" "fG")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"dvf%?d\\t%0, %1, %2"
[(set_attr "type" "fdivd")
(set_attr "predicable" "yes")]
@@ -311,7 +335,8 @@
(div:DF (match_operand:DF 1 "arm_float_rhs_operand" "fG")
(float_extend:DF
(match_operand:SF 2 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"rdf%?d\\t%0, %2, %1"
[(set_attr "type" "fdivd")
(set_attr "predicable" "yes")]
@@ -323,7 +348,8 @@
(match_operand:SF 1 "s_register_operand" "f"))
(float_extend:DF
(match_operand:SF 2 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"dvf%?d\\t%0, %1, %2"
[(set_attr "type" "fdivd")
(set_attr "predicable" "yes")]
@@ -333,7 +359,8 @@
[(set (match_operand:SF 0 "s_register_operand" "=f")
(mod:SF (match_operand:SF 1 "s_register_operand" "f")
(match_operand:SF 2 "arm_float_rhs_operand" "fG")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"rmf%?s\\t%0, %1, %2"
[(set_attr "type" "fdivs")
(set_attr "predicable" "yes")]
@@ -343,7 +370,8 @@
[(set (match_operand:DF 0 "s_register_operand" "=f")
(mod:DF (match_operand:DF 1 "s_register_operand" "f")
(match_operand:DF 2 "arm_float_rhs_operand" "fG")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"rmf%?d\\t%0, %1, %2"
[(set_attr "type" "fdivd")
(set_attr "predicable" "yes")]
@@ -354,7 +382,8 @@
(mod:DF (float_extend:DF
(match_operand:SF 1 "s_register_operand" "f"))
(match_operand:DF 2 "arm_float_rhs_operand" "fG")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"rmf%?d\\t%0, %1, %2"
[(set_attr "type" "fdivd")
(set_attr "predicable" "yes")]
@@ -365,7 +394,8 @@
(mod:DF (match_operand:DF 1 "s_register_operand" "f")
(float_extend:DF
(match_operand:SF 2 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"rmf%?d\\t%0, %1, %2"
[(set_attr "type" "fdivd")
(set_attr "predicable" "yes")]
@@ -377,7 +407,8 @@
(match_operand:SF 1 "s_register_operand" "f"))
(float_extend:DF
(match_operand:SF 2 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"rmf%?d\\t%0, %1, %2"
[(set_attr "type" "fdivd")
(set_attr "predicable" "yes")]
@@ -386,7 +417,8 @@
(define_insn "*negsf2_fpa"
[(set (match_operand:SF 0 "s_register_operand" "=f")
(neg:SF (match_operand:SF 1 "s_register_operand" "f")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"mnf%?s\\t%0, %1"
[(set_attr "type" "ffarith")
(set_attr "predicable" "yes")]
@@ -395,7 +427,8 @@
(define_insn "*negdf2_fpa"
[(set (match_operand:DF 0 "s_register_operand" "=f")
(neg:DF (match_operand:DF 1 "s_register_operand" "f")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"mnf%?d\\t%0, %1"
[(set_attr "type" "ffarith")
(set_attr "predicable" "yes")]
@@ -405,7 +438,8 @@
[(set (match_operand:DF 0 "s_register_operand" "=f")
(neg:DF (float_extend:DF
(match_operand:SF 1 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"mnf%?d\\t%0, %1"
[(set_attr "type" "ffarith")
(set_attr "predicable" "yes")]
@@ -414,7 +448,8 @@
(define_insn "*abssf2_fpa"
[(set (match_operand:SF 0 "s_register_operand" "=f")
(abs:SF (match_operand:SF 1 "s_register_operand" "f")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"abs%?s\\t%0, %1"
[(set_attr "type" "ffarith")
(set_attr "predicable" "yes")]
@@ -423,7 +458,8 @@
(define_insn "*absdf2_fpa"
[(set (match_operand:DF 0 "s_register_operand" "=f")
(abs:DF (match_operand:DF 1 "s_register_operand" "f")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"abs%?d\\t%0, %1"
[(set_attr "type" "ffarith")
(set_attr "predicable" "yes")]
@@ -433,7 +469,8 @@
[(set (match_operand:DF 0 "s_register_operand" "=f")
(abs:DF (float_extend:DF
(match_operand:SF 1 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"abs%?d\\t%0, %1"
[(set_attr "type" "ffarith")
(set_attr "predicable" "yes")]
@@ -442,7 +479,8 @@
(define_insn "*sqrtsf2_fpa"
[(set (match_operand:SF 0 "s_register_operand" "=f")
(sqrt:SF (match_operand:SF 1 "s_register_operand" "f")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"sqt%?s\\t%0, %1"
[(set_attr "type" "float_em")
(set_attr "predicable" "yes")]
@@ -451,7 +489,8 @@
(define_insn "*sqrtdf2_fpa"
[(set (match_operand:DF 0 "s_register_operand" "=f")
(sqrt:DF (match_operand:DF 1 "s_register_operand" "f")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"sqt%?d\\t%0, %1"
[(set_attr "type" "float_em")
(set_attr "predicable" "yes")]
@@ -461,7 +500,8 @@
[(set (match_operand:DF 0 "s_register_operand" "=f")
(sqrt:DF (float_extend:DF
(match_operand:SF 1 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"sqt%?d\\t%0, %1"
[(set_attr "type" "float_em")
(set_attr "predicable" "yes")]
@@ -470,7 +510,8 @@
(define_insn "*floatsisf2_fpa"
[(set (match_operand:SF 0 "s_register_operand" "=f")
(float:SF (match_operand:SI 1 "s_register_operand" "r")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"flt%?s\\t%0, %1"
[(set_attr "type" "r_2_f")
(set_attr "predicable" "yes")]
@@ -479,7 +520,8 @@
(define_insn "*floatsidf2_fpa"
[(set (match_operand:DF 0 "s_register_operand" "=f")
(float:DF (match_operand:SI 1 "s_register_operand" "r")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"flt%?d\\t%0, %1"
[(set_attr "type" "r_2_f")
(set_attr "predicable" "yes")]
@@ -488,7 +530,8 @@
(define_insn "*fix_truncsfsi2_fpa"
[(set (match_operand:SI 0 "s_register_operand" "=r")
(fix:SI (fix:SF (match_operand:SF 1 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"fix%?z\\t%0, %1"
[(set_attr "type" "f_2_r")
(set_attr "predicable" "yes")]
@@ -497,7 +540,8 @@
(define_insn "*fix_truncdfsi2_fpa"
[(set (match_operand:SI 0 "s_register_operand" "=r")
(fix:SI (fix:DF (match_operand:DF 1 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"fix%?z\\t%0, %1"
[(set_attr "type" "f_2_r")
(set_attr "predicable" "yes")]
@@ -507,7 +551,8 @@
[(set (match_operand:SF 0 "s_register_operand" "=f")
(float_truncate:SF
(match_operand:DF 1 "s_register_operand" "f")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"mvf%?s\\t%0, %1"
[(set_attr "type" "ffarith")
(set_attr "predicable" "yes")]
@@ -516,7 +561,8 @@
(define_insn "*extendsfdf2_fpa"
[(set (match_operand:DF 0 "s_register_operand" "=f")
(float_extend:DF (match_operand:SF 1 "s_register_operand" "f")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"mvf%?d\\t%0, %1"
[(set_attr "type" "ffarith")
(set_attr "predicable" "yes")]
@@ -556,13 +602,14 @@
&& TARGET_HARD_FLOAT && TARGET_FPA
&& (GET_CODE (operands[0]) != MEM
|| register_operand (operands[1], DFmode))"
+;; APPLE LOCAL begin v7 support. Merge from mainline
"*
{
switch (which_alternative)
{
default:
- case 0: return \"ldm%?ia\\t%m1, %M0\\t%@ double\";
- case 1: return \"stm%?ia\\t%m0, %M1\\t%@ double\";
+ case 0: return \"ldm%(ia%)\\t%m1, %M0\\t%@ double\";
+ case 1: return \"stm%(ia%)\\t%m0, %M1\\t%@ double\";
case 2: return \"#\";
case 3: case 4: return output_move_double (operands);
case 5: return \"mvf%?d\\t%0, %1\";
@@ -574,6 +621,7 @@
}
}
"
+;; APPLE LOCAL end v7 support. Merge from mainline
[(set_attr "length" "4,4,8,8,8,4,4,4,4,8,8")
(set_attr "predicable" "yes")
(set_attr "type"
@@ -609,11 +657,105 @@
(set_attr "type" "ffarith,f_load,f_store")]
)
+;; APPLE LOCAL begin v7 support. Merge from mainline
+;; stfs/ldfs always use a conditional infix. This works around the
+;; ambiguity between "stf pl s" and "stf p ls".
+(define_insn "*thumb2_movsf_fpa"
+ [(set (match_operand:SF 0 "nonimmediate_operand" "=f,f,f, m,f,r,r,r, m")
+ (match_operand:SF 1 "general_operand" "fG,H,mE,f,r,f,r,mE,r"))]
+ "TARGET_THUMB2
+ && TARGET_HARD_FLOAT && TARGET_FPA
+ && (GET_CODE (operands[0]) != MEM
+ || register_operand (operands[1], SFmode))"
+ "@
+ mvf%?s\\t%0, %1
+ mnf%?s\\t%0, #%N1
+ ldf%?s\\t%0, %1
+ stf%?s\\t%1, %0
+ str%?\\t%1, [%|sp, #-4]!\;ldf%?s\\t%0, [%|sp], #4
+ stf%?s\\t%1, [%|sp, #-4]!\;ldr%?\\t%0, [%|sp], #4
+ mov%?\\t%0, %1 @bar
+ ldr%?\\t%0, %1\\t%@ float
+ str%?\\t%1, %0\\t%@ float"
+ [(set_attr "length" "4,4,4,4,8,8,4,4,4")
+ (set_attr "ce_count" "1,1,1,1,2,2,1,1,1")
+ (set_attr "predicable" "yes")
+ (set_attr "type"
+ "ffarith,ffarith,f_load,f_store,r_mem_f,f_mem_r,*,load1,store1")
+ (set_attr "pool_range" "*,*,1024,*,*,*,*,4096,*")
+ (set_attr "neg_pool_range" "*,*,1012,*,*,*,*,0,*")]
+)
+
+;; Not predicable because we don't know the number of instructions.
+(define_insn "*thumb2_movdf_fpa"
+ [(set (match_operand:DF 0 "nonimmediate_operand"
+ "=r,Q,r,m,r, f, f,f, m,!f,!r")
+ (match_operand:DF 1 "general_operand"
+ "Q, r,r,r,mF,fG,H,mF,f,r, f"))]
+ "TARGET_THUMB2
+ && TARGET_HARD_FLOAT && TARGET_FPA
+ && (GET_CODE (operands[0]) != MEM
+ || register_operand (operands[1], DFmode))"
+ "*
+ {
+ switch (which_alternative)
+ {
+ default:
+ case 0: return \"ldm%(ia%)\\t%m1, %M0\\t%@ double\";
+ case 1: return \"stm%(ia%)\\t%m0, %M1\\t%@ double\";
+ case 2: case 3: case 4: return output_move_double (operands);
+ case 5: return \"mvf%?d\\t%0, %1\";
+ case 6: return \"mnf%?d\\t%0, #%N1\";
+ case 7: return \"ldf%?d\\t%0, %1\";
+ case 8: return \"stf%?d\\t%1, %0\";
+ case 9: return output_mov_double_fpa_from_arm (operands);
+ case 10: return output_mov_double_arm_from_fpa (operands);
+ }
+ }
+ "
+ [(set_attr "length" "4,4,8,8,8,4,4,4,4,8,8")
+ (set_attr "type"
+ "load1,store2,*,store2,load1,ffarith,ffarith,f_load,f_store,r_mem_f,f_mem_r")
+ (set_attr "pool_range" "*,*,*,*,4092,*,*,1024,*,*,*")
+ (set_attr "neg_pool_range" "*,*,*,*,0,*,*,1020,*,*,*")]
+)
+
+;; Saving and restoring the floating point registers in the prologue should
+;; be done in XFmode, even though we don't support that for anything else
+;; (Well, strictly it's 'internal representation', but that's effectively
+;; XFmode).
+;; Not predicable because we don't know the number of instructions.
+
+(define_insn "*thumb2_movxf_fpa"
+ [(set (match_operand:XF 0 "nonimmediate_operand" "=f,f,f,m,f,r,r")
+ (match_operand:XF 1 "general_operand" "fG,H,m,f,r,f,r"))]
+ "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_FPA && reload_completed"
+ "*
+ switch (which_alternative)
+ {
+ default:
+ case 0: return \"mvf%?e\\t%0, %1\";
+ case 1: return \"mnf%?e\\t%0, #%N1\";
+ case 2: return \"ldf%?e\\t%0, %1\";
+ case 3: return \"stf%?e\\t%1, %0\";
+ case 4: return output_mov_long_double_fpa_from_arm (operands);
+ case 5: return output_mov_long_double_arm_from_fpa (operands);
+ case 6: return output_mov_long_double_arm_from_arm (operands);
+ }
+ "
+ [(set_attr "length" "4,4,4,4,8,8,12")
+ (set_attr "type" "ffarith,ffarith,f_load,f_store,r_mem_f,f_mem_r,*")
+ (set_attr "pool_range" "*,*,1024,*,*,*,*")
+ (set_attr "neg_pool_range" "*,*,1004,*,*,*,*")]
+)
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
+
(define_insn "*cmpsf_fpa"
[(set (reg:CCFP CC_REGNUM)
(compare:CCFP (match_operand:SF 0 "s_register_operand" "f,f")
(match_operand:SF 1 "arm_float_add_operand" "fG,H")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"@
cmf%?\\t%0, %1
cnf%?\\t%0, #%N1"
@@ -625,7 +767,8 @@
[(set (reg:CCFP CC_REGNUM)
(compare:CCFP (match_operand:DF 0 "s_register_operand" "f,f")
(match_operand:DF 1 "arm_float_add_operand" "fG,H")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"@
cmf%?\\t%0, %1
cnf%?\\t%0, #%N1"
@@ -638,7 +781,8 @@
(compare:CCFP (float_extend:DF
(match_operand:SF 0 "s_register_operand" "f,f"))
(match_operand:DF 1 "arm_float_add_operand" "fG,H")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"@
cmf%?\\t%0, %1
cnf%?\\t%0, #%N1"
@@ -651,7 +795,8 @@
(compare:CCFP (match_operand:DF 0 "s_register_operand" "f")
(float_extend:DF
(match_operand:SF 1 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"cmf%?\\t%0, %1"
[(set_attr "conds" "set")
(set_attr "type" "f_2_r")]
@@ -661,7 +806,8 @@
[(set (reg:CCFPE CC_REGNUM)
(compare:CCFPE (match_operand:SF 0 "s_register_operand" "f,f")
(match_operand:SF 1 "arm_float_add_operand" "fG,H")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"@
cmf%?e\\t%0, %1
cnf%?e\\t%0, #%N1"
@@ -673,7 +819,8 @@
[(set (reg:CCFPE CC_REGNUM)
(compare:CCFPE (match_operand:DF 0 "s_register_operand" "f,f")
(match_operand:DF 1 "arm_float_add_operand" "fG,H")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"@
cmf%?e\\t%0, %1
cnf%?e\\t%0, #%N1"
@@ -686,7 +833,8 @@
(compare:CCFPE (float_extend:DF
(match_operand:SF 0 "s_register_operand" "f,f"))
(match_operand:DF 1 "arm_float_add_operand" "fG,H")))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"@
cmf%?e\\t%0, %1
cnf%?e\\t%0, #%N1"
@@ -699,7 +847,8 @@
(compare:CCFPE (match_operand:DF 0 "s_register_operand" "f")
(float_extend:DF
(match_operand:SF 1 "s_register_operand" "f"))))]
- "TARGET_ARM && TARGET_HARD_FLOAT && TARGET_FPA"
+;; APPLE LOCAL v7 support. Merge from mainline
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_FPA"
"cmf%?e\\t%0, %1"
[(set_attr "conds" "set")
(set_attr "type" "f_2_r")]
@@ -748,3 +897,49 @@
(set_attr "type" "ffarith")
(set_attr "conds" "use")]
)
+
+;; APPLE LOCAL begin v7 support. Merge from mainline
+(define_insn "*thumb2_movsfcc_fpa"
+ [(set (match_operand:SF 0 "s_register_operand" "=f,f,f,f,f,f,f,f")
+ (if_then_else:SF
+ (match_operator 3 "arm_comparison_operator"
+ [(match_operand 4 "cc_register" "") (const_int 0)])
+ (match_operand:SF 1 "arm_float_add_operand" "0,0,fG,H,fG,fG,H,H")
+ (match_operand:SF 2 "arm_float_add_operand" "fG,H,0,0,fG,H,fG,H")))]
+ "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_FPA"
+ "@
+ it\\t%D3\;mvf%D3s\\t%0, %2
+ it\\t%D3\;mnf%D3s\\t%0, #%N2
+ it\\t%d3\;mvf%d3s\\t%0, %1
+ it\\t%d3\;mnf%d3s\\t%0, #%N1
+ ite\\t%d3\;mvf%d3s\\t%0, %1\;mvf%D3s\\t%0, %2
+ ite\\t%d3\;mvf%d3s\\t%0, %1\;mnf%D3s\\t%0, #%N2
+ ite\\t%d3\;mnf%d3s\\t%0, #%N1\;mvf%D3s\\t%0, %2
+ ite\\t%d3\;mnf%d3s\\t%0, #%N1\;mnf%D3s\\t%0, #%N2"
+ [(set_attr "length" "6,6,6,6,10,10,10,10")
+ (set_attr "type" "ffarith")
+ (set_attr "conds" "use")]
+)
+
+(define_insn "*thumb2_movdfcc_fpa"
+ [(set (match_operand:DF 0 "s_register_operand" "=f,f,f,f,f,f,f,f")
+ (if_then_else:DF
+ (match_operator 3 "arm_comparison_operator"
+ [(match_operand 4 "cc_register" "") (const_int 0)])
+ (match_operand:DF 1 "arm_float_add_operand" "0,0,fG,H,fG,fG,H,H")
+ (match_operand:DF 2 "arm_float_add_operand" "fG,H,0,0,fG,H,fG,H")))]
+ "TARGET_THUMB2 && TARGET_HARD_FLOAT && TARGET_FPA"
+ "@
+ it\\t%D3\;mvf%D3d\\t%0, %2
+ it\\t%D3\;mnf%D3d\\t%0, #%N2
+ it\\t%d3\;mvf%d3d\\t%0, %1
+ it\\t%d3\;mnf%d3d\\t%0, #%N1
+ ite\\t%d3\;mvf%d3d\\t%0, %1\;mvf%D3d\\t%0, %2
+ ite\\t%d3\;mvf%d3d\\t%0, %1\;mnf%D3d\\t%0, #%N2
+ ite\\t%d3\;mnf%d3d\\t%0, #%N1\;mvf%D3d\\t%0, %2
+ ite\\t%d3\;mnf%d3d\\t%0, #%N1\;mnf%D3d\\t%0, #%N2"
+ [(set_attr "length" "6,6,6,6,10,10,10,10")
+ (set_attr "type" "ffarith")
+ (set_attr "conds" "use")]
+)
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
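For illustration (not part of the patch): these Thumb-2 variants wrap each conditional FPA move in an explicit it/ite block, since Thumb-2 conditional execution requires one. The patterns implement ordinary floating-point conditional selects; a made-up example of source that could be matched on an FPA target:

    double fsel (int c, double a, double b)
    {
      return c ? a : b;   /* if_then_else:DF, as in *thumb2_movdfcc_fpa */
    }
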
Added: llvm-gcc-4.2/trunk/gcc/config/arm/gen-darwin-multilib-exceptions.sh
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/gen-darwin-multilib-exceptions.sh?rev=76781&view=auto
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/gen-darwin-multilib-exceptions.sh (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/gen-darwin-multilib-exceptions.sh Wed Jul 22 15:36:27 2009
@@ -0,0 +1,21 @@
+#!/bin/sh
+# APPLE LOCAL file 6611402 configurable multilib architectures
+# This recursive function generates all of the pairwise combinations from a
+# list of multilib options. The result is suitable for a multilib
+# exceptions list.
+EXCEPTIONS=
+function gen_exceptions()
+{
+ if [ $# == 1 ] ; then
+ return
+ fi
+ local opt=$1
+ shift 1
+ for opt2 in $@ ; do
+ EXCEPTIONS+="*$opt*/*$opt2* "
+ done
+ gen_exceptions $@
+}
+if [ $# == 0 ] ; then exit ; fi
+gen_exceptions $@
+echo $EXCEPTIONS
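For illustration (not part of the patch): given the options "armv6 armv7 thumb", the script above prints "*armv6*/*armv7* *armv6*/*thumb* *armv7*/*thumb*", i.e. every unordered pair, which is the form a MULTILIB_EXCEPTIONS list expects. (Note it relies on bash extensions, the "function" keyword and the "+=" append operator, under a /bin/sh shebang, which works on Darwin where /bin/sh is bash.) A C rendering of the same pairwise expansion, in case the recursion is hard to follow:

    /* Sketch only: equivalent pairwise expansion in C.  */
    #include <stdio.h>

    int
    main (int argc, char **argv)
    {
      for (int i = 1; i < argc; i++)
        for (int j = i + 1; j < argc; j++)
          printf ("*%s*/*%s* ", argv[i], argv[j]);
      putchar ('\n');
      return 0;
    }
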
Added: llvm-gcc-4.2/trunk/gcc/config/arm/hwdiv.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/hwdiv.md?rev=76781&view=auto
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/hwdiv.md (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/hwdiv.md Wed Jul 22 15:36:27 2009
@@ -0,0 +1,42 @@
+;; APPLE LOCAL file v7 support. Merge from Codesourcery
+;; ARM instruction patterns for hardware division
+;; Copyright (C) 2005, 2006, 2007 Free Software Foundation, Inc.
+;; Written by CodeSourcery, LLC.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING. If not, write to
+;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
+;; Boston, MA 02110-1301, USA.
+
+(define_insn "divsi3"
+ [(set (match_operand:SI 0 "s_register_operand" "=r")
+ (div:SI (match_operand:SI 1 "s_register_operand" "r")
+ (match_operand:SI 2 "s_register_operand" "r")))]
+ "arm_arch_hwdiv"
+ "sdiv%?\t%0, %1, %2"
+ [(set_attr "predicable" "yes")
+ (set_attr "insn" "sdiv")]
+)
+
+(define_insn "udivsi3"
+ [(set (match_operand:SI 0 "s_register_operand" "=r")
+ (udiv:SI (match_operand:SI 1 "s_register_operand" "r")
+ (match_operand:SI 2 "s_register_operand" "r")))]
+ "arm_arch_hwdiv"
+ "udiv%?\t%0, %1, %2"
+ [(set_attr "predicable" "yes")
+ (set_attr "insn" "udiv")]
+)
+
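For illustration (not part of the patch): arm_arch_hwdiv is only set for cores with the hardware-divide extension (of the CPUs added to the driver specs above, Cortex-R4 and Cortex-M3 qualify, Cortex-A8 does not). On those cores, the two patterns let ordinary C division compile to a single instruction instead of a libgcc call; function names here are made up:

    int      sdiv_example (int a, int b)           { return a / b; }  /* sdiv r0, r0, r1 */
    unsigned udiv_example (unsigned a, unsigned b) { return a / b; }  /* udiv r0, r0, r1 */

Without the patterns (or without hardware divide), the same code goes through __aeabi_idiv / __aeabi_uidiv.
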
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-df.S
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-df.S?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-df.S (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-df.S Wed Jul 22 15:36:27 2009
@@ -91,23 +91,28 @@
/* APPLE LOCAL ARM MACH assembler */
ARM_FUNC_ALIAS(aeabi_dadd, adddf3)
-1: stmfd sp!, {r4, r5, lr}
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+1: do_push {r4, r5, lr}
@ Look for zeroes, equal values, INF, or NAN.
- mov r4, xh, lsl #1
- mov r5, yh, lsl #1
+ shift1 lsl, r4, xh, #1
+ shift1 lsl, r5, yh, #1
teq r4, r5
+ do_it eq
teqeq xl, yl
- orrnes ip, r4, xl
- orrnes ip, r5, yl
- mvnnes ip, r4, asr #21
- mvnnes ip, r5, asr #21
+ do_it ne, ttt
+ COND(orr,s,ne) ip, r4, xl
+ COND(orr,s,ne) ip, r5, yl
+ COND(mvn,s,ne) ip, r4, asr #21
+ COND(mvn,s,ne) ip, r5, asr #21
beq LSYM(Lad_s)
@ Compute exponent difference. Make largest exponent in r4,
@ corresponding arg in xh-xl, and positive exponent difference in r5.
- mov r4, r4, lsr #21
+ shift1 lsr, r4, r4, #21
rsbs r5, r4, r5, lsr #21
+ do_it lt
+ /* APPLE LOCAL end v7 support. Merge from mainline */
rsblt r5, r5, #0
ble 1f
add r4, r4, r5
@@ -122,6 +127,8 @@
@ already in xh-xl. We need up to 54 bit to handle proper rounding
@ of 0x1p54 - 1.1.
cmp r5, #54
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it hi
/* APPLE LOCAL ARM MACH assembler */
RETLDM2(hi, r4, r5)
@@ -131,15 +138,27 @@
mov ip, #0x00100000
orr xh, ip, xh, lsr #12
beq 1f
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+ negs xl, xl
+ sbc xh, xh, xh, lsl #1
+#else
rsbs xl, xl, #0
rsc xh, xh, #0
+#endif
1:
tst yh, #0x80000000
mov yh, yh, lsl #12
orr yh, ip, yh, lsr #12
beq 1f
+#if defined(__thumb2__)
+ negs yl, yl
+ sbc yh, yh, yh, lsl #1
+#else
rsbs yl, yl, #0
rsc yh, yh, #0
+#endif
+ /* APPLE LOCAL end v7 support. Merge from mainline */
1:
@ If exponent == difference, one or both args were denormalized.
@ Since this is not common case, rescale them off line.
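
For illustration (not part of the patch): the __thumb2__ blocks above exist because Thumb-2 has no rsc (reverse subtract with carry), so the two-word negation previously written as rsbs/rsc is re-expressed as "negs xl, xl" followed by "sbc xh, xh, xh, lsl #1". negs sets the carry flag to (xl == 0), and sbc computes xh - (xh << 1) - (1 - C), i.e. -xh minus the borrow from the low word, which is the value the old sequence produced. In C terms:

    /* Sketch of the two-word negation the Thumb-2 sequence implements.  */
    #include <stdint.h>

    static void
    negate64 (uint32_t *hi, uint32_t *lo)
    {
      uint32_t borrow = (*lo != 0);  /* negs: C = (lo == 0), borrow = !C */
      *lo = 0u - *lo;
      *hi = 0u - *hi - borrow;       /* sbc xh, xh, xh, lsl #1 */
    }
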
@@ -153,27 +172,37 @@
@ Shift yh-yl right per r5, add to xh-xl, keep leftover bits into ip.
rsbs lr, r5, #32
blt 1f
- mov ip, yl, lsl lr
- adds xl, xl, yl, lsr r5
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ shift1 lsl, ip, yl, lr
+ shiftop adds, xl, xl, yl, lsr, r5, yl
adc xh, xh, #0
- adds xl, xl, yh, lsl lr
- adcs xh, xh, yh, asr r5
+ shiftop adds, xl, xl, yh, lsl, lr, yl
+ shiftop adcs, xh, xh, yh, asr, r5, yh
b 2f
1: sub r5, r5, #32
add lr, lr, #32
cmp yl, #1
- mov ip, yh, lsl lr
+ shift1 lsl,ip, yh, lr
+ do_it cs
orrcs ip, ip, #2 @ 2 not 1, to allow lsr #1 later
- adds xl, xl, yh, asr r5
+ shiftop adds, xl, xl, yh, asr, r5, yh
adcs xh, xh, yh, asr #31
2:
@ We now have a result in xh-xl-ip.
@ Keep absolute value in xh-xl-ip, sign in r5 (the n bit was set above)
and r5, xh, #0x80000000
bpl LSYM(Lad_p)
+#if defined(__thumb2__)
+ mov lr, #0
+ negs ip, ip
+ sbcs xl, lr, xl
+ sbc xh, lr, xh
+#else
rsbs ip, ip, #0
rscs xl, xl, #0
rsc xh, xh, #0
+#endif
+ /* APPLE LOCAL end v7 support. Merge from mainline */
@ Determine how to normalize the result.
LSYM(Lad_p):
@@ -199,7 +228,10 @@
@ Pack final result together.
LSYM(Lad_e):
cmp ip, #0x80000000
- moveqs ip, xl, lsr #1
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it eq
+ COND(mov,s,eq) ip, xl, lsr #1
+ /* APPLE LOCAL end v7 support. Merge from mainline */
adcs xl, xl, #0
adc xh, xh, r4, lsl #20
orr xh, xh, r5
@@ -243,9 +275,13 @@
#else
teq xh, #0
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq, t
moveq xh, xl
moveq xl, #0
clz r3, xh
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq
addeq r3, r3, #32
sub r3, r3, #11
@@ -261,20 +297,31 @@
@ since a register switch happened above.
add ip, r2, #20
rsb r2, r2, #12
- mov xl, xh, lsl ip
- mov xh, xh, lsr r2
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ shift1 lsl, xl, xh, ip
+ shift1 lsr, xh, xh, r2
b 3f
@ actually shift value left 1 to 20 bits, which might also represent
@ 32 to 52 bits if counting the register switch that happened earlier.
1: add r2, r2, #20
-2: rsble ip, r2, #32
- mov xh, xh, lsl r2
+2: do_it le
+ rsble ip, r2, #32
+ shift1 lsl, xh, xh, r2
+#if defined(__thumb2__)
+ lsr ip, xl, ip
+ itt le
+ orrle xh, xh, ip
+ lslle xl, xl, r2
+#else
orrle xh, xh, xl, lsr ip
movle xl, xl, lsl r2
+#endif
@ adjust exponent accordingly.
3: subs r4, r4, r3
+ do_it ge, tt
+ /* APPLE LOCAL end v7 support. Merge from mainline */
addge xh, xh, r4, lsl #20
orrge xh, xh, r5
/* APPLE LOCAL ARM MACH assembler */
@@ -291,9 +338,11 @@
@ shift result right of 1 to 20 bits, sign is in r5.
add r4, r4, #20
rsb r2, r4, #32
- mov xl, xl, lsr r4
- orr xl, xl, xh, lsl r2
- orr xh, r5, xh, lsr r4
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ shift1 lsr, xl, xl, r4
+ shiftop orr, xl, xl, xh, lsl, r2, yh
+ shiftop orr, xh, r5, xh, lsr, r4, yh
+ /* APPLE LOCAL end v7 support. Merge from mainline */
/* APPLE LOCAL ARM MACH assembler */
RETLDM1(r4, r5)
@@ -301,15 +350,18 @@
@ a register switch from xh to xl.
1: rsb r4, r4, #12
rsb r2, r4, #32
- mov xl, xl, lsr r2
- orr xl, xl, xh, lsl r4
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ shift1 lsr, xl, xl, r2
+ shiftop orr, xl, xl, xh, lsl, r4, yh
+ /* APPLE LOCAL end v7 support. Merge from mainline */
mov xh, r5
/* APPLE LOCAL ARM MACH assembler */
RETLDM1(r4, r5)
@ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch
@ from xh to xl.
-2: mov xl, xh, lsr r4
+ /* APPLE LOCAL v7 support. Merge from mainline */
+2: shift1 lsr, xl, xh, r4
mov xh, r5
/* APPLE LOCAL ARM MACH assembler */
RETLDM1(r4, r5)
@@ -319,6 +371,8 @@
LSYM(Lad_d):
teq r4, #0
eor yh, yh, #0x00100000
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq, te
eoreq xh, xh, #0x00100000
addeq r4, r4, #1
subne r5, r5, #1
@@ -327,15 +381,20 @@
LSYM(Lad_s):
mvns ip, r4, asr #21
- mvnnes ip, r5, asr #21
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne
+ COND(mvn,s,ne) ip, r5, asr #21
beq LSYM(Lad_i)
teq r4, r5
+ do_it eq
teqeq xl, yl
beq 1f
@ Result is x + 0.0 = x or 0.0 + y = y.
- orrs ip, r4, xl
+ orrs ip, r4, xl
+ do_it eq, t
+ /* APPLE LOCAL end v7 support. Merge from mainline */
moveq xh, yh
moveq xl, yl
/* APPLE LOCAL ARM MACH assembler */
@@ -344,6 +403,8 @@
1: teq xh, yh
@ Result is x - x = 0.
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it ne, tt
movne xh, #0
movne xl, #0
/* APPLE LOCAL ARM MACH assembler */
@@ -354,10 +415,14 @@
bne 2f
movs xl, xl, lsl #1
adcs xh, xh, xh
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it cs
orrcs xh, xh, #0x80000000
/* APPLE LOCAL ARM MACH assembler */
RETLDM1(r4, r5)
2: adds r4, r4, #(2 << 21)
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it cc, t
addcc xh, xh, #(1 << 20)
/* APPLE LOCAL ARM MACH assembler */
RETLDM2(cc, r4, r5)
@@ -379,13 +444,18 @@
@ otherwise return xh-xl (which is INF or -INF)
LSYM(Lad_i):
mvns ip, r4, asr #21
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne, te
movne xh, yh
movne xl, yl
- mvneqs ip, r5, asr #21
+ COND(mvn,s,eq) ip, r5, asr #21
+ do_it ne, t
movne yh, xh
movne yl, xl
orrs r4, xl, xh, lsl #12
- orreqs r5, yl, yh, lsl #12
+ do_it eq, te
+ COND(orr,s,eq) r5, yl, yh, lsl #12
+ /* APPLE LOCAL end v7 support. Merge from mainline */
teqeq xh, yh
orrne xh, xh, #0x00080000 @ quiet NAN
/* APPLE LOCAL ARM MACH assembler */
@@ -401,9 +471,12 @@
ARM_FUNC_ALIAS(aeabi_ui2d,floatunsidf)
teq r0, #0
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq, t
moveq r1, #0
RETc(eq)
- stmfd sp!, {r4, r5, lr}
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_push {r4, r5, lr}
mov r4, #0x400 @ initial exponent
add r4, r4, #(52-1 - 1)
mov r5, #0 @ sign bit is 0
@@ -423,12 +496,17 @@
ARM_FUNC_ALIAS(aeabi_i2d,floatsidf)
teq r0, #0
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq, t
moveq r1, #0
RETc(eq)
- stmfd sp!, {r4, r5, lr}
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_push {r4, r5, lr}
mov r4, #0x400 @ initial exponent
add r4, r4, #(52-1 - 1)
ands r5, r0, #0x80000000 @ sign bit in r5
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it mi
rsbmi r0, r0, #0 @ absolute value
/* APPLE LOCAL begin ARM MACH assembler */
#if !defined(__VFP_FP__) || defined(__ARMEB__)
@@ -449,17 +527,21 @@
mov xh, r2, asr #3 @ stretch exponent
mov xh, xh, rrx @ retrieve sign bit
mov xl, r2, lsl #28 @ retrieve remaining bits
- andnes r3, r2, #0xff000000 @ isolate exponent
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne, ttt
+ COND(and,s,ne) r3, r2, #0xff000000 @ isolate exponent
teqne r3, #0xff000000 @ if not 0, check if INF or NAN
eorne xh, xh, #0x38000000 @ fixup exponent otherwise.
RETc(ne) @ and return it.
teq r2, #0 @ if actually 0
+ do_it ne, e
teqne r3, #0xff000000 @ or INF or NAN
RETc(eq) @ we are done already.
@ value was denormalized. We can normalize it now.
- stmfd sp!, {r4, r5, lr}
+ do_push {r4, r5, lr}
+ /* APPLE LOCAL end v7 support. Merge from mainline */
mov r4, #0x380 @ setup corresponding exponent
and r5, xh, #0x80000000 @ move sign bit in r5
bic xh, xh, #0x80000000
@@ -474,7 +556,12 @@
orrs r2, r0, r1
#if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it eq, t
mvfeqd f0, #0.0
+#else
+ do_it eq
+ /* APPLE LOCAL end v7 support. Merge from mainline */
#endif
RETc(eq)
@@ -483,9 +570,11 @@
@ we can return the result in f0 as well as in r0/r1 for backwards
@ compatibility.
adr ip, LSYM(f0_ret)
- stmfd sp!, {r4, r5, ip, lr}
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_push {r4, r5, ip, lr}
#else
- stmfd sp!, {r4, r5, lr}
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_push {r4, r5, lr}
#endif
mov r5, #0
@@ -497,7 +586,11 @@
orrs r2, r0, r1
#if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it eq, t
mvfeqd f0, #0.0
+#else
+ do_it eq
#endif
RETc(eq)
@@ -506,15 +599,21 @@
@ we can return the result in f0 as well as in r0/r1 for backwards
@ compatibility.
adr ip, LSYM(f0_ret)
- stmfd sp!, {r4, r5, ip, lr}
+ do_push {r4, r5, ip, lr}
#else
- stmfd sp!, {r4, r5, lr}
+ do_push {r4, r5, lr}
#endif
ands r5, ah, #0x80000000 @ sign bit in r5
bpl 2f
+#if defined(__thumb2__)
+ negs al, al
+ sbc ah, ah, ah, lsl #1
+#else
rsbs al, al, #0
rsc ah, ah, #0
+#endif
+ /* APPLE LOCAL end v7 support. Merge from mainline */
2:
mov r4, #0x400 @ initial exponent
add r4, r4, #(52-1 - 1)
@@ -534,16 +633,20 @@
@ The value is too big. Scale it down a bit...
mov r2, #3
movs ip, ip, lsr #3
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne
addne r2, r2, #3
movs ip, ip, lsr #3
+ do_it ne
addne r2, r2, #3
add r2, r2, ip, lsr #3
rsb r3, r2, #32
- mov ip, xl, lsl r3
- mov xl, xl, lsr r2
- orr xl, xl, xh, lsl r3
- mov xh, xh, lsr r2
+ shift1 lsl, ip, xl, r3
+ shift1 lsr, xl, xl, r2
+ shiftop orr, xl, xl, xh, lsl, r3, lr
+ shift1 lsr, xh, xh, r2
+ /* APPLE LOCAL end v7 support. Merge from mainline */
add r4, r4, r2
b LSYM(Lad_p)
@@ -552,7 +655,8 @@
@ Legacy code expects the result to be returned in f0. Copy it
@ there as well.
LSYM(f0_ret):
- stmfd sp!, {r0, r1}
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_push {r0, r1}
ldfd f0, [sp], #8
RETLDM
@@ -570,13 +674,17 @@
ARM_FUNC_START muldf3
/* APPLE LOCAL ARM MACH assembler */
ARM_FUNC_ALIAS(aeabi_dmul,muldf3)
- stmfd sp!, {r4, r5, r6, lr}
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_push {r4, r5, r6, lr}
@ Mask out exponents, trap any zero/denormal/INF/NAN.
mov ip, #0xff
orr ip, ip, #0x700
ands r4, ip, xh, lsr #20
- andnes r5, ip, yh, lsr #20
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne, tte
+ COND(and,s,ne) r5, ip, yh, lsr #20
+ /* APPLE LOCAL end v7 support. Merge from mainline */
teqne r4, ip
teqne r5, ip
bleq LSYM(Lml_s)
@@ -592,7 +700,10 @@
bic xh, xh, ip, lsl #21
bic yh, yh, ip, lsl #21
orrs r5, xl, xh, lsl #12
- orrnes r5, yl, yh, lsl #12
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne
+ COND(orr,s,ne) r5, yl, yh, lsl #12
+ /* APPLE LOCAL end v7 support. Merge from mainline */
orr xh, xh, #0x00100000
orr yh, yh, #0x00100000
beq LSYM(Lml_1)
@@ -673,6 +784,8 @@
@ The LSBs in ip are only significant for the final rounding.
@ Fold them into lr.
teq ip, #0
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it ne
orrne lr, lr, #1
@ Adjust result upon the MSB position.
@@ -693,12 +806,16 @@
@ Check exponent range for under/overflow.
subs ip, r4, #(254 - 1)
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it hi
cmphi ip, #0x700
bhi LSYM(Lml_u)
@ Round the result, merge final exponent.
cmp lr, #0x80000000
- moveqs lr, xl, lsr #1
+ do_it eq
+ COND(mov,s,eq) lr, xl, lsr #1
+ /* APPLE LOCAL end v7 support. Merge from mainline */
adcs xl, xl, #0
adc xh, xh, r4, lsl #20
/* APPLE LOCAL ARM MACH assembler */
@@ -711,7 +828,10 @@
orr xl, xl, yl
eor xh, xh, yh
subs r4, r4, ip, lsr #1
- rsbgts r5, r4, ip
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it gt, tt
+ COND(rsb,s,gt) r5, r4, ip
+ /* APPLE LOCAL end v7 support. Merge from mainline */
orrgt xh, xh, r4, lsl #20
/* APPLE LOCAL ARM MACH assembler */
RETLDM2(gt, r4, r5, r6)
@@ -727,6 +847,8 @@
@ Check if denormalized result is possible, otherwise return signed 0.
cmn r4, #(53 + 1)
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it le, tt
movle xl, #0
bicle xh, xh, #0x7fffffff
/* APPLE LOCAL ARM MACH assembler */
@@ -742,14 +864,17 @@
@ shift result right of 1 to 20 bits, preserve sign bit, round, etc.
add r4, r4, #20
rsb r5, r4, #32
- mov r3, xl, lsl r5
- mov xl, xl, lsr r4
- orr xl, xl, xh, lsl r5
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ shift1 lsl, r3, xl, r5
+ shift1 lsr, xl, xl, r4
+ shiftop orr, xl, xl, xh, lsl, r5, r2
and r2, xh, #0x80000000
bic xh, xh, #0x80000000
adds xl, xl, r3, lsr #31
- adc xh, r2, xh, lsr r4
+ shiftop adc, xh, r2, xh, lsr, r4, r6
orrs lr, lr, r3, lsl #1
+ do_it eq
+ /* APPLE LOCAL end v7 support. Merge from mainline */
biceq xl, xl, r3, lsr #31
/* APPLE LOCAL ARM MACH assembler */
RETLDM1(r4, r5, r6)
@@ -758,13 +883,16 @@
@ a register switch from xh to xl. Then round.
1: rsb r4, r4, #12
rsb r5, r4, #32
- mov r3, xl, lsl r4
- mov xl, xl, lsr r5
- orr xl, xl, xh, lsl r4
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ shift1 lsl, r3, xl, r4
+ shift1 lsr, xl, xl, r5
+ shiftop orr, xl, xl, xh, lsl, r4, r2
bic xh, xh, #0x7fffffff
adds xl, xl, r3, lsr #31
adc xh, xh, #0
orrs lr, lr, r3, lsl #1
+ do_it eq
+ /* APPLE LOCAL end v7 support. Merge from mainline */
biceq xl, xl, r3, lsr #31
/* APPLE LOCAL ARM MACH assembler */
RETLDM1(r4, r5, r6)
@@ -772,14 +900,17 @@
@ Shift value right of 32 to 64 bits, or 0 to 32 bits after a switch
@ from xh to xl. Leftover bits are in r3-r6-lr for rounding.
2: rsb r5, r4, #32
- orr lr, lr, xl, lsl r5
- mov r3, xl, lsr r4
- orr r3, r3, xh, lsl r5
- mov xl, xh, lsr r4
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ shiftop orr, lr, lr, xl, lsl, r5, r2
+ shift1 lsr, r3, xl, r4
+ shiftop orr, r3, r3, xh, lsl, r5, r2
+ shift1 lsr, xl, xh, r4
bic xh, xh, #0x7fffffff
- bic xl, xl, xh, lsr r4
+ shiftop bic, xl, xl, xh, lsr, r4, r2
add xl, xl, r3, lsr #31
orrs lr, lr, r3, lsl #1
+ do_it eq
+ /* APPLE LOCAL end v7 support. Merge from mainline */
biceq xl, xl, r3, lsr #31
/* APPLE LOCAL ARM MACH assembler */
RETLDM1(r4, r5, r6)
@@ -793,15 +924,21 @@
1: movs xl, xl, lsl #1
adc xh, xh, xh
tst xh, #0x00100000
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq
subeq r4, r4, #1
beq 1b
orr xh, xh, r6
teq r5, #0
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it ne
movne pc, lr
2: and r6, yh, #0x80000000
3: movs yl, yl, lsl #1
adc yh, yh, yh
tst yh, #0x00100000
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq
subeq r5, r5, #1
beq 3b
orr yh, yh, r6
@@ -811,12 +948,17 @@
@ Isolate the INF and NAN cases away
teq r4, ip
and r5, ip, yh, lsr #20
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it ne
teqne r5, ip
beq 1f
@ Here, one or more arguments are either denormalized or zero.
orrs r6, xl, xh, lsl #1
- orrnes r6, yl, yh, lsl #1
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne
+ COND(orr,s,ne) r6, yl, yh, lsl #1
+ /* APPLE LOCAL end v7 support. Merge from mainline */
bne LSYM(Lml_d)
@ Result is 0, but determine sign anyway.
@@ -829,9 +971,12 @@
1: @ One or both args are INF or NAN.
orrs r6, xl, xh, lsl #1
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq, te
moveq xl, yl
moveq xh, yh
- orrnes r6, yl, yh, lsl #1
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ COND(orr,s,ne) r6, yl, yh, lsl #1
beq LSYM(Lml_n) @ 0 * INF or INF * 0 -> NAN
teq r4, ip
bne 1f
@@ -840,6 +985,8 @@
1: teq r5, ip
bne LSYM(Lml_i)
orrs r6, yl, yh, lsl #12
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it ne, t
movne xl, yl
movne xh, yh
bne LSYM(Lml_n) @ <anything> * NAN -> NAN
@@ -871,13 +1018,17 @@
/* APPLE LOCAL ARM MACH assembler */
ARM_FUNC_ALIAS(aeabi_ddiv,divdf3)
- stmfd sp!, {r4, r5, r6, lr}
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_push {r4, r5, r6, lr}
@ Mask out exponents, trap any zero/denormal/INF/NAN.
mov ip, #0xff
orr ip, ip, #0x700
ands r4, ip, xh, lsr #20
- andnes r5, ip, yh, lsr #20
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne, tte
+ COND(and,s,ne) r5, ip, yh, lsr #20
+ /* APPLE LOCAL end v7 support. Merge from mainline */
teqne r4, ip
teqne r5, ip
bleq LSYM(Ldv_s)
@@ -908,6 +1059,8 @@
@ Ensure result will land to known bit position.
@ Apply exponent bias accordingly.
cmp r5, yh
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq
cmpeq r6, yl
adc r4, r4, #(255 - 2)
add r4, r4, #0x300
@@ -926,6 +1079,8 @@
@ The actual division loop.
1: subs lr, r6, yl
sbcs lr, r5, yh
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it cs, tt
subcs r6, r6, yl
movcs r5, lr
orrcs xl, xl, ip
@@ -933,6 +1088,8 @@
mov yl, yl, rrx
subs lr, r6, yl
sbcs lr, r5, yh
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it cs, tt
subcs r6, r6, yl
movcs r5, lr
orrcs xl, xl, ip, lsr #1
@@ -940,6 +1097,8 @@
mov yl, yl, rrx
subs lr, r6, yl
sbcs lr, r5, yh
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it cs, tt
subcs r6, r6, yl
movcs r5, lr
orrcs xl, xl, ip, lsr #2
@@ -947,6 +1106,8 @@
mov yl, yl, rrx
subs lr, r6, yl
sbcs lr, r5, yh
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it cs, tt
subcs r6, r6, yl
movcs r5, lr
orrcs xl, xl, ip, lsr #3
@@ -973,18 +1134,25 @@
2:
@ Be sure result starts in the high word.
tst xh, #0x00100000
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq, t
orreq xh, xh, xl
moveq xl, #0
3:
@ Check exponent range for under/overflow.
subs ip, r4, #(254 - 1)
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it hi
cmphi ip, #0x700
bhi LSYM(Lml_u)
@ Round the result, merge final exponent.
subs ip, r5, yh
- subeqs ip, r6, yl
- moveqs ip, xl, lsr #1
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it eq, t
+ COND(sub,s,eq) ip, r6, yl
+ COND(mov,s,eq) ip, xl, lsr #1
+ /* APPLE LOCAL end v7 support. Merge from mainline */
adcs xl, xl, #0
adc xh, xh, r4, lsl #20
/* APPLE LOCAL ARM MACH assembler */
@@ -995,7 +1163,10 @@
and lr, lr, #0x80000000
orr xh, lr, xh, lsr #12
adds r4, r4, ip, lsr #1
- rsbgts r5, r4, ip
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it gt, tt
+ COND(rsb,s,gt) r5, r4, ip
+ /* APPLE LOCAL end v7 support. Merge from mainline */
orrgt xh, xh, r4, lsl #20
/* APPLE LOCAL ARM MACH assembler */
RETLDM2(gt, r4, r5, r6)
@@ -1015,6 +1186,8 @@
LSYM(Ldv_s):
and r5, ip, yh, lsr #20
teq r4, ip
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq
teqeq r5, ip
beq LSYM(Lml_n) @ INF/NAN / INF/NAN -> NAN
teq r4, ip
@@ -1035,7 +1208,10 @@
b LSYM(Lml_n) @ <anything> / NAN -> NAN
2: @ If both are nonzero, we need to normalize and resume above.
orrs r6, xl, xh, lsl #1
- orrnes r6, yl, yh, lsl #1
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne
+ COND(orr,s,ne) r6, yl, yh, lsl #1
+ /* APPLE LOCAL end v7 support. Merge from mainline */
bne LSYM(Lml_d)
@ One or both arguments are 0.
orrs r4, xl, xh, lsl #1
@@ -1078,14 +1254,21 @@
mov ip, xh, lsl #1
mvns ip, ip, asr #21
mov ip, yh, lsl #1
- mvnnes ip, ip, asr #21
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne
+ COND(mvn,s,ne) ip, ip, asr #21
+ /* APPLE LOCAL end v7 support. Merge from mainline */
beq 3f
@ Test for equality.
@ Note that 0.0 is equal to -0.0.
2: orrs ip, xl, xh, lsl #1 @ if x == 0.0 or -0.0
- orreqs ip, yl, yh, lsl #1 @ and y == 0.0 or -0.0
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it eq, e
+ COND(orr,s,eq) ip, yl, yh, lsl #1 @ and y == 0.0 or -0.0
teqne xh, yh @ or xh == yh
+ do_it eq, tt
+ /* APPLE LOCAL end v7 support. Merge from mainline */
teqeq xl, yl @ and xl == yl
moveq r0, #0 @ then equal.
RETc(eq)
@@ -1097,10 +1280,16 @@
teq xh, yh
@ Compare values if same sign
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it pl
cmppl xh, yh
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq
cmpeq xl, yl
@ Result:
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it cs, e
movcs r0, yh, asr #31
mvncc r0, yh, asr #31
orr r0, r0, #1
@@ -1144,13 +1333,17 @@
@ The status-returning routines are required to preserve all
@ registers except ip, lr, and cpsr.
-6: stmfd sp!, {r0, lr}
+ /* APPLE LOCAL v7 support. Merge from mainline */
+6: do_push {r0, lr}
ARM_CALL cmpdf2
@ Set the Z flag correctly, and the C flag unconditionally.
cmp r0, #0
@ Clear the C flag if the return value was -1, indicating
@ that the first operand was smaller than the second.
- cmnmi r0, #0
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it mi
+ cmnmi r0, #0
+ /* APPLE LOCAL end v7 support. Merge from mainline */
/* APPLE LOCAL ARM MACH assembler */
RETLDM1(r0)
@@ -1162,6 +1355,8 @@
str lr, [sp, #-8]!
ARM_CALL aeabi_cdcmple
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq, e
moveq r0, #1 @ Equal to.
movne r0, #0 @ Less than, greater than, or unordered.
RETLDM
@@ -1172,6 +1367,8 @@
str lr, [sp, #-8]!
ARM_CALL aeabi_cdcmple
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it cc, e
movcc r0, #1 @ Less than.
movcs r0, #0 @ Equal to, greater than, or unordered.
RETLDM
@@ -1182,6 +1379,8 @@
str lr, [sp, #-8]!
ARM_CALL aeabi_cdcmple
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it ls, e
movls r0, #1 @ Less than or equal to.
movhi r0, #0 @ Greater than or unordered.
RETLDM
@@ -1192,6 +1391,8 @@
str lr, [sp, #-8]!
ARM_CALL aeabi_cdrcmple
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it ls, e
movls r0, #1 @ Operand 2 is less than or equal to operand 1.
movhi r0, #0 @ Operand 2 greater than operand 1, or unordered.
RETLDM
@@ -1202,6 +1403,8 @@
str lr, [sp, #-8]!
ARM_CALL aeabi_cdrcmple
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it cc, e
movcc r0, #1 @ Operand 2 is less than operand 1.
movcs r0, #0 @ Operand 2 is greater than or equal to operand 1,
@ or they are unordered.
@@ -1258,7 +1461,10 @@
orr r3, r3, #0x80000000
orr r3, r3, xl, lsr #21
tst xh, #0x80000000 @ the sign bit
- mov r0, r3, lsr r2
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ shift1 lsr, r0, r3, r2
+ do_it ne
+ /* APPLE LOCAL end v7 support. Merge from mainline */
rsbne r0, r0, #0
RET
@@ -1268,6 +1474,8 @@
2: orrs xl, xl, xh, lsl #12
bne 4f @ x is NAN.
3: ands r0, xh, #0x80000000 @ the sign bit
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq
moveq r0, #0x7fffffff @ maximum signed positive si
RET
@@ -1299,7 +1507,8 @@
mov r3, xh, lsl #11
orr r3, r3, #0x80000000
orr r3, r3, xl, lsr #21
- mov r0, r3, lsr r2
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ shift1 lsr, r0, r3, r2
RET
1: mov r0, #0
@@ -1327,8 +1536,11 @@
@ check exponent range.
mov r2, xh, lsl #1
subs r3, r2, #((1023 - 127) << 21)
- subcss ip, r3, #(1 << 21)
- rsbcss ip, ip, #(254 << 21)
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it cs, t
+ COND(sub,s,cs) ip, r3, #(1 << 21)
+ COND(rsb,s,cs) ip, ip, #(254 << 21)
+ /* APPLE LOCAL end v7 support. Merge from mainline */
bls 2f @ value is out of range
1: @ shift and round mantissa
@@ -1337,6 +1549,8 @@
orr xl, ip, xl, lsr #29
cmp r2, #0x80000000
adc r0, xl, r3, lsl #2
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq
biceq r0, r0, #1
RET
@@ -1346,6 +1560,8 @@
@ check if denormalized value is possible
adds r2, r3, #(23 << 21)
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it lt, t
andlt r0, xh, #0x80000000 @ too small, return signed 0.
RETc(lt)
@@ -1354,13 +1570,20 @@
mov r2, r2, lsr #21
rsb r2, r2, #24
rsb ip, r2, #32
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+ lsls r3, xl, ip
+#else
movs r3, xl, lsl ip
- mov xl, xl, lsr r2
+#endif
+ shift1 lsr, xl, xl, r2
+ do_it ne
orrne xl, xl, #1 @ fold r3 for rounding considerations.
mov r3, xh, lsl #11
mov r3, r3, lsr #11
- orr xl, xl, r3, lsl ip
- mov r3, r3, lsr r2
+ shiftop orr, xl, xl, r3, lsl, ip, ip
+ shift1 lsr, r3, r3, r2
+ /* APPLE LOCAL end v7 support. Merge from mainline */
mov r3, r3, lsl #1
b 1b
@@ -1368,6 +1591,8 @@
mvns r3, r2, asr #21
bne 5f @ simple overflow
orrs r3, xl, xh, lsl #12
+ /* APPLE LOCAL v7 support. Merge from mainline */
+ do_it ne, tt
movne r0, #0x7f000000
orrne r0, r0, #0x00c00000
RETc(ne) @ return NAN
@@ -1450,6 +1675,7 @@
fmdrr d7, r2, r3
fcmpd d6, d7
fmstat
+ do_it ne, e
movne r0, #0
moveq r0, #1
RET
@@ -1466,6 +1692,7 @@
fmdrr d7, r2, r3
fcmpd d6, d7
fmstat
+ do_it eq, e
moveq r0, #0
movne r0, #1
RET
@@ -1482,6 +1709,7 @@
fmdrr d7, r2, r3
fcmpd d6, d7
fmstat
+ do_it pl, e
movpl r0, #0
movmi r0, #1
RET
@@ -1498,6 +1726,7 @@
fmdrr d7, r2, r3
fcmpd d6, d7
fmstat
+ do_it le, e
movle r0, #0
movgt r0, #1
RET
@@ -1514,6 +1743,7 @@
fmdrr d7, r2, r3
fcmpd d6, d7
fmstat
+ do_it hi, e
movhi r0, #0
movls r0, #1
RET
@@ -1530,6 +1760,7 @@
fmdrr d7, r2, r3
fcmpd d6, d7
fmstat
+ do_it lt, e
movlt r0, #0
movge r0, #1
RET
@@ -1546,6 +1777,7 @@
fmdrr d7, r2, r3
fcmpd d6, d7
fmstat
+ do_it vc, e
movvc r0, #0
movvs r0, #1
RET
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-sf.S
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-sf.S?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-sf.S (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/ieee754-sf.S Wed Jul 22 15:36:27 2009
@@ -74,36 +74,50 @@
1: @ Look for zeroes, equal values, INF, or NAN.
movs r2, r0, lsl #1
- movnes r3, r1, lsl #1
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne, ttt
+ COND(mov,s,ne) r3, r1, lsl #1
teqne r2, r3
- mvnnes ip, r2, asr #24
- mvnnes ip, r3, asr #24
+ COND(mvn,s,ne) ip, r2, asr #24
+ COND(mvn,s,ne) ip, r3, asr #24
+/* APPLE LOCAL end v7 support. Merge from mainline */
beq LSYM(Lad_s)
@ Compute exponent difference. Make largest exponent in r2,
@ corresponding arg in r0, and positive exponent difference in r3.
mov r2, r2, lsr #24
rsbs r3, r2, r3, lsr #24
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it gt, ttt
addgt r2, r2, r3
eorgt r1, r0, r1
eorgt r0, r1, r0
eorgt r1, r0, r1
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it lt
rsblt r3, r3, #0
@ If exponent difference is too large, return largest argument
	@ already in r0. We need up to 25 bits to handle proper rounding
@ of 0x1p25 - 1.1.
cmp r3, #25
+ /* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it hi
RETc(hi)
+ /* APPLE LOCAL end v7 support. Merge from mainline */
@ Convert mantissa to signed integer.
tst r0, #0x80000000
orr r0, r0, #0x00800000
bic r0, r0, #0xff000000
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it ne
rsbne r0, r0, #0
tst r1, #0x80000000
orr r1, r1, #0x00800000
bic r1, r1, #0xff000000
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it ne
rsbne r1, r1, #0
@ If exponent == difference, one or both args were denormalized.
@@ -117,15 +131,22 @@
@ Shift and add second arg to first arg in r0.
@ Keep leftover bits into r1.
- adds r0, r0, r1, asr r3
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ shiftop adds, r0, r0, r1, asr, r3, ip
rsb r3, r3, #32
- mov r1, r1, lsl r3
+ shift1 lsl, r1, r1, r3
@ Keep absolute value in r0-r1, sign in r3 (the n bit was set above)
and r3, r0, #0x80000000
bpl LSYM(Lad_p)
+#if defined(__thumb2__)
+ negs r1, r1
+ sbc r0, r0, r0, lsl #1
+#else
rsbs r1, r1, #0
rsc r0, r0, #0
+#endif
+/* APPLE LOCAL end v7 support. Merge from mainline */
@ Determine how to normalize the result.
LSYM(Lad_p):
@@ -150,6 +171,8 @@
LSYM(Lad_e):
cmp r1, #0x80000000
adc r0, r0, r2, lsl #23
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq
biceq r0, r0, #1
orr r0, r0, r3
RET
@@ -188,16 +211,26 @@
clz ip, r0
sub ip, ip, #8
subs r2, r2, ip
- mov r0, r0, lsl ip
+/* APPLE LOCAL v7 support. Merge from mainline */
+ shift1 lsl, r0, r0, ip
#endif
@ Final result with sign
@ If exponent negative, denormalize result.
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ge, et
addge r0, r0, r2, lsl #23
rsblt r2, r2, #0
orrge r0, r0, r3
+#if defined(__thumb2__)
+ do_it lt, t
+ lsrlt r0, r0, r2
+ orrlt r0, r3, r0
+#else
orrlt r0, r3, r0, lsr r2
+#endif
+/* APPLE LOCAL end v7 support. Merge from mainline */
RET
@ Fixup and adjust bit position for denormalized arguments.
@@ -205,6 +238,8 @@
LSYM(Lad_d):
teq r2, #0
eor r1, r1, #0x00800000
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq, te
eoreq r0, r0, #0x00800000
addeq r2, r2, #1
subne r3, r3, #1
@@ -214,7 +249,10 @@
mov r3, r1, lsl #1
mvns ip, r2, asr #24
- mvnnes ip, r3, asr #24
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne
+ COND(mvn,s,ne) ip, r3, asr #24
+/* APPLE LOCAL end v7 support. Merge from mainline */
beq LSYM(Lad_i)
teq r2, r3
@@ -222,12 +260,16 @@
@ Result is x + 0.0 = x or 0.0 + y = y.
teq r2, #0
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq
moveq r0, r1
RET
1: teq r0, r1
@ Result is x - x = 0.
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it ne, t
movne r0, #0
RETc(ne)
@@ -235,9 +277,13 @@
tst r2, #0xff000000
bne 2f
movs r0, r0, lsl #1
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it cs
orrcs r0, r0, #0x80000000
RET
2: adds r2, r2, #(2 << 24)
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it cc, t
addcc r0, r0, #(1 << 23)
RETc(cc)
and r3, r0, #0x80000000
@@ -256,11 +302,15 @@
@ otherwise return r0 (which is INF or -INF)
LSYM(Lad_i):
mvns r2, r2, asr #24
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne, et
movne r0, r1
- mvneqs r3, r3, asr #24
+ COND(mvn,s,eq) r3, r3, asr #24
movne r1, r0
movs r2, r0, lsl #9
- moveqs r3, r1, lsl #9
+ do_it eq, te
+ COND(mov,s,eq) r3, r1, lsl #9
+/* APPLE LOCAL end v7 support. Merge from mainline */
teqeq r0, r1
orrne r0, r0, #0x00400000 @ quiet NAN
RET
@@ -283,9 +333,13 @@
ARM_FUNC_ALIAS(aeabi_i2f,floatsisf)
ands r3, r0, #0x80000000
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it mi
rsbmi r0, r0, #0
1: movs ip, r0
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq
RETc(eq)
@ Add initial exponent to sign
@@ -310,7 +364,12 @@
orrs r2, r0, r1
#if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it eq, t
mvfeqs f0, #0.0
+#else
+ do_it eq
+/* APPLE LOCAL end v7 support. Merge from mainline */
#endif
RETc(eq)
@@ -323,14 +382,26 @@
orrs r2, r0, r1
#if !defined (__VFP_FP__) && !defined(__SOFTFP__)
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it eq, t
mvfeqs f0, #0.0
+#else
+ do_it eq
+/* APPLE LOCAL end v7 support. Merge from mainline */
#endif
RETc(eq)
ands r3, ah, #0x80000000 @ sign bit in r3
bpl 1f
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+ negs al, al
+ sbc ah, ah, ah, lsl #1
+#else
rsbs al, al, #0
rsc ah, ah, #0
+#endif
+/* APPLE LOCAL end v7 support. Merge from mainline */
1:
#if !defined (__VFP_FP__) && !defined(__SOFTFP__)
@ For hard FPA code we want to return via the tail below so that
@@ -341,12 +412,16 @@
#endif
movs ip, ah
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq, tt
moveq ip, al
moveq ah, al
moveq al, #0
@ Add initial exponent to sign
orr r3, r3, #((127 + 23 + 32) << 23)
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq
subeq r3, r3, #(32 << 23)
2: sub r3, r3, #(1 << 23)
@@ -354,15 +429,23 @@
mov r2, #23
cmp ip, #(1 << 16)
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it hs, t
movhs ip, ip, lsr #16
subhs r2, r2, #16
cmp ip, #(1 << 8)
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it hs, t
movhs ip, ip, lsr #8
subhs r2, r2, #8
cmp ip, #(1 << 4)
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it hs, t
movhs ip, ip, lsr #4
subhs r2, r2, #4
cmp ip, #(1 << 2)
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it hs, e
subhs r2, r2, #2
sublo r2, r2, ip, lsr #1
subs r2, r2, ip, lsr #3
@@ -377,19 +460,23 @@
sub r3, r3, r2, lsl #23
blt 3f
- add r3, r3, ah, lsl r2
- mov ip, al, lsl r2
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ shiftop add, r3, r3, ah, lsl, r2, ip
+ shift1 lsl, ip, al, r2
rsb r2, r2, #32
cmp ip, #0x80000000
- adc r0, r3, al, lsr r2
+ shiftop adc, r0, r3, al, lsr, r2, r2
+ do_it eq
biceq r0, r0, #1
RET
3: add r2, r2, #32
- mov ip, ah, lsl r2
+ shift1 lsl, ip, ah, r2
rsb r2, r2, #32
orrs al, al, ip, lsl #1
- adc r0, r3, ah, lsr r2
+ shiftop adc, r0, r3, ah, lsr, r2, r2
+ do_it eq
+/* APPLE LOCAL end v7 support. Merge from mainline */
biceq r0, r0, ip, lsr #31
RET
@@ -418,7 +505,10 @@
@ Mask out exponents, trap any zero/denormal/INF/NAN.
mov ip, #0xff
ands r2, ip, r0, lsr #23
- andnes r3, ip, r1, lsr #23
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne, tt
+ COND(and,s,ne) r3, ip, r1, lsr #23
+/* APPLE LOCAL end v7 support. Merge from mainline */
teqne r2, ip
teqne r3, ip
beq LSYM(Lml_s)
@@ -434,7 +524,10 @@
@ If power of two, branch to a separate path.
@ Make up for final alignment.
movs r0, r0, lsl #9
- movnes r1, r1, lsl #9
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne
+ COND(mov,s,ne) r1, r1, lsl #9
+/* APPLE LOCAL end v7 support. Merge from mainline */
beq LSYM(Lml_1)
mov r3, #0x08000000
orr r0, r3, r0, lsr #5
@@ -446,7 +539,8 @@
and r3, ip, #0x80000000
@ Well, no way to make it shorter without the umull instruction.
- stmfd sp!, {r3, r4, r5}
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_push {r3, r4, r5}
mov r4, r0, lsr #16
mov r5, r1, lsr #16
bic r0, r0, r4, lsl #16
@@ -457,7 +551,8 @@
mla r0, r4, r1, r0
adds r3, r3, r0, lsl #16
adc r1, ip, r0, lsr #16
- ldmfd sp!, {r0, r4, r5}
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_pop {r0, r4, r5}
#else
@@ -471,6 +566,8 @@
@ Adjust result upon the MSB position.
cmp r1, #(1 << 23)
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it cc, tt
movcc r1, r1, lsl #1
orrcc r1, r1, r3, lsr #31
movcc r3, r3, lsl #1
@@ -486,6 +583,8 @@
@ Round the result, merge final exponent.
cmp r3, #0x80000000
adc r0, r0, r2, lsl #23
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq
biceq r0, r0, #1
RET
@@ -493,11 +592,15 @@
LSYM(Lml_1):
teq r0, #0
and ip, ip, #0x80000000
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it eq
moveq r1, r1, lsl #9
orr r0, ip, r0, lsr #9
orr r0, r0, r1, lsr #9
subs r2, r2, #127
- rsbgts r3, r2, #255
+ do_it gt, tt
+ COND(rsb,s,gt) r3, r2, #255
+/* APPLE LOCAL end v7 support. Merge from mainline */
orrgt r0, r0, r2, lsl #23
RETc(gt)
@@ -512,18 +615,22 @@
@ Check if denormalized result is possible, otherwise return signed 0.
cmn r2, #(24 + 1)
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it le, t
bicle r0, r0, #0x7fffffff
RETc(le)
@ Shift value right, round, etc.
rsb r2, r2, #0
movs r1, r0, lsl #1
- mov r1, r1, lsr r2
+ shift1 lsr, r1, r1, r2
rsb r2, r2, #32
- mov ip, r0, lsl r2
+ shift1 lsl, ip, r0, r2
movs r0, r1, rrx
adc r0, r0, #0
orrs r3, r3, ip, lsl #1
+ do_it eq
+/* APPLE LOCAL end v7 support. Merge from mainline */
biceq r0, r0, ip, lsr #31
RET
@@ -532,14 +639,18 @@
LSYM(Lml_d):
teq r2, #0
and ip, r0, #0x80000000
-1: moveq r0, r0, lsl #1
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+1: do_it eq, tt
+ moveq r0, r0, lsl #1
tsteq r0, #0x00800000
subeq r2, r2, #1
beq 1b
orr r0, r0, ip
teq r3, #0
and ip, r1, #0x80000000
-2: moveq r1, r1, lsl #1
+2: do_it eq, tt
+ moveq r1, r1, lsl #1
+/* APPLE LOCAL end v7 support. Merge from mainline */
tsteq r1, #0x00800000
subeq r3, r3, #1
beq 2b
@@ -550,12 +661,16 @@
@ Isolate the INF and NAN cases away
and r3, ip, r1, lsr #23
teq r2, ip
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne
teqne r3, ip
beq 1f
@ Here, one or more arguments are either denormalized or zero.
bics ip, r0, #0x80000000
- bicnes ip, r1, #0x80000000
+ do_it ne
+ COND(bic,s,ne) ip, r1, #0x80000000
+/* APPLE LOCAL end v7 support. Merge from mainline */
bne LSYM(Lml_d)
@ Result is 0, but determine sign anyway.
@@ -566,6 +681,8 @@
1: @ One or both args are INF or NAN.
teq r0, #0x0
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it ne, ett
teqne r0, #0x80000000
moveq r0, r1
teqne r1, #0x0
@@ -578,6 +695,8 @@
1: teq r3, ip
bne LSYM(Lml_i)
movs r3, r1, lsl #9
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it ne
movne r0, r1
bne LSYM(Lml_n) @ <anything> * NAN -> NAN
@@ -608,7 +727,10 @@
@ Mask out exponents, trap any zero/denormal/INF/NAN.
mov ip, #0xff
ands r2, ip, r0, lsr #23
- andnes r3, ip, r1, lsr #23
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne, tt
+ COND(and,s,ne) r3, ip, r1, lsr #23
+/* APPLE LOCAL end v7 support. Merge from mainline */
teqne r2, ip
teqne r3, ip
beq LSYM(Ldv_s)
@@ -635,25 +757,33 @@
	@ Ensure the result will land in a known bit position.
@ Apply exponent bias accordingly.
cmp r3, r1
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it cc
movcc r3, r3, lsl #1
adc r2, r2, #(127 - 2)
@ The actual division loop.
mov ip, #0x00800000
1: cmp r3, r1
+ do_it cs, t
subcs r3, r3, r1
orrcs r0, r0, ip
cmp r3, r1, lsr #1
+ do_it cs, t
subcs r3, r3, r1, lsr #1
orrcs r0, r0, ip, lsr #1
cmp r3, r1, lsr #2
+ do_it cs, t
subcs r3, r3, r1, lsr #2
orrcs r0, r0, ip, lsr #2
cmp r3, r1, lsr #3
+ do_it cs, t
subcs r3, r3, r1, lsr #3
orrcs r0, r0, ip, lsr #3
movs r3, r3, lsl #4
- movnes ip, ip, lsr #4
+ do_it ne
+ COND(mov,s,ne) ip, ip, lsr #4
+/* APPLE LOCAL end v7 support. Merge from mainline */
bne 1b
@ Check exponent for under/overflow.
@@ -663,6 +793,8 @@
@ Round the result, merge final exponent.
cmp r3, r1
adc r0, r0, r2, lsl #23
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq
biceq r0, r0, #1
RET
@@ -671,7 +803,10 @@
and ip, ip, #0x80000000
orr r0, ip, r0, lsr #9
adds r2, r2, #127
- rsbgts r3, r2, #255
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it gt, tt
+ COND(rsb,s,gt) r3, r2, #255
+/* APPLE LOCAL end v7 support. Merge from mainline */
orrgt r0, r0, r2, lsl #23
RETc(gt)
@@ -685,14 +820,18 @@
LSYM(Ldv_d):
teq r2, #0
and ip, r0, #0x80000000
-1: moveq r0, r0, lsl #1
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+1: do_it eq, tt
+ moveq r0, r0, lsl #1
tsteq r0, #0x00800000
subeq r2, r2, #1
beq 1b
orr r0, r0, ip
teq r3, #0
and ip, r1, #0x80000000
-2: moveq r1, r1, lsl #1
+2: do_it eq, tt
+ moveq r1, r1, lsl #1
+/* APPLE LOCAL end v7 support. Merge from mainline */
tsteq r1, #0x00800000
subeq r3, r3, #1
beq 2b
@@ -718,7 +857,10 @@
b LSYM(Lml_n) @ <anything> / NAN -> NAN
2: @ If both are nonzero, we need to normalize and resume above.
bics ip, r0, #0x80000000
- bicnes ip, r1, #0x80000000
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne
+ COND(bic,s,ne) ip, r1, #0x80000000
+/* APPLE LOCAL end v7 support. Merge from mainline */
bne LSYM(Ldv_d)
@ One or both arguments are zero.
bics r2, r0, #0x80000000
@@ -774,18 +916,26 @@
mov r2, r0, lsl #1
mov r3, r1, lsl #1
mvns ip, r2, asr #24
- mvnnes ip, r3, asr #24
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ do_it ne
+ COND(mvn,s,ne) ip, r3, asr #24
beq 3f
@ Compare values.
@ Note that 0.0 is equal to -0.0.
2: orrs ip, r2, r3, lsr #1 @ test if both are 0, clear C flag
+ do_it ne
teqne r0, r1 @ if not 0 compare sign
- subpls r0, r2, r3 @ if same sign compare values, set r0
+ do_it pl
+ COND(sub,s,pl) r0, r2, r3 @ if same sign compare values, set r0
@ Result:
+ do_it hi
movhi r0, r1, asr #31
+ do_it lo
mvnlo r0, r1, asr #31
+ do_it ne
+/* APPLE LOCAL end v7 support. Merge from mainline */
orrne r0, r0, #1
RET
@@ -822,13 +972,16 @@
@ The status-returning routines are required to preserve all
@ registers except ip, lr, and cpsr.
-6: stmfd sp!, {r0, r1, r2, r3, lr}
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+6: do_push {r0, r1, r2, r3, lr}
ARM_CALL cmpsf2
@ Set the Z flag correctly, and the C flag unconditionally.
- cmp r0, #0
+ cmp r0, #0
@ Clear the C flag if the return value was -1, indicating
@ that the first operand was smaller than the second.
- cmnmi r0, #0
+ do_it mi
+ cmnmi r0, #0
+/* APPLE LOCAL end v7 support. Merge from mainline */
/* APPLE LOCAL ARM MACH assembler */
RETLDM1(r0, r1, r2, r3)
@@ -840,6 +993,8 @@
str lr, [sp, #-8]!
ARM_CALL aeabi_cfcmple
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it eq, e
moveq r0, #1 @ Equal to.
movne r0, #0 @ Less than, greater than, or unordered.
RETLDM
@@ -850,6 +1005,8 @@
str lr, [sp, #-8]!
ARM_CALL aeabi_cfcmple
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it cc, e
movcc r0, #1 @ Less than.
movcs r0, #0 @ Equal to, greater than, or unordered.
RETLDM
@@ -860,6 +1017,8 @@
str lr, [sp, #-8]!
ARM_CALL aeabi_cfcmple
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it ls, e
movls r0, #1 @ Less than or equal to.
movhi r0, #0 @ Greater than or unordered.
RETLDM
@@ -870,6 +1029,8 @@
str lr, [sp, #-8]!
ARM_CALL aeabi_cfrcmple
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it ls, e
movls r0, #1 @ Operand 2 is less than or equal to operand 1.
movhi r0, #0 @ Operand 2 greater than operand 1, or unordered.
RETLDM
@@ -880,6 +1041,8 @@
str lr, [sp, #-8]!
ARM_CALL aeabi_cfrcmple
+/* APPLE LOCAL v7 support. Merge from mainline */
+ do_it cc, e
movcc r0, #1 @ Operand 2 is less than operand 1.
movcs r0, #0 @ Operand 2 is greater than or equal to operand 1,
@ or they are unordered.
@@ -933,7 +1096,9 @@
mov r3, r0, lsl #8
orr r3, r3, #0x80000000
tst r0, #0x80000000 @ the sign bit
- mov r0, r3, lsr r2
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+ shift1 lsr, r0, r3, r2
+ do_it ne
rsbne r0, r0, #0
RET
@@ -945,6 +1110,8 @@
movs r2, r0, lsl #9
bne 4f @ r0 is NAN.
3: ands r0, r0, #0x80000000 @ the sign bit
+ do_it eq
+/* APPLE LOCAL end v7 support. Merge from mainline */
moveq r0, #0x7fffffff @ the maximum signed positive si
RET
@@ -974,7 +1141,8 @@
@ scale the value
mov r3, r0, lsl #8
orr r3, r3, #0x80000000
- mov r0, r3, lsr r2
+/* APPLE LOCAL v7 support. Merge from mainline */
+ shift1 lsr, r0, r3, r2
RET
1: mov r0, #0
@@ -1062,6 +1230,7 @@
fmsr s15, r1
fcmps s14, s15
fmstat
+ do_it ne, e
movne r0, #0
moveq r0, #1
RET
@@ -1078,6 +1247,7 @@
fmsr s15, r1
fcmps s14, s15
fmstat
+ do_it eq, e
moveq r0, #0
movne r0, #1
RET
@@ -1094,6 +1264,7 @@
fmsr s15, r1
fcmps s14, s15
fmstat
+ do_it pl, e
movpl r0, #0
movmi r0, #1
RET
@@ -1110,6 +1281,7 @@
fmsr s15, r1
fcmps s14, s15
fmstat
+ do_it le, e
movle r0, #0
movgt r0, #1
RET
@@ -1126,6 +1298,7 @@
fmsr s15, r1
fcmps s14, s15
fmstat
+ do_it hi, e
movhi r0, #0
movls r0, #1
RET
@@ -1142,6 +1315,7 @@
fmsr s15, r1
fcmps s14, s15
fmstat
+ do_it lt, e
movlt r0, #0
movge r0, #1
RET
@@ -1158,6 +1332,7 @@
fmsr s15, r1
fcmps s14, s15
fmstat
+ do_it vc, e
movvc r0, #0
movvs r0, #1
RET
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/iwmmxt.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/iwmmxt.md?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/iwmmxt.md (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/iwmmxt.md Wed Jul 22 15:36:27 2009
@@ -1,3 +1,5 @@
+;; APPLE LOCAL v7 support. Merge from mainline
+;; ??? This file needs auditing for thumb2
;; Patterns for the Intel Wireless MMX technology architecture.
;; Copyright (C) 2003, 2004, 2005 Free Software Foundation, Inc.
;; Contributed by Red Hat.
@@ -19,6 +21,17 @@
;; the Free Software Foundation, 51 Franklin Street, Fifth Floor,
;; Boston, MA 02110-1301, USA.
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+;; Integer element sizes implemented by IWMMXT.
+(define_mode_macro VMMX [V2SI V4HI V8QI])
+
+;; Integer element sizes for shifts.
+(define_mode_macro VSHFT [V4HI V2SI DI])
+
+;; Determine element size suffix from vector mode.
+(define_mode_attr MMX_char [(V8QI "b") (V4HI "h") (V2SI "w") (DI "d")])
+
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
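As a brief illustration (an editor-added sketch, not part of the patch): with VMMX covering V2SI, V4HI and V8QI, and <MMX_char> supplying the size suffix, a single iterator pattern such as the *add<mode>3_iwmmxt insn introduced below stands for three concrete instructions:

;;   "wadd<MMX_char>%?\t%0, %1, %2"  expands to
;;       waddw   wRd, wRn, wRm       (V2SI)
;;       waddh   wRd, wRn, wRm       (V4HI)
;;       waddb   wRd, wRn, wRm       (V8QI)
;;   i.e. exactly the waddw/waddh/waddb templates that the separate
;;   addv2si3, addv4hi3 and addv8qi3 patterns removed later in this file
;;   used to spell out by hand.  (wRd/wRn/wRm are placeholder registers.)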
(define_insn "iwmmxt_iordi3"
[(set (match_operand:DI 0 "register_operand" "=y,?&r,?&r")
(ior:DI (match_operand:DI 1 "register_operand" "%y,0,r")
@@ -157,9 +170,10 @@
(set_attr "neg_pool_range" "*,*,4084, *,*,*")]
)
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
(define_insn "movv8qi_internal"
- [(set (match_operand:V8QI 0 "nonimmediate_operand" "=y,m,y,?r,?y,?r")
- (match_operand:V8QI 1 "general_operand" "y,y,mi,y,r,mi"))]
+ [(set (match_operand:V8QI 0 "nonimmediate_operand" "=y,m,y,?r,?y,?r,?r")
+ (match_operand:V8QI 1 "general_operand" "y,y,mi,y,r,r,mi"))]
"TARGET_REALLY_IWMMXT"
"*
switch (which_alternative)
@@ -169,17 +183,18 @@
case 2: return \"wldrd%?\\t%0, %1\";
case 3: return \"tmrrc%?\\t%Q0, %R0, %1\";
case 4: return \"tmcrr%?\\t%0, %Q1, %R1\";
+ case 5: return \"#\";
default: return output_move_double (operands);
}"
[(set_attr "predicable" "yes")
- (set_attr "length" "4, 4, 4,4,4, 8")
- (set_attr "type" "*,store1,load1,*,*,load1")
- (set_attr "pool_range" "*, *, 256,*,*, 256")
- (set_attr "neg_pool_range" "*, *, 244,*,*, 244")])
+ (set_attr "length" "4, 4, 4,4,4,8, 8")
+ (set_attr "type" "*,store1,load1,*,*,*,load1")
+ (set_attr "pool_range" "*, *, 256,*,*,*, 256")
+ (set_attr "neg_pool_range" "*, *, 244,*,*,*, 244")])
(define_insn "movv4hi_internal"
- [(set (match_operand:V4HI 0 "nonimmediate_operand" "=y,m,y,?r,?y,?r")
- (match_operand:V4HI 1 "general_operand" "y,y,mi,y,r,mi"))]
+ [(set (match_operand:V4HI 0 "nonimmediate_operand" "=y,m,y,?r,?y,?r,?r")
+ (match_operand:V4HI 1 "general_operand" "y,y,mi,y,r,r,mi"))]
"TARGET_REALLY_IWMMXT"
"*
switch (which_alternative)
@@ -189,17 +204,18 @@
case 2: return \"wldrd%?\\t%0, %1\";
case 3: return \"tmrrc%?\\t%Q0, %R0, %1\";
case 4: return \"tmcrr%?\\t%0, %Q1, %R1\";
+ case 5: return \"#\";
default: return output_move_double (operands);
}"
[(set_attr "predicable" "yes")
- (set_attr "length" "4, 4, 4,4,4, 8")
- (set_attr "type" "*,store1,load1,*,*,load1")
- (set_attr "pool_range" "*, *, 256,*,*, 256")
- (set_attr "neg_pool_range" "*, *, 244,*,*, 244")])
+ (set_attr "length" "4, 4, 4,4,4,8, 8")
+ (set_attr "type" "*,store1,load1,*,*,*,load1")
+ (set_attr "pool_range" "*, *, 256,*,*,*, 256")
+ (set_attr "neg_pool_range" "*, *, 244,*,*,*, 244")])
(define_insn "movv2si_internal"
- [(set (match_operand:V2SI 0 "nonimmediate_operand" "=y,m,y,?r,?y,?r")
- (match_operand:V2SI 1 "general_operand" "y,y,mi,y,r,mi"))]
+ [(set (match_operand:V2SI 0 "nonimmediate_operand" "=y,m,y,?r,?y,?r,?r")
+ (match_operand:V2SI 1 "general_operand" "y,y,mi,y,r,r,mi"))]
"TARGET_REALLY_IWMMXT"
"*
switch (which_alternative)
@@ -209,13 +225,15 @@
case 2: return \"wldrd%?\\t%0, %1\";
case 3: return \"tmrrc%?\\t%Q0, %R0, %1\";
case 4: return \"tmcrr%?\\t%0, %Q1, %R1\";
+ case 5: return \"#\";
default: return output_move_double (operands);
}"
[(set_attr "predicable" "yes")
- (set_attr "length" "4, 4, 4,4,4, 24")
- (set_attr "type" "*,store1,load1,*,*,load1")
- (set_attr "pool_range" "*, *, 256,*,*, 256")
- (set_attr "neg_pool_range" "*, *, 244,*,*, 244")])
+ (set_attr "length" "4, 4, 4,4,4,8, 24")
+ (set_attr "type" "*,store1,load1,*,*,*,load1")
+ (set_attr "pool_range" "*, *, 256,*,*,*, 256")
+ (set_attr "neg_pool_range" "*, *, 244,*,*,*, 244")])
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
;; This pattern should not be needed. It is to match a
;; weird case generated by GCC when no optimizations are
@@ -235,30 +253,16 @@
;; Vector add/subtract
-(define_insn "addv8qi3"
- [(set (match_operand:V8QI 0 "register_operand" "=y")
- (plus:V8QI (match_operand:V8QI 1 "register_operand" "y")
- (match_operand:V8QI 2 "register_operand" "y")))]
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+(define_insn "*add<mode>3_iwmmxt"
+ [(set (match_operand:VMMX 0 "register_operand" "=y")
+ (plus:VMMX (match_operand:VMMX 1 "register_operand" "y")
+ (match_operand:VMMX 2 "register_operand" "y")))]
"TARGET_REALLY_IWMMXT"
- "waddb%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "addv4hi3"
- [(set (match_operand:V4HI 0 "register_operand" "=y")
- (plus:V4HI (match_operand:V4HI 1 "register_operand" "y")
- (match_operand:V4HI 2 "register_operand" "y")))]
- "TARGET_REALLY_IWMMXT"
- "waddh%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "addv2si3"
- [(set (match_operand:V2SI 0 "register_operand" "=y")
- (plus:V2SI (match_operand:V2SI 1 "register_operand" "y")
- (match_operand:V2SI 2 "register_operand" "y")))]
- "TARGET_REALLY_IWMMXT"
- "waddw%?\\t%0, %1, %2"
+ "wadd<MMX_char>%?\\t%0, %1, %2"
[(set_attr "predicable" "yes")])
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
(define_insn "ssaddv8qi3"
[(set (match_operand:V8QI 0 "register_operand" "=y")
(ss_plus:V8QI (match_operand:V8QI 1 "register_operand" "y")
@@ -307,30 +311,16 @@
"waddwus%?\\t%0, %1, %2"
[(set_attr "predicable" "yes")])
-(define_insn "subv8qi3"
- [(set (match_operand:V8QI 0 "register_operand" "=y")
- (minus:V8QI (match_operand:V8QI 1 "register_operand" "y")
- (match_operand:V8QI 2 "register_operand" "y")))]
- "TARGET_REALLY_IWMMXT"
- "wsubb%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "subv4hi3"
- [(set (match_operand:V4HI 0 "register_operand" "=y")
- (minus:V4HI (match_operand:V4HI 1 "register_operand" "y")
- (match_operand:V4HI 2 "register_operand" "y")))]
- "TARGET_REALLY_IWMMXT"
- "wsubh%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "subv2si3"
- [(set (match_operand:V2SI 0 "register_operand" "=y")
- (minus:V2SI (match_operand:V2SI 1 "register_operand" "y")
- (match_operand:V2SI 2 "register_operand" "y")))]
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+(define_insn "*sub<mode>3_iwmmxt"
+ [(set (match_operand:VMMX 0 "register_operand" "=y")
+ (minus:VMMX (match_operand:VMMX 1 "register_operand" "y")
+ (match_operand:VMMX 2 "register_operand" "y")))]
"TARGET_REALLY_IWMMXT"
- "wsubw%?\\t%0, %1, %2"
+ "wsub<MMX_char>%?\\t%0, %1, %2"
[(set_attr "predicable" "yes")])
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
(define_insn "sssubv8qi3"
[(set (match_operand:V8QI 0 "register_operand" "=y")
(ss_minus:V8QI (match_operand:V8QI 1 "register_operand" "y")
@@ -379,7 +369,8 @@
"wsubwus%?\\t%0, %1, %2"
[(set_attr "predicable" "yes")])
-(define_insn "mulv4hi3"
+;; APPLE LOCAL v7 support. Merge from Codesourcery
+(define_insn "*mulv4hi3_iwmmxt"
[(set (match_operand:V4HI 0 "register_operand" "=y")
(mult:V4HI (match_operand:V4HI 1 "register_operand" "y")
(match_operand:V4HI 2 "register_operand" "y")))]
@@ -730,102 +721,40 @@
;; Max/min insns
-(define_insn "smaxv8qi3"
- [(set (match_operand:V8QI 0 "register_operand" "=y")
- (smax:V8QI (match_operand:V8QI 1 "register_operand" "y")
- (match_operand:V8QI 2 "register_operand" "y")))]
- "TARGET_REALLY_IWMMXT"
- "wmaxsb%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "umaxv8qi3"
- [(set (match_operand:V8QI 0 "register_operand" "=y")
- (umax:V8QI (match_operand:V8QI 1 "register_operand" "y")
- (match_operand:V8QI 2 "register_operand" "y")))]
- "TARGET_REALLY_IWMMXT"
- "wmaxub%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "smaxv4hi3"
- [(set (match_operand:V4HI 0 "register_operand" "=y")
- (smax:V4HI (match_operand:V4HI 1 "register_operand" "y")
- (match_operand:V4HI 2 "register_operand" "y")))]
- "TARGET_REALLY_IWMMXT"
- "wmaxsh%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "umaxv4hi3"
- [(set (match_operand:V4HI 0 "register_operand" "=y")
- (umax:V4HI (match_operand:V4HI 1 "register_operand" "y")
- (match_operand:V4HI 2 "register_operand" "y")))]
- "TARGET_REALLY_IWMMXT"
- "wmaxuh%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "smaxv2si3"
- [(set (match_operand:V2SI 0 "register_operand" "=y")
- (smax:V2SI (match_operand:V2SI 1 "register_operand" "y")
- (match_operand:V2SI 2 "register_operand" "y")))]
- "TARGET_REALLY_IWMMXT"
- "wmaxsw%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "umaxv2si3"
- [(set (match_operand:V2SI 0 "register_operand" "=y")
- (umax:V2SI (match_operand:V2SI 1 "register_operand" "y")
- (match_operand:V2SI 2 "register_operand" "y")))]
- "TARGET_REALLY_IWMMXT"
- "wmaxuw%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "sminv8qi3"
- [(set (match_operand:V8QI 0 "register_operand" "=y")
- (smin:V8QI (match_operand:V8QI 1 "register_operand" "y")
- (match_operand:V8QI 2 "register_operand" "y")))]
- "TARGET_REALLY_IWMMXT"
- "wminsb%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "uminv8qi3"
- [(set (match_operand:V8QI 0 "register_operand" "=y")
- (umin:V8QI (match_operand:V8QI 1 "register_operand" "y")
- (match_operand:V8QI 2 "register_operand" "y")))]
- "TARGET_REALLY_IWMMXT"
- "wminub%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "sminv4hi3"
- [(set (match_operand:V4HI 0 "register_operand" "=y")
- (smin:V4HI (match_operand:V4HI 1 "register_operand" "y")
- (match_operand:V4HI 2 "register_operand" "y")))]
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+(define_insn "*smax<mode>3_iwmmxt"
+ [(set (match_operand:VMMX 0 "register_operand" "=y")
+ (smax:VMMX (match_operand:VMMX 1 "register_operand" "y")
+ (match_operand:VMMX 2 "register_operand" "y")))]
"TARGET_REALLY_IWMMXT"
- "wminsh%?\\t%0, %1, %2"
+ "wmaxs<MMX_char>%?\\t%0, %1, %2"
[(set_attr "predicable" "yes")])
-(define_insn "uminv4hi3"
- [(set (match_operand:V4HI 0 "register_operand" "=y")
- (umin:V4HI (match_operand:V4HI 1 "register_operand" "y")
- (match_operand:V4HI 2 "register_operand" "y")))]
+(define_insn "*umax<mode>3_iwmmxt"
+ [(set (match_operand:VMMX 0 "register_operand" "=y")
+ (umax:VMMX (match_operand:VMMX 1 "register_operand" "y")
+ (match_operand:VMMX 2 "register_operand" "y")))]
"TARGET_REALLY_IWMMXT"
- "wminuh%?\\t%0, %1, %2"
+ "wmaxu<MMX_char>%?\\t%0, %1, %2"
[(set_attr "predicable" "yes")])
-(define_insn "sminv2si3"
- [(set (match_operand:V2SI 0 "register_operand" "=y")
- (smin:V2SI (match_operand:V2SI 1 "register_operand" "y")
- (match_operand:V2SI 2 "register_operand" "y")))]
+(define_insn "*smin<mode>3_iwmmxt"
+ [(set (match_operand:VMMX 0 "register_operand" "=y")
+ (smin:VMMX (match_operand:VMMX 1 "register_operand" "y")
+ (match_operand:VMMX 2 "register_operand" "y")))]
"TARGET_REALLY_IWMMXT"
- "wminsw%?\\t%0, %1, %2"
+ "wmins<MMX_char>%?\\t%0, %1, %2"
[(set_attr "predicable" "yes")])
-(define_insn "uminv2si3"
- [(set (match_operand:V2SI 0 "register_operand" "=y")
- (umin:V2SI (match_operand:V2SI 1 "register_operand" "y")
- (match_operand:V2SI 2 "register_operand" "y")))]
+(define_insn "*umin<mode>3_iwmmxt"
+ [(set (match_operand:VMMX 0 "register_operand" "=y")
+ (umin:VMMX (match_operand:VMMX 1 "register_operand" "y")
+ (match_operand:VMMX 2 "register_operand" "y")))]
"TARGET_REALLY_IWMMXT"
- "wminuw%?\\t%0, %1, %2"
+ "wminu<MMX_char>%?\\t%0, %1, %2"
[(set_attr "predicable" "yes")])
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
;; Pack/unpack insns.
(define_insn "iwmmxt_wpackhss"
@@ -1137,77 +1066,31 @@
"wrordg%?\\t%0, %1, %2"
[(set_attr "predicable" "yes")])
-(define_insn "ashrv4hi3"
- [(set (match_operand:V4HI 0 "register_operand" "=y")
- (ashiftrt:V4HI (match_operand:V4HI 1 "register_operand" "y")
- (match_operand:SI 2 "register_operand" "z")))]
- "TARGET_REALLY_IWMMXT"
- "wsrahg%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "ashrv2si3"
- [(set (match_operand:V2SI 0 "register_operand" "=y")
- (ashiftrt:V2SI (match_operand:V2SI 1 "register_operand" "y")
- (match_operand:SI 2 "register_operand" "z")))]
- "TARGET_REALLY_IWMMXT"
- "wsrawg%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "ashrdi3_iwmmxt"
- [(set (match_operand:DI 0 "register_operand" "=y")
- (ashiftrt:DI (match_operand:DI 1 "register_operand" "y")
- (match_operand:SI 2 "register_operand" "z")))]
- "TARGET_REALLY_IWMMXT"
- "wsradg%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "lshrv4hi3"
- [(set (match_operand:V4HI 0 "register_operand" "=y")
- (lshiftrt:V4HI (match_operand:V4HI 1 "register_operand" "y")
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+(define_insn "ashr<mode>3_iwmmxt"
+ [(set (match_operand:VSHFT 0 "register_operand" "=y")
+ (ashiftrt:VSHFT (match_operand:VSHFT 1 "register_operand" "y")
(match_operand:SI 2 "register_operand" "z")))]
"TARGET_REALLY_IWMMXT"
- "wsrlhg%?\\t%0, %1, %2"
+ "wsra<MMX_char>g%?\\t%0, %1, %2"
[(set_attr "predicable" "yes")])
-(define_insn "lshrv2si3"
- [(set (match_operand:V2SI 0 "register_operand" "=y")
- (lshiftrt:V2SI (match_operand:V2SI 1 "register_operand" "y")
- (match_operand:SI 2 "register_operand" "z")))]
- "TARGET_REALLY_IWMMXT"
- "wsrlwg%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "lshrdi3_iwmmxt"
- [(set (match_operand:DI 0 "register_operand" "=y")
- (lshiftrt:DI (match_operand:DI 1 "register_operand" "y")
+(define_insn "lshr<mode>3_iwmmxt"
+ [(set (match_operand:VSHFT 0 "register_operand" "=y")
+ (lshiftrt:VSHFT (match_operand:VSHFT 1 "register_operand" "y")
(match_operand:SI 2 "register_operand" "z")))]
"TARGET_REALLY_IWMMXT"
- "wsrldg%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "ashlv4hi3"
- [(set (match_operand:V4HI 0 "register_operand" "=y")
- (ashift:V4HI (match_operand:V4HI 1 "register_operand" "y")
- (match_operand:SI 2 "register_operand" "z")))]
- "TARGET_REALLY_IWMMXT"
- "wsllhg%?\\t%0, %1, %2"
+ "wsrl<MMX_char>g%?\\t%0, %1, %2"
[(set_attr "predicable" "yes")])
-(define_insn "ashlv2si3"
- [(set (match_operand:V2SI 0 "register_operand" "=y")
- (ashift:V2SI (match_operand:V2SI 1 "register_operand" "y")
- (match_operand:SI 2 "register_operand" "z")))]
- "TARGET_REALLY_IWMMXT"
- "wsllwg%?\\t%0, %1, %2"
- [(set_attr "predicable" "yes")])
-
-(define_insn "ashldi3_iwmmxt"
- [(set (match_operand:DI 0 "register_operand" "=y")
- (ashift:DI (match_operand:DI 1 "register_operand" "y")
+(define_insn "ashl<mode>3_iwmmxt"
+ [(set (match_operand:VSHFT 0 "register_operand" "=y")
+ (ashift:VSHFT (match_operand:VSHFT 1 "register_operand" "y")
(match_operand:SI 2 "register_operand" "z")))]
"TARGET_REALLY_IWMMXT"
- "wslldg%?\\t%0, %1, %2"
+ "wsll<MMX_char>g%?\\t%0, %1, %2"
[(set_attr "predicable" "yes")])
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
(define_insn "rorv4hi3_di"
[(set (match_operand:V4HI 0 "register_operand" "=y")
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/lib1funcs.asm
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/lib1funcs.asm?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/lib1funcs.asm (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/lib1funcs.asm Wed Jul 22 15:36:27 2009
@@ -1,7 +1,8 @@
@ libgcc routines for ARM cpu.
@ Division routines, written by Richard Earnshaw, (rearnsha at armltd.co.uk)
-/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005
+/* APPLE LOCAL v7 support. Merge from mainline */
+/* Copyright 1995, 1996, 1998, 1999, 2000, 2003, 2004, 2005, 2007
Free Software Foundation, Inc.
This file is free software; you can redistribute it and/or modify it
@@ -62,31 +63,32 @@
/* Function end macros. Variants for interworking. */
-@ This selects the minimum architecture level required.
-#define __ARM_ARCH__ 3
-
+/* APPLE LOCAL begin v7 support. Merge from mainline */
#if defined(__ARM_ARCH_3M__) || defined(__ARM_ARCH_4__) \
|| defined(__ARM_ARCH_4T__)
/* We use __ARM_ARCH__ set to 4 here, but in reality it's any processor with
long multiply instructions. That includes v3M. */
-# undef __ARM_ARCH__
# define __ARM_ARCH__ 4
#endif
#if defined(__ARM_ARCH_5__) || defined(__ARM_ARCH_5T__) \
|| defined(__ARM_ARCH_5E__) || defined(__ARM_ARCH_5TE__) \
|| defined(__ARM_ARCH_5TEJ__)
-# undef __ARM_ARCH__
# define __ARM_ARCH__ 5
#endif
#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \
|| defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) \
- || defined(__ARM_ARCH_6ZK__)
-# undef __ARM_ARCH__
+ || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__)
# define __ARM_ARCH__ 6
#endif
+#if defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+ || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__)
+# define __ARM_ARCH__ 7
+#endif
+
+/* APPLE LOCAL end v7 support. Merge from mainline */
#ifndef __ARM_ARCH__
#error Unable to determine architecture.
#endif
@@ -184,12 +186,24 @@
#define RETLDM \
ldr lr, [sp], #8 ; \
bx lr
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined (__thumb2__)
+#define RETLDM1(...) \
+ pop {__VA_ARGS__, lr} ; \
+ bx lr
+#define RETLDM2(cond,...) \
+ pop##cond {__VA_ARGS__, lr} ; \
+ bx##cond lr
+#else
#define RETLDM1(...) \
ldmia sp!, {__VA_ARGS__, lr} ; \
bx lr
#define RETLDM2(cond,...) \
ldm##cond##ia sp!, {__VA_ARGS__, lr} ; \
bx##cond lr
+#endif
+
+/* APPLE LOCAL end v7 support. Merge from mainline */
#define RETLDM_unwind(addr) \
ldr lr, [sp], #8 ; \
9: cfi_pop 9b - addr, 0xe, 0x0 ; \
@@ -197,14 +211,82 @@
#else
#define RETLDM \
ldr pc, [sp], #8
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined (__thumb2__)
+#define RETLDM1(...) \
+ pop {__VA_ARGS__, pc}
+#define RETLDM2(cond,...) \
+ pop##cond {__VA_ARGS__, pc}
+#else
#define RETLDM1(...) \
ldmia sp!, {__VA_ARGS__, pc}
#define RETLDM2(cond,...) \
ldm##cond##ia sp!, {__VA_ARGS__, pc}
+#endif
+/* APPLE LOCAL end v7 support. Merge from mainline */
#define RETLDM_unwind(addr) \
ldr pc, [sp], #8
#endif
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+
+/* The Unified assembly syntax allows the same code to be assembled for both
+ ARM and Thumb-2. However this is only supported by recent gas, so define
+ a set of macros to allow ARM code on older assemblers. */
+#if defined(__thumb2__)
+.macro do_it cond, suffix=""
+#if defined (__MACH__)
+ it$1 $0
+#else
+ it\suffix \cond
+#endif
+.endm
+.macro shift1 op, arg0, arg1, arg2
+#if defined (__MACH__)
+ $0 $1, $2, $3
+#else
+ \op \arg0, \arg1, \arg2
+#endif
+.endm
+#define do_push push
+#define do_pop pop
+#define COND(op1, op2, cond) op1 ## op2 ## cond
+/* Perform an arithmetic operation with a variable shift operand. This
+ requires two instructions and a scratch register on Thumb-2. */
+.macro shiftop name, dest, src1, src2, shiftop, shiftreg, tmp
+#if defined (__MACH__)
+ $4 $6, $3, $5
+ $0 $1, $2, $6
+#else
+ \shiftop \tmp, \src2, \shiftreg
+ \name \dest, \src1, \tmp
+#endif
+.endm
+#else
+.macro do_it cond, suffix=""
+.endm
+.macro shift1 op, arg0, arg1, arg2
+#if defined (__MACH__)
+ mov $1, $2, $0 $3
+#else
+ mov \arg0, \arg1, \op \arg2
+#endif
+.endm
+#define do_push stmfd sp!,
+#define do_pop ldmfd sp!,
+#define COND(op1, op2, cond) op1 ## cond ## op2
+.macro shiftop name, dest, src1, src2, shiftop, shiftreg, tmp
+#if defined (__MACH__)
+ $0 $1, $2, $3, $4 $5
+#else
+ \name \dest, \src1, \src2, \shiftop \shiftreg
+#endif
+.endm
+#endif
+
+
+
+/* APPLE LOCAL end v7 support. Merge from mainline */
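To make the effect of these helpers concrete, here is an illustrative expansion (a sketch added for exposition, not part of the patch; the operands are taken from the ieee754 hunks earlier in this commit):

@ Written once with the macros:
@       do_it           cs, t
@       COND(sub,s,cs)  ip, r3, #(1 << 21)
@       COND(rsb,s,cs)  ip, ip, #(254 << 21)
@
@ Assembles for ARM (divided syntax; do_it expands to nothing):
@       subcss  ip, r3, #(1 << 21)
@       rsbcss  ip, ip, #(254 << 21)
@
@ Assembles for Thumb-2 (unified syntax; an IT block predicates both):
@       itt     cs
@       subscs  ip, r3, #(1 << 21)
@       rsbscs  ip, ip, #(254 << 21)
@
@ Likewise, shift1 and shiftop hide the fact that Thumb-2 data-processing
@ instructions cannot take a register-specified shift on an operand:
@       shift1  lsr, r0, r3, r2                  @ ARM: mov  r0, r3, lsr r2
@                                                @ T2:  lsr  r0, r3, r2
@       shiftop adds, r0, r0, r1, asr, r3, ip    @ ARM: adds r0, r0, r1, asr r3
@                                                @ T2:  asr  ip, r1, r3
@                                                @      adds r0, r0, ip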
.macro ARM_LDIV0 name
str lr, [sp, #-8]!
#if !defined(__MACH__)
@@ -277,6 +359,13 @@
#ifdef __thumb__
#define THUMB_FUNC .thumb_func
#define THUMB_CODE .force_thumb
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+# if defined(__thumb2__)
+#define THUMB_SYNTAX .syntax divided
+# else
+#define THUMB_SYNTAX
+# endif
+/* APPLE LOCAL end v7 support. Merge from mainline */
/* APPLE LOCAL ARM function alignment */
#define FUNC_ALIGN .align 1
#else
@@ -284,6 +373,8 @@
#define THUMB_CODE
/* APPLE LOCAL ARM function alignment */
#define FUNC_ALIGN .align 2
+/* APPLE LOCAL v7 support. Merge from mainline */
+#define THUMB_SYNTAX
#endif
/* APPLE LOCAL begin ARM MACH assembler */
@@ -310,10 +401,33 @@
/* Special function that will always be coded in ARM assembly, even if
in Thumb-only compilation. */
-#if defined(__INTERWORKING_STUBS__)
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+
+/* For Thumb-2 we build everything in thumb mode. */
+.macro ARM_FUNC_START name
+#if defined(__MACH__)
+ FUNC_START $0
+#else
+ FUNC_START \name
+#endif
+ .syntax unified
+.endm
+#define EQUIV .thumb_set
+.macro ARM_CALL name
+#if defined(__MACH__)
+ bl ___$0
+#else
+ bl ___\name
+#endif
+.endm
+
+#elif defined(__INTERWORKING_STUBS__)
+
+/* APPLE LOCAL end v7 support. Merge from mainline */
.macro ARM_FUNC_START name
-#if defined(__MACH)
+#if defined(__MACH__)
FUNC_START $0
#else
FUNC_START \name
@@ -339,7 +453,11 @@
bl _L__\name
#endif
.endm
-#else
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+
+#else /* !(__INTERWORKING_STUBS__ || __thumb2__) */
+
+/* APPLE LOCAL end v7 support. Merge from mainline */
.macro ARM_FUNC_START name
#if defined(__MACH__)
.text
@@ -845,7 +963,7 @@
cmp dividend, divisor
blo LSYM(Lgot_result)
- /* LLVM LOCAL */
+ /* APPLE LOCAL v7 support */
THUMB_DIV_MOD_BODY(0)
mov r0, result
@@ -889,7 +1007,8 @@
#ifdef __thumb__
push {r0, r1, lr}
bl SYM(__udivsi3)
- POP {r1, r2, r3}
+ /* APPLE LOCAL v7 support */
+ pop {r1, r2, r3}
mul r2, r0
sub r1, r1, r2
bx r3
@@ -921,7 +1040,7 @@
LSYM(Lover10):
push { work }
- /* LLVM LOCAL */
+ /* APPLE LOCAL v7 support */
THUMB_DIV_MOD_BODY(1)
pop { work }
@@ -975,7 +1094,7 @@
cmp dividend, divisor
blo LSYM(Lgot_result)
- /* LLVM LOCAL */
+ /* APPLE LOCAL v7 support */
THUMB_DIV_MOD_BODY(0)
mov r0, result
@@ -1039,7 +1158,8 @@
#ifdef __thumb__
push {r0, r1, lr}
bl SYM(__divsi3)
- POP {r1, r2, r3}
+ /* APPLE LOCAL v7 support */
+ pop {r1, r2, r3}
mul r2, r0
sub r1, r1, r2
bx r3
@@ -1079,7 +1199,7 @@
cmp dividend, divisor
blo LSYM(Lgot_result)
- /* LLVM LOCAL */
+ /* APPLE LOCAL v7 support */
THUMB_DIV_MOD_BODY(1)
pop { work }
@@ -1279,6 +1399,10 @@
/* APPLE LOCAL begin ARM 4790140 compact switch tables */
/* ----------------------------------------------------------------------- */
+/* These aren't needed for Thumb2 since then we have actual instructions
+ to do what these functions do. */
+#ifndef __thumb2__
+
/* Thumb switch table implementation. Arm code, although must be called
from Thumb (the low bit of LR is expected to be 1).
Expects the call site to be followed by 1-byte count, then <count>
@@ -1354,7 +1478,6 @@
FUNC_END switch32
#endif
-/* APPLE LOCAL end ARM 4790140 compact switch tables */
/* APPLE LOCAL begin 6465387 exception handling interworking VFP save */
#if (__ARM_ARCH__ == 6)
@@ -1374,6 +1497,9 @@
#endif
/* APPLE LOCAL end 6465387 exception handling interworking VFP save */
+#endif /* !defined (__thumb2__) */
+/* APPLE LOCAL end ARM 4790140 compact switch tables */
+
#endif /* __symbian__ */
/* ------------------------------------------------------------------------ */
@@ -1427,6 +1553,12 @@
#endif /* L_call_via_rX */
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+/* Don't bother with the old interworking routines for Thumb-2. */
+/* ??? Maybe only omit these on v7m. */
+#ifndef __thumb2__
+
+/* APPLE LOCAL end v7 support. Merge from mainline */
#if defined L_interwork_call_via_rX
/* These labels & instructions are used by the Arm/Thumb interworking code,
@@ -1552,6 +1684,8 @@
SIZE (_interwork_call_via_lr)
#endif /* L_interwork_call_via_rX */
+/* APPLE LOCAL v7 support. Merge from mainline */
+#endif /* !__thumb2__ */
#endif /* Arch supports thumb. */
#ifndef __symbian__
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/libunwind.S
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/libunwind.S?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/libunwind.S (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/libunwind.S Wed Jul 22 15:36:27 2009
@@ -43,7 +43,16 @@
this. */
add r1, r0, #52
ldmia r1, {r3, r4, r5} /* {sp, lr, pc}. */
-#ifdef __INTERWORKING__
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+ /* Thumb-2 doesn't allow sp in a load-multiple instruction, so push
+ the target address onto the target stack. This is safe as
+ we're always returning to somewhere further up the call stack. */
+ mov ip, r3
+ mov lr, r4
+ str r5, [ip, #-4]!
+#elif defined(__INTERWORKING__)
+/* APPLE LOCAL end v7 support. Merge from mainline */
/* Restore pc into ip. */
mov r2, r5
stmfd sp!, {r2, r3, r4}
@@ -52,8 +61,14 @@
#endif
/* Don't bother restoring ip. */
ldmia r0, {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp}
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+ /* Pop the return address off the target stack. */
+ mov sp, ip
+ pop {pc}
+#elif defined(__INTERWORKING__)
/* Pop the three registers we pushed earlier. */
-#ifdef __INTERWORKING__
+/* APPLE LOCAL end v7 support. Merge from mainline */
ldmfd sp, {ip, sp, lr}
bx ip
#else
@@ -62,27 +77,129 @@
FUNC_END restore_core_regs
UNPREFIX restore_core_regs
-/* Load VFP registers d0-d15 from the address in r0. */
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+/* Load VFP registers d0-d15 from the address in r0.
+ Use this to load from FSTMX format. */
+/* APPLE LOCAL end v7 support. Merge from mainline */
ARM_FUNC_START gnu_Unwind_Restore_VFP
/* Use the generic coprocessor form so that gas doesn't complain
on soft-float targets. */
ldc p11,cr0,[r0],{0x21} /* fldmiax r0, {d0-d15} */
RET
-/* Store VFR regsters d0-d15 to the address in r0. */
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+/* Store VFP registers d0-d15 to the address in r0.
+ Use this to store in FSTMX format. */
+/* APPLE LOCAL end v7 support. Merge from mainline */
ARM_FUNC_START gnu_Unwind_Save_VFP
/* Use the generic coprocessor form so that gas doesn't complain
on soft-float targets. */
stc p11,cr0,[r0],{0x21} /* fstmiax r0, {d0-d15} */
RET
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+/* Load VFP registers d0-d15 from the address in r0.
+ Use this to load from FSTMD format. */
+ARM_FUNC_START gnu_Unwind_Restore_VFP_D
+ ldc p11,cr0,[r0],{0x20} /* fldmiad r0, {d0-d15} */
+ RET
+
+/* Store VFP registers d0-d15 to the address in r0.
+ Use this to store in FLDMD format. */
+ARM_FUNC_START gnu_Unwind_Save_VFP_D
+ stc p11,cr0,[r0],{0x20} /* fstmiad r0, {d0-d15} */
+ RET
+
+/* Load VFP registers d16-d31 from the address in r0.
+ Use this to load from FSTMD (=VSTM) format. Needs VFPv3. */
+ARM_FUNC_START gnu_Unwind_Restore_VFP_D_16_to_31
+ ldcl p11,cr0,[r0],{0x20} /* vldm r0, {d16-d31} */
+ RET
+
+/* Store VFP registers d16-d31 to the address in r0.
+ Use this to store in FLDMD (=VLDM) format. Needs VFPv3. */
+ARM_FUNC_START gnu_Unwind_Save_VFP_D_16_to_31
+ stcl p11,cr0,[r0],{0x20} /* vstm r0, {d16-d31} */
+ RET
+
+/* APPLE LOCAL end v7 support. Merge from mainline */
+/* APPLE LOCAL begin v7 support. Merge from Codesourcery */
+ARM_FUNC_START gnu_Unwind_Restore_WMMXD
+ /* Use the generic coprocessor form so that gas doesn't complain
+ on non-iWMMXt targets. */
+ ldcl p1, cr0, [r0], #8 /* wldrd wr0, [r0], #8 */
+ ldcl p1, cr1, [r0], #8 /* wldrd wr1, [r0], #8 */
+ ldcl p1, cr2, [r0], #8 /* wldrd wr2, [r0], #8 */
+ ldcl p1, cr3, [r0], #8 /* wldrd wr3, [r0], #8 */
+ ldcl p1, cr4, [r0], #8 /* wldrd wr4, [r0], #8 */
+ ldcl p1, cr5, [r0], #8 /* wldrd wr5, [r0], #8 */
+ ldcl p1, cr6, [r0], #8 /* wldrd wr6, [r0], #8 */
+ ldcl p1, cr7, [r0], #8 /* wldrd wr7, [r0], #8 */
+ ldcl p1, cr8, [r0], #8 /* wldrd wr8, [r0], #8 */
+ ldcl p1, cr9, [r0], #8 /* wldrd wr9, [r0], #8 */
+ ldcl p1, cr10, [r0], #8 /* wldrd wr10, [r0], #8 */
+ ldcl p1, cr11, [r0], #8 /* wldrd wr11, [r0], #8 */
+ ldcl p1, cr12, [r0], #8 /* wldrd wr12, [r0], #8 */
+ ldcl p1, cr13, [r0], #8 /* wldrd wr13, [r0], #8 */
+ ldcl p1, cr14, [r0], #8 /* wldrd wr14, [r0], #8 */
+ ldcl p1, cr15, [r0], #8 /* wldrd wr15, [r0], #8 */
+ RET
+
+ARM_FUNC_START gnu_Unwind_Save_WMMXD
+ /* Use the generic coprocessor form so that gas doesn't complain
+ on non-iWMMXt targets. */
+ stcl p1, cr0, [r0], #8 /* wstrd wr0, [r0], #8 */
+ stcl p1, cr1, [r0], #8 /* wstrd wr1, [r0], #8 */
+ stcl p1, cr2, [r0], #8 /* wstrd wr2, [r0], #8 */
+ stcl p1, cr3, [r0], #8 /* wstrd wr3, [r0], #8 */
+ stcl p1, cr4, [r0], #8 /* wstrd wr4, [r0], #8 */
+ stcl p1, cr5, [r0], #8 /* wstrd wr5, [r0], #8 */
+ stcl p1, cr6, [r0], #8 /* wstrd wr6, [r0], #8 */
+ stcl p1, cr7, [r0], #8 /* wstrd wr7, [r0], #8 */
+ stcl p1, cr8, [r0], #8 /* wstrd wr8, [r0], #8 */
+ stcl p1, cr9, [r0], #8 /* wstrd wr9, [r0], #8 */
+ stcl p1, cr10, [r0], #8 /* wstrd wr10, [r0], #8 */
+ stcl p1, cr11, [r0], #8 /* wstrd wr11, [r0], #8 */
+ stcl p1, cr12, [r0], #8 /* wstrd wr12, [r0], #8 */
+ stcl p1, cr13, [r0], #8 /* wstrd wr13, [r0], #8 */
+ stcl p1, cr14, [r0], #8 /* wstrd wr14, [r0], #8 */
+ stcl p1, cr15, [r0], #8 /* wstrd wr15, [r0], #8 */
+ RET
+
+ARM_FUNC_START gnu_Unwind_Restore_WMMXC
+ /* Use the generic coprocessor form so that gas doesn't complain
+ on non-iWMMXt targets. */
+ ldc2 p1, cr8, [r0], #4 /* wldrw wcgr0, [r0], #4 */
+ ldc2 p1, cr9, [r0], #4 /* wldrw wcgr1, [r0], #4 */
+ ldc2 p1, cr10, [r0], #4 /* wldrw wcgr2, [r0], #4 */
+ ldc2 p1, cr11, [r0], #4 /* wldrw wcgr3, [r0], #4 */
+ RET
+
+ARM_FUNC_START gnu_Unwind_Save_WMMXC
+ /* Use the generic coprocessor form so that gas doesn't complain
+ on non-iWMMXt targets. */
+ stc2 p1, cr8, [r0], #4 /* wstrw wcgr0, [r0], #4 */
+ stc2 p1, cr9, [r0], #4 /* wstrw wcgr1, [r0], #4 */
+ stc2 p1, cr10, [r0], #4 /* wstrw wcgr2, [r0], #4 */
+ stc2 p1, cr11, [r0], #4 /* wstrw wcgr3, [r0], #4 */
+ RET
+
+/* APPLE LOCAL end v7 support. Merge from Codesourcery */
/* Wrappers to save core registers, then call the real routine. */
.macro UNWIND_WRAPPER name nargs
ARM_FUNC_START \name
/* Create a phase2_vrs structure. */
/* Split reg push in two to ensure the correct value for sp. */
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#if defined(__thumb2__)
+ mov ip, sp
+ push {lr} /* PC is ignored. */
+ push {ip, lr} /* Push original SP and LR. */
+#else
stmfd sp!, {sp, lr, pc}
+#endif
+/* APPLE LOCAL end v7 support. Merge from mainline */
stmfd sp!, {r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, sl, fp, ip}
/* Demand-save flags, plus an extra word for alignment. */
@@ -91,7 +208,8 @@
/* Point r1 at the block. Pass r[0..nargs) unchanged. */
add r\nargs, sp, #4
-#if defined(__thumb__)
+/* APPLE LOCAL v7 support. Merge from mainline */
+#if defined(__thumb__) && !defined(__thumb2__)
/* Switch back to thumb mode to avoid interworking hassle. */
adr ip, .L1_\name
orr ip, ip, #1
Added: llvm-gcc-4.2/trunk/gcc/config/arm/neon-docgen.ml
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/neon-docgen.ml?rev=76781&view=auto
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/neon-docgen.ml (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/neon-docgen.ml Wed Jul 22 15:36:27 2009
@@ -0,0 +1,323 @@
+(* APPLE LOCAL file v7 support. Merge from Codesourcery *)
+(* ARM NEON documentation generator.
+
+ Copyright (C) 2006 Free Software Foundation, Inc.
+ Contributed by CodeSourcery.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 2, or (at your option) any later
+ version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING. If not, write to the Free
+ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA. *)
+
+open Neon
+
+(* The combined "ops" and "reinterp" table. *)
+let ops_reinterp = reinterp @ ops
+
+(* Helper functions for extracting things from the "ops" table. *)
+let single_opcode desired_opcode () =
+ List.fold_left (fun got_so_far ->
+ fun row ->
+ match row with
+ (opcode, _, _, _, _, _) ->
+ if opcode = desired_opcode then row :: got_so_far
+ else got_so_far
+ ) [] ops_reinterp
+
+let multiple_opcodes desired_opcodes () =
+ List.fold_left (fun got_so_far ->
+ fun desired_opcode ->
+ (single_opcode desired_opcode ()) @ got_so_far)
+ [] desired_opcodes
+
+let ldx_opcode number () =
+ List.fold_left (fun got_so_far ->
+ fun row ->
+ match row with
+ (opcode, _, _, _, _, _) ->
+ match opcode with
+ Vldx n | Vldx_lane n | Vldx_dup n when n = number ->
+ row :: got_so_far
+ | _ -> got_so_far
+ ) [] ops_reinterp
+
+let stx_opcode number () =
+ List.fold_left (fun got_so_far ->
+ fun row ->
+ match row with
+ (opcode, _, _, _, _, _) ->
+ match opcode with
+ Vstx n | Vstx_lane n when n = number ->
+ row :: got_so_far
+ | _ -> got_so_far
+ ) [] ops_reinterp
+
+let tbl_opcode () =
+ List.fold_left (fun got_so_far ->
+ fun row ->
+ match row with
+ (opcode, _, _, _, _, _) ->
+ match opcode with
+ Vtbl _ -> row :: got_so_far
+ | _ -> got_so_far
+ ) [] ops_reinterp
+
+let tbx_opcode () =
+ List.fold_left (fun got_so_far ->
+ fun row ->
+ match row with
+ (opcode, _, _, _, _, _) ->
+ match opcode with
+ Vtbx _ -> row :: got_so_far
+ | _ -> got_so_far
+ ) [] ops_reinterp
+
+(* The groups of intrinsics. *)
+let intrinsic_groups =
+ [ "Addition", single_opcode Vadd;
+ "Multiplication", single_opcode Vmul;
+ "Multiply-accumulate", single_opcode Vmla;
+ "Multiply-subtract", single_opcode Vmls;
+ "Subtraction", single_opcode Vsub;
+ "Comparison (equal-to)", single_opcode Vceq;
+ "Comparison (greater-than-or-equal-to)", single_opcode Vcge;
+ "Comparison (less-than-or-equal-to)", single_opcode Vcle;
+ "Comparison (greater-than)", single_opcode Vcgt;
+ "Comparison (less-than)", single_opcode Vclt;
+ "Comparison (absolute greater-than-or-equal-to)", single_opcode Vcage;
+ "Comparison (absolute less-than-or-equal-to)", single_opcode Vcale;
+ "Comparison (absolute greater-than)", single_opcode Vcagt;
+ "Comparison (absolute less-than)", single_opcode Vcalt;
+ "Test bits", single_opcode Vtst;
+ "Absolute difference", single_opcode Vabd;
+ "Absolute difference and accumulate", single_opcode Vaba;
+ "Maximum", single_opcode Vmax;
+ "Minimum", single_opcode Vmin;
+ "Pairwise add", single_opcode Vpadd;
+    "Pairwise add, widen and accumulate", single_opcode Vpada;
+ "Folding maximum", single_opcode Vpmax;
+ "Folding minimum", single_opcode Vpmin;
+ "Reciprocal step", multiple_opcodes [Vrecps; Vrsqrts];
+ "Vector shift left", single_opcode Vshl;
+ "Vector shift left by constant", single_opcode Vshl_n;
+ "Vector shift right by constant", single_opcode Vshr_n;
+ "Vector shift right by constant and accumulate", single_opcode Vsra_n;
+ "Vector shift right and insert", single_opcode Vsri;
+ "Vector shift left and insert", single_opcode Vsli;
+ "Absolute value", single_opcode Vabs;
+ "Negation", single_opcode Vneg;
+ "Bitwise not", single_opcode Vmvn;
+ "Count leading sign bits", single_opcode Vcls;
+ "Count leading zeros", single_opcode Vclz;
+ "Count number of set bits", single_opcode Vcnt;
+ "Reciprocal estimate", single_opcode Vrecpe;
+ "Reciprocal square-root estimate", single_opcode Vrsqrte;
+ "Get lanes from a vector", single_opcode Vget_lane;
+ "Set lanes in a vector", single_opcode Vset_lane;
+ "Create vector from literal bit pattern", single_opcode Vcreate;
+ "Set all lanes to the same value",
+ multiple_opcodes [Vdup_n; Vmov_n; Vdup_lane];
+ "Combining vectors", single_opcode Vcombine;
+ "Splitting vectors", multiple_opcodes [Vget_high; Vget_low];
+ "Conversions", multiple_opcodes [Vcvt; Vcvt_n];
+    "Move, narrowing", single_opcode Vmovn;
+    "Move, long", single_opcode Vmovl;
+ "Table lookup", tbl_opcode;
+ "Extended table lookup", tbx_opcode;
+ "Multiply, lane", single_opcode Vmul_lane;
+ "Long multiply, lane", single_opcode Vmull_lane;
+ "Saturating doubling long multiply, lane", single_opcode Vqdmull_lane;
+ "Saturating doubling multiply high, lane", single_opcode Vqdmulh_lane;
+ "Multiply-accumulate, lane", single_opcode Vmla_lane;
+ "Multiply-subtract, lane", single_opcode Vmls_lane;
+ "Vector multiply by scalar", single_opcode Vmul_n;
+ "Vector long multiply by scalar", single_opcode Vmull_n;
+ "Vector saturating doubling long multiply by scalar",
+ single_opcode Vqdmull_n;
+ "Vector saturating doubling multiply high by scalar",
+ single_opcode Vqdmulh_n;
+ "Vector multiply-accumulate by scalar", single_opcode Vmla_n;
+ "Vector multiply-subtract by scalar", single_opcode Vmls_n;
+ "Vector extract", single_opcode Vext;
+ "Reverse elements", multiple_opcodes [Vrev64; Vrev32; Vrev16];
+ "Bit selection", single_opcode Vbsl;
+ "Transpose elements", single_opcode Vtrn;
+ "Zip elements", single_opcode Vzip;
+ "Unzip elements", single_opcode Vuzp;
+ "Element/structure loads, VLD1 variants", ldx_opcode 1;
+ "Element/structure stores, VST1 variants", stx_opcode 1;
+ "Element/structure loads, VLD2 variants", ldx_opcode 2;
+ "Element/structure stores, VST2 variants", stx_opcode 2;
+ "Element/structure loads, VLD3 variants", ldx_opcode 3;
+ "Element/structure stores, VST3 variants", stx_opcode 3;
+ "Element/structure loads, VLD4 variants", ldx_opcode 4;
+ "Element/structure stores, VST4 variants", stx_opcode 4;
+ "Logical operations (AND)", single_opcode Vand;
+ "Logical operations (OR)", single_opcode Vorr;
+ "Logical operations (exclusive OR)", single_opcode Veor;
+ "Logical operations (AND-NOT)", single_opcode Vbic;
+ "Logical operations (OR-NOT)", single_opcode Vorn;
+ "Reinterpret casts", single_opcode Vreinterp ]
+
+(* Given an intrinsic shape, produce a string to document the corresponding
+ operand shapes. *)
+let rec analyze_shape shape =
+ let rec n_things n thing =
+ match n with
+ 0 -> []
+ | n -> thing :: (n_things (n - 1) thing)
+ in
+ let rec analyze_shape_elt reg_no elt =
+ match elt with
+ Dreg -> "@var{d" ^ (string_of_int reg_no) ^ "}"
+ | Qreg -> "@var{q" ^ (string_of_int reg_no) ^ "}"
+ | Corereg -> "@var{r" ^ (string_of_int reg_no) ^ "}"
+ | Immed -> "#@var{0}"
+ | VecArray (1, elt) ->
+ let elt_regexp = analyze_shape_elt 0 elt in
+ "@{" ^ elt_regexp ^ "@}"
+ | VecArray (n, elt) ->
+ let rec f m =
+ match m with
+ 0 -> []
+ | m -> (analyze_shape_elt (m - 1) elt) :: (f (m - 1))
+ in
+ let ops = List.rev (f n) in
+ "@{" ^ (commas (fun x -> x) ops "") ^ "@}"
+ | (PtrTo elt | CstPtrTo elt) ->
+ "[" ^ (analyze_shape_elt reg_no elt) ^ "]"
+ | Element_of_dreg -> (analyze_shape_elt reg_no Dreg) ^ "[@var{0}]"
+ | Element_of_qreg -> (analyze_shape_elt reg_no Qreg) ^ "[@var{0}]"
+ | All_elements_of_dreg -> (analyze_shape_elt reg_no Dreg) ^ "[]"
+ in
+ match shape with
+ All (n, elt) -> commas (analyze_shape_elt 0) (n_things n elt) ""
+ | Long -> (analyze_shape_elt 0 Qreg) ^ ", " ^ (analyze_shape_elt 0 Dreg) ^
+ ", " ^ (analyze_shape_elt 0 Dreg)
+ | Long_noreg elt -> (analyze_shape_elt 0 elt) ^ ", " ^
+ (analyze_shape_elt 0 elt)
+ | Wide -> (analyze_shape_elt 0 Qreg) ^ ", " ^ (analyze_shape_elt 0 Qreg) ^
+ ", " ^ (analyze_shape_elt 0 Dreg)
+ | Wide_noreg elt -> analyze_shape (Long_noreg elt)
+ | Narrow -> (analyze_shape_elt 0 Dreg) ^ ", " ^ (analyze_shape_elt 0 Qreg) ^
+ ", " ^ (analyze_shape_elt 0 Qreg)
+ | Use_operands elts -> commas (analyze_shape_elt 0) (Array.to_list elts) ""
+ | By_scalar Dreg ->
+ analyze_shape (Use_operands [| Dreg; Dreg; Element_of_dreg |])
+ | By_scalar Qreg ->
+ analyze_shape (Use_operands [| Qreg; Qreg; Element_of_dreg |])
+ | By_scalar _ -> assert false
+ | Wide_lane ->
+ analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |])
+ | Wide_scalar ->
+ analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |])
+ | Pair_result elt ->
+ let elt_regexp = analyze_shape_elt 0 elt in
+ let elt_regexp' = analyze_shape_elt 1 elt in
+ elt_regexp ^ ", " ^ elt_regexp'
+ | Unary_scalar _ -> "FIXME Unary_scalar"
+ | Binary_imm elt -> analyze_shape (Use_operands [| elt; elt; Immed |])
+ | Narrow_imm -> analyze_shape (Use_operands [| Dreg; Qreg; Immed |])
+ | Long_imm -> analyze_shape (Use_operands [| Qreg; Dreg; Immed |])
+
+(* Document a single intrinsic. *)
+let describe_intrinsic first chan
+ (elt_ty, (_, features, shape, name, munge, _)) =
+ let c_arity, new_elt_ty = munge shape elt_ty in
+ let c_types = strings_of_arity c_arity in
+ Printf.fprintf chan "@itemize @bullet\n";
+ let item_code = if first then "@item" else "@itemx" in
+ Printf.fprintf chan "%s %s %s_%s (" item_code (List.hd c_types)
+ (intrinsic_name name) (string_of_elt elt_ty);
+ Printf.fprintf chan "%s)\n" (commas (fun ty -> ty) (List.tl c_types) "");
+ if not (List.exists (fun feature -> feature = No_op) features) then
+ begin
+ let print_one_insn name =
+ Printf.fprintf chan "@code{";
+ let no_suffix = (new_elt_ty = NoElts) in
+ let name_with_suffix =
+ if no_suffix then name
+ else name ^ "." ^ (string_of_elt_dots new_elt_ty)
+ in
+ let possible_operands = analyze_all_shapes features shape
+ analyze_shape
+ in
+ let rec print_one_possible_operand op =
+ Printf.fprintf chan "%s %s}" name_with_suffix op
+ in
+ (* If the intrinsic expands to multiple instructions, we assume
+ they are all of the same form. *)
+ print_one_possible_operand (List.hd possible_operands)
+ in
+ let rec print_insns names =
+ match names with
+ [] -> ()
+ | [name] -> print_one_insn name
+ | name::names -> (print_one_insn name;
+ Printf.fprintf chan " @emph{or} ";
+ print_insns names)
+ in
+ let insn_names = get_insn_names features name in
+ Printf.fprintf chan "@*@emph{Form of expected instruction(s):} ";
+ print_insns insn_names;
+ Printf.fprintf chan "\n"
+ end;
+ Printf.fprintf chan "@end itemize\n";
+ Printf.fprintf chan "\n\n"
+
+(* Document a group of intrinsics. *)
+let document_group chan (group_title, group_extractor) =
+ (* Extract the rows in question from the ops table and then turn them
+ into a list of intrinsics. *)
+ let intrinsics =
+ List.fold_left (fun got_so_far ->
+ fun row ->
+ match row with
+ (_, _, _, _, _, elt_tys) ->
+ List.fold_left (fun got_so_far' ->
+ fun elt_ty ->
+ (elt_ty, row) :: got_so_far')
+ got_so_far elt_tys
+ ) [] (group_extractor ())
+ in
+ (* Emit the title for this group. *)
+ Printf.fprintf chan "@subsubsection %s\n\n" group_title;
+ (* Emit a description of each intrinsic. *)
+ List.iter (describe_intrinsic true chan) intrinsics;
+ (* Close this group. *)
+ Printf.fprintf chan "\n\n"
+
+let gnu_header chan =
+ List.iter (fun s -> Printf.fprintf chan "%s\n" s) [
+ "@c Copyright (C) 2006 Free Software Foundation, Inc.";
+ "@c This is part of the GCC manual.";
+ "@c For copying conditions, see the file gcc.texi.";
+ "";
+ "@c This file is generated automatically using gcc/config/arm/neon-docgen.ml";
+ "@c Please do not edit manually."]
+
+(* Program entry point. *)
+let _ =
+ if Array.length Sys.argv <> 2 then
+ failwith "Usage: neon-docgen <output filename>"
+ else
+ let file = Sys.argv.(1) in
+ try
+ let chan = open_out file in
+ gnu_header chan;
+ List.iter (document_group chan) intrinsic_groups;
+ close_out chan
+ with Sys_error sys ->
+ failwith ("Could not create output file " ^ file ^ ": " ^ sys)
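
The extractor functions in this file (single_opcode, tbl_opcode, tbx_opcode and
the fold inside document_group) all rely on the same List.fold_left idiom: walk
the ops table, keep the rows whose opcode matches, then expand each surviving
row's element-type list into (element type, row) pairs.  The following
standalone sketch shows that idiom in isolation; the opcode and element types
are invented stand-ins, not the definitions from neon.ml.

type opcode = Vadd | Vmul
type elt = S8 | S16 | F32

(* One row per opcode: opcode, intrinsic name, supported element types. *)
let ops = [ (Vadd, "vadd", [S8; S16; F32]);
            (Vmul, "vmul", [S8; S16]) ]

(* Keep only the rows carrying the requested opcode, as single_opcode does. *)
let rows_for desired_opcode =
  List.fold_left (fun got_so_far ->
                    fun row ->
                      match row with
                        (opcode, _, _) when opcode = desired_opcode ->
                          row :: got_so_far
                      | _ -> got_so_far)
                 [] ops

(* Expand each row into one (element type, row) pair per element type, as the
   fold in document_group does. *)
let intrinsics_of_rows rows =
  List.fold_left (fun got_so_far ->
                    fun ((_, _, elt_tys) as row) ->
                      List.fold_left (fun got_so_far' ->
                                        fun elt_ty ->
                                          (elt_ty, row) :: got_so_far')
                                     got_so_far elt_tys)
                 [] rows

let () =
  let string_of_elt = function S8 -> "s8" | S16 -> "s16" | F32 -> "f32" in
  List.iter (fun (elt_ty, (_, name, _)) ->
               Printf.printf "%s_%s\n" name (string_of_elt elt_ty))
            (intrinsics_of_rows (rows_for Vadd))
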
Added: llvm-gcc-4.2/trunk/gcc/config/arm/neon-gen.ml
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/neon-gen.ml?rev=76781&view=auto
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/neon-gen.ml (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/neon-gen.ml Wed Jul 22 15:36:27 2009
@@ -0,0 +1,424 @@
+(* APPLE LOCAL file v7 support. Merge from Codesourcery *)
+(* Auto-generate ARM Neon intrinsics header file.
+ Copyright (C) 2006, 2007 Free Software Foundation, Inc.
+ Contributed by CodeSourcery.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 2, or (at your option) any later
+ version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING. If not, write to the Free
+ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ This is an O'Caml program. The O'Caml compiler is available from:
+
+ http://caml.inria.fr/
+
+ Or from your favourite OS's friendly packaging system. Tested with version
+ 3.09.2, though other versions will probably work too.
+
+ Compile with:
+ ocamlc -c neon.ml
+ ocamlc -o neon-gen neon.cmo neon-gen.ml
+
+ Run with:
+ ./neon-gen > arm_neon.h
+*)
+
+open Neon
+
+(* The format codes used in the following functions are documented at:
+ http://caml.inria.fr/pub/docs/manual-ocaml/libref/Format.html\
+ #6_printflikefunctionsforprettyprinting
+ (one line, remove the backslash.)
+*)
+
+(* Following functions can be used to approximate GNU indentation style. *)
+let start_function () =
+ Format.printf "@[<v 0>";
+ ref 0
+
+let end_function nesting =
+ match !nesting with
+ 0 -> Format.printf "@;@;@]"
+ | _ -> failwith ("Bad nesting (ending function at level "
+ ^ (string_of_int !nesting) ^ ")")
+
+let open_braceblock nesting =
+ begin match !nesting with
+ 0 -> Format.printf "@,@<0>{@[<v 2>@,"
+ | _ -> Format.printf "@,@[<v 2> @<0>{@[<v 2>@,"
+ end;
+ incr nesting
+
+let close_braceblock nesting =
+ decr nesting;
+ match !nesting with
+ 0 -> Format.printf "@]@,@<0>}"
+ | _ -> Format.printf "@]@,@<0>}@]"
+
+let print_function arity fnname body =
+ let ffmt = start_function () in
+ Format.printf "__extension__ static __inline ";
+ let inl = "__attribute__ ((__always_inline__))" in
+ begin match arity with
+ Arity0 ret ->
+ Format.printf "%s %s@,%s (void)" (string_of_vectype ret) inl fnname
+ | Arity1 (ret, arg0) ->
+ Format.printf "%s %s@,%s (%s __a)" (string_of_vectype ret) inl fnname
+ (string_of_vectype arg0)
+ | Arity2 (ret, arg0, arg1) ->
+ Format.printf "%s %s@,%s (%s __a, %s __b)"
+ (string_of_vectype ret) inl fnname (string_of_vectype arg0)
+ (string_of_vectype arg1)
+ | Arity3 (ret, arg0, arg1, arg2) ->
+ Format.printf "%s %s@,%s (%s __a, %s __b, %s __c)"
+ (string_of_vectype ret) inl fnname (string_of_vectype arg0)
+ (string_of_vectype arg1) (string_of_vectype arg2)
+ | Arity4 (ret, arg0, arg1, arg2, arg3) ->
+ Format.printf "%s %s@,%s (%s __a, %s __b, %s __c, %s __d)"
+ (string_of_vectype ret) inl fnname (string_of_vectype arg0)
+ (string_of_vectype arg1) (string_of_vectype arg2)
+ (string_of_vectype arg3)
+ end;
+ open_braceblock ffmt;
+ let rec print_lines = function
+ [] -> ()
+ | [line] -> Format.printf "%s" line
+ | line::lines -> Format.printf "%s@," line; print_lines lines in
+ print_lines body;
+ close_braceblock ffmt;
+ end_function ffmt
+
+let return_by_ptr features = List.mem ReturnPtr features
+
+let union_string num elts base =
+ let itype = inttype_for_array num elts in
+ let iname = string_of_inttype itype
+ and sname = string_of_vectype (T_arrayof (num, elts)) in
+ Printf.sprintf "union { %s __i; %s __o; } %s" sname iname base
+
+let rec signed_ctype = function
+ T_uint8x8 | T_poly8x8 -> T_int8x8
+ | T_uint8x16 | T_poly8x16 -> T_int8x16
+ | T_uint16x4 | T_poly16x4 -> T_int16x4
+ | T_uint16x8 | T_poly16x8 -> T_int16x8
+ | T_uint32x2 -> T_int32x2
+ | T_uint32x4 -> T_int32x4
+ | T_uint64x1 -> T_int64x1
+ | T_uint64x2 -> T_int64x2
+ (* Cast to types defined by mode in arm.c, not random types pulled in from
+ the <stdint.h> header in use. This fixes incompatible pointer errors when
+ compiling with C++. *)
+ | T_uint8 | T_int8 -> T_intQI
+ | T_uint16 | T_int16 -> T_intHI
+ | T_uint32 | T_int32 -> T_intSI
+ | T_uint64 | T_int64 -> T_intDI
+ | T_poly8 -> T_intQI
+ | T_poly16 -> T_intHI
+ | T_arrayof (n, elt) -> T_arrayof (n, signed_ctype elt)
+ | T_ptrto elt -> T_ptrto (signed_ctype elt)
+ | T_const elt -> T_const (signed_ctype elt)
+ | x -> x
+
+let add_cast ctype cval =
+ let stype = signed_ctype ctype in
+ if ctype <> stype then
+ Printf.sprintf "(%s) %s" (string_of_vectype stype) cval
+ else
+ cval
+
+let cast_for_return to_ty = "(" ^ (string_of_vectype to_ty) ^ ")"
+
+(* Return a tuple of a list of declarations to go at the start of the function,
+ and a list of statements needed to return THING. *)
+let return arity return_by_ptr thing =
+ match arity with
+ Arity0 (ret) | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
+ | Arity4 (ret, _, _, _, _) ->
+ match ret with
+ T_arrayof (num, vec) ->
+ if return_by_ptr then
+ let sname = string_of_vectype ret in
+ [Printf.sprintf "%s __rv;" sname],
+ [thing ^ ";"; "return __rv;"]
+ else
+ let uname = union_string num vec "__rv" in
+ [uname ^ ";"], ["__rv.__o = " ^ thing ^ ";"; "return __rv.__i;"]
+ | T_void -> [], [thing ^ ";"]
+ | _ ->
+ [], ["return " ^ (cast_for_return ret) ^ thing ^ ";"]
+
+let rec element_type ctype =
+ match ctype with
+ T_arrayof (_, v) -> element_type v
+ | _ -> ctype
+
+let params return_by_ptr ps =
+ let pdecls = ref [] in
+ let ptype t p =
+ match t with
+ T_arrayof (num, elts) ->
+ let uname = union_string num elts (p ^ "u") in
+ let decl = Printf.sprintf "%s = { %s };" uname p in
+ pdecls := decl :: !pdecls;
+ p ^ "u.__o"
+ | _ -> add_cast t p in
+ let plist = match ps with
+ Arity0 _ -> []
+ | Arity1 (_, t1) -> [ptype t1 "__a"]
+ | Arity2 (_, t1, t2) -> [ptype t1 "__a"; ptype t2 "__b"]
+ | Arity3 (_, t1, t2, t3) -> [ptype t1 "__a"; ptype t2 "__b"; ptype t3 "__c"]
+ | Arity4 (_, t1, t2, t3, t4) ->
+ [ptype t1 "__a"; ptype t2 "__b"; ptype t3 "__c"; ptype t4 "__d"] in
+ match ps with
+ Arity0 ret | Arity1 (ret, _) | Arity2 (ret, _, _) | Arity3 (ret, _, _, _)
+ | Arity4 (ret, _, _, _, _) ->
+ if return_by_ptr then
+ !pdecls, add_cast (T_ptrto (element_type ret)) "&__rv.val[0]" :: plist
+ else
+ !pdecls, plist
+
+let modify_params features plist =
+ let is_flipped =
+ List.exists (function Flipped _ -> true | _ -> false) features in
+ if is_flipped then
+ match plist with
+ [ a; b ] -> [ b; a ]
+ | _ ->
+ failwith ("Don't know how to flip args " ^ (String.concat ", " plist))
+ else
+ plist
+
+(* !!! Decide whether to add an extra information word based on the shape
+ form. *)
+let extra_word shape features paramlist bits =
+ let use_word =
+ match shape with
+ All _ | Long | Long_noreg _ | Wide | Wide_noreg _ | Narrow
+ | By_scalar _ | Wide_scalar | Wide_lane | Binary_imm _ | Long_imm
+ | Narrow_imm -> true
+ | _ -> List.mem InfoWord features
+ in
+ if use_word then
+ paramlist @ [string_of_int bits]
+ else
+ paramlist
+
+(* Bit 0 represents signed (1) vs unsigned (0), or float (1) vs poly (0).
+ Bit 1 represents rounding (1) vs none (0)
+ Bit 2 represents floats & polynomials (1), or ordinary integers (0). *)
+let infoword_value elttype features =
+ let bits02 =
+ match elt_class elttype with
+ Signed | ConvClass (Signed, _) | ConvClass (_, Signed) -> 0b001
+ | Poly -> 0b100
+ | Float -> 0b101
+ | _ -> 0b000
+ and rounding_bit = if List.mem Rounding features then 0b010 else 0b000 in
+ bits02 lor rounding_bit
+
+(* "Cast" type operations will throw an exception in mode_of_elt (actually in
+ elt_width, called from there). Deal with that here, and generate a suffix
+ with multiple modes (<to><from>). *)
+let rec mode_suffix elttype shape =
+ try
+ let mode = mode_of_elt elttype shape in
+ string_of_mode mode
+ with MixedMode (dst, src) ->
+ let dstmode = mode_of_elt dst shape
+ and srcmode = mode_of_elt src shape in
+ string_of_mode dstmode ^ string_of_mode srcmode
+
+let print_variant opcode features shape name (ctype, asmtype, elttype) =
+ let bits = infoword_value elttype features in
+ let modesuf = mode_suffix elttype shape in
+ let return_by_ptr = return_by_ptr features in
+ let pdecls, paramlist = params return_by_ptr ctype in
+ let paramlist' = modify_params features paramlist in
+ let paramlist'' = extra_word shape features paramlist' bits in
+ let parstr = String.concat ", " paramlist'' in
+ let builtin = Printf.sprintf "__builtin_neon_%s%s (%s)"
+ (builtin_name features name) modesuf parstr in
+ let rdecls, stmts = return ctype return_by_ptr builtin in
+ let body = pdecls @ rdecls @ stmts
+ and fnname = (intrinsic_name name) ^ "_" ^ (string_of_elt elttype) in
+ print_function ctype fnname body
+
+(* When this function processes the element types in the ops table, it rewrites
+   them as a list of tuples (a,b,c):
+ a : C type as an "arity", e.g. Arity1 (T_poly8x8, T_poly8x8)
+ b : Asm type : a single, processed element type, e.g. P16. This is the
+ type which should be attached to the asm opcode.
+ c : Variant type : the unprocessed type for this variant (e.g. in add
+ instructions which don't care about the sign, b might be i16 and c
+ might be s16.)
+*)
+
+let print_op (opcode, features, shape, name, munge, types) =
+ let sorted_types = List.sort compare types in
+ let munged_types = List.map
+ (fun elt -> let c, asm = munge shape elt in c, asm, elt) sorted_types in
+ List.iter
+ (fun variant -> print_variant opcode features shape name variant)
+ munged_types
+
+let print_ops ops =
+ List.iter print_op ops
+
+(* Output type definitions. Table entries are:
+ cbase : "C" name for the type.
+ abase : "ARM" base name for the type (i.e. int in int8x8_t).
+ esize : element size.
+ enum : element count.
+   I don't think we can really distinguish between polynomial types and
+   integer types in the C type system, which may allow the user to make
+   mistakes without warnings from the compiler.
+ FIXME: It's probably better to use stdint.h names here.
+*)
+
+let deftypes () =
+ let typeinfo = [
+ (* Doubleword vector types. *)
+ "__builtin_neon_qi", "int", 8, 8;
+ "__builtin_neon_hi", "int", 16, 4;
+ "__builtin_neon_si", "int", 32, 2;
+ "__builtin_neon_di", "int", 64, 1;
+ "__builtin_neon_sf", "float", 32, 2;
+ "__builtin_neon_poly8", "poly", 8, 8;
+ "__builtin_neon_poly16", "poly", 16, 4;
+ "__builtin_neon_uqi", "uint", 8, 8;
+ "__builtin_neon_uhi", "uint", 16, 4;
+ "__builtin_neon_usi", "uint", 32, 2;
+ "__builtin_neon_udi", "uint", 64, 1;
+
+ (* Quadword vector types. *)
+ "__builtin_neon_qi", "int", 8, 16;
+ "__builtin_neon_hi", "int", 16, 8;
+ "__builtin_neon_si", "int", 32, 4;
+ "__builtin_neon_di", "int", 64, 2;
+ "__builtin_neon_sf", "float", 32, 4;
+ "__builtin_neon_poly8", "poly", 8, 16;
+ "__builtin_neon_poly16", "poly", 16, 8;
+ "__builtin_neon_uqi", "uint", 8, 16;
+ "__builtin_neon_uhi", "uint", 16, 8;
+ "__builtin_neon_usi", "uint", 32, 4;
+ "__builtin_neon_udi", "uint", 64, 2
+ ] in
+ List.iter
+ (fun (cbase, abase, esize, enum) ->
+ let attr =
+ match enum with
+ 1 -> ""
+ | _ -> Printf.sprintf "\t__attribute__ ((__vector_size__ (%d)))"
+ (esize * enum / 8) in
+ Format.printf "typedef %s %s%dx%d_t%s;@\n" cbase abase esize enum attr)
+ typeinfo;
+ Format.print_newline ();
+ (* Extra types not in <stdint.h>. *)
+ Format.printf "typedef __builtin_neon_sf float32_t;\n";
+ Format.printf "typedef __builtin_neon_poly8 poly8_t;\n";
+ Format.printf "typedef __builtin_neon_poly16 poly16_t;\n"
+
+(* Output structs containing arrays, for load & store instructions etc. *)
+
+let arrtypes () =
+ let typeinfo = [
+ "int", 8; "int", 16;
+ "int", 32; "int", 64;
+ "uint", 8; "uint", 16;
+ "uint", 32; "uint", 64;
+ "float", 32; "poly", 8;
+ "poly", 16
+ ] in
+ let writestruct elname elsize regsize arrsize =
+ let elnum = regsize / elsize in
+ let structname =
+ Printf.sprintf "%s%dx%dx%d_t" elname elsize elnum arrsize in
+ let sfmt = start_function () in
+ Format.printf "typedef struct %s" structname;
+ open_braceblock sfmt;
+ Format.printf "%s%dx%d_t val[%d];" elname elsize elnum arrsize;
+ close_braceblock sfmt;
+ Format.printf " %s;" structname;
+ end_function sfmt;
+ in
+ for n = 2 to 4 do
+ List.iter
+ (fun (elname, elsize) ->
+ writestruct elname elsize 64 n;
+ writestruct elname elsize 128 n)
+ typeinfo
+ done
+
+let print_lines = List.iter (fun s -> Format.printf "%s@\n" s)
+
+(* Do it. *)
+
+let _ =
+ print_lines [
+"/* ARM NEON intrinsics include file. This file is generated automatically";
+" using neon-gen.ml. Please do not edit manually.";
+"";
+" Copyright (C) 2006, 2007 Free Software Foundation, Inc.";
+" Contributed by CodeSourcery.";
+"";
+" This file is part of GCC.";
+"";
+" GCC is free software; you can redistribute it and/or modify it";
+" under the terms of the GNU General Public License as published";
+" by the Free Software Foundation; either version 2, or (at your";
+" option) any later version.";
+"";
+" GCC is distributed in the hope that it will be useful, but WITHOUT";
+" ANY WARRANTY; without even the implied warranty of MERCHANTABILITY";
+" or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public";
+" License for more details.";
+"";
+" You should have received a copy of the GNU General Public License";
+" along with GCC; see the file COPYING. If not, write to the";
+" Free Software Foundation, 51 Franklin Street, Fifth Floor, Boston,";
+" MA 02110-1301, USA. */";
+"";
+"/* As a special exception, if you include this header file into source";
+" files compiled by GCC, this header file does not by itself cause";
+" the resulting executable to be covered by the GNU General Public";
+" License. This exception does not however invalidate any other";
+" reasons why the executable file might be covered by the GNU General";
+" Public License. */";
+"";
+"#ifndef _GCC_ARM_NEON_H";
+"#define _GCC_ARM_NEON_H 1";
+"";
+"#ifndef __ARM_NEON__";
+"#error You must enable NEON instructions (e.g. -mfloat-abi=softfp -mfpu=neon) to use arm_neon.h";
+"#else";
+"";
+"#ifdef __cplusplus";
+"extern \"C\" {";
+"#endif";
+"";
+"#include <stdint.h>";
+""];
+ deftypes ();
+ arrtypes ();
+ Format.print_newline ();
+ print_ops ops;
+ Format.print_newline ();
+ print_ops reinterp;
+ print_lines [
+"#ifdef __cplusplus";
+"}";
+"#endif";
+"#endif";
+"#endif"]
Added: llvm-gcc-4.2/trunk/gcc/config/arm/neon-schedgen.ml
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/neon-schedgen.ml?rev=76781&view=auto
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/neon-schedgen.ml (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/neon-schedgen.ml Wed Jul 22 15:36:27 2009
@@ -0,0 +1,498 @@
+(* APPLE LOCAL file v7 support. Merge from Codesourcery *)
+(* Emission of the core of the Cortex-A8 NEON scheduling description.
+ Copyright (C) 2007 Free Software Foundation, Inc.
+ Contributed by CodeSourcery.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 2, or (at your option) any later
+ version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING. If not, write to the Free
+ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+*)
+
+(* This scheduling description generator works as follows.
+ - Each group of instructions has source and destination requirements
+ specified. The source requirements may be specified using
+ Source (the stage at which all source operands not otherwise
+ described are read), Source_m (the stage at which Rm operands are
+ read), Source_n (likewise for Rn) and Source_d (likewise for Rd).
+ - For each group of instructions the earliest stage where a source
+ operand may be required is calculated.
+ - Each group of instructions is selected in turn as a producer.
+ The latencies between this group and every other group are then
+ calculated, yielding up to four values for each combination:
+ 1. Producer -> consumer Rn latency
+ 2. Producer -> consumer Rm latency
+ 3. Producer -> consumer Rd (as a source) latency
+ 4. Producer -> consumer worst-case latency.
+ Value 4 is calculated from the destination availability requirements
+ of the consumer and the earliest source availability requirements
+ of the producer.
+ - The largest Value 4 calculated for the current producer is the
+     worst-case latency, L, for that instruction group. This value is written
+ out in a define_insn_reservation for the producer group.
+ - For each producer and consumer pair, the latencies calculated above
+ are collated. The average (of up to four values) is calculated and
+ if this average is different from the worst-case latency, an
+ unguarded define_bypass construction is issued for that pair.
+ (For each pair only one define_bypass construction will be emitted,
+ and at present we do not emit specific guards.)
+*)
+
+open Utils
+
+let n1 = 1 and n2 = 2 and n3 = 3 and n4 = 4 and n5 = 5 and n6 = 6
+ and n7 = 7 and n8 = 8 and n9 = 9
+
+type availability = Source of int
+ | Source_n of int
+ | Source_m of int
+ | Source_d of int
+ | Dest of int
+ | Dest_n_after of int * int
+
+type guard = Guard_none | Guard_only_m | Guard_only_n | Guard_only_d
+
+(* Reservation behaviours. All but the last row here correspond to one
+ pipeline each. Each constructor will correspond to one
+ define_reservation. *)
+type reservation =
+ Mul | Mul_2cycle | Mul_4cycle
+| Shift | Shift_2cycle
+| ALU | ALU_2cycle
+| Fmul | Fmul_2cycle
+| Fadd | Fadd_2cycle
+(* | VFP *)
+| Permute of int
+| Ls of int
+| Fmul_then_fadd | Fmul_then_fadd_2
+
+(* This table must be kept as short as possible by conflating
+ entries with the same availability behaviour.
+
+ First components: instruction group names
+ Second components: availability requirements, in the order in which
+ they should appear in the comments in the .md file.
+ Third components: reservation info
+*)
+let availability_table = [
+ (* NEON integer ALU instructions. *)
+ (* vbit vbif vbsl vorr vbic vnot vcls vclz vcnt vadd vand vorr
+ veor vbic vorn ddd qqq *)
+ "neon_int_1", [Source n2; Dest n3], ALU;
+ (* vadd vsub qqd vsub ddd qqq *)
+ "neon_int_2", [Source_m n1; Source_n n2; Dest n3], ALU;
+ (* vsum vneg dd qq vadd vsub qdd *)
+ "neon_int_3", [Source n1; Dest n3], ALU;
+  (* vabs vceqz vcgez vcgtz vclez vcltz vadh vradh vsbh vrsbh dqq *)
+ (* vhadd vrhadd vqadd vtst ddd qqq *)
+ "neon_int_4", [Source n2; Dest n4], ALU;
+ (* vabd qdd vhsub vqsub vabd vceq vcge vcgt vmax vmin vfmx vfmn ddd ddd *)
+ "neon_int_5", [Source_m n1; Source_n n2; Dest n4], ALU;
+ (* vqneg vqabs dd qq *)
+ "neon_vqneg_vqabs", [Source n1; Dest n4], ALU;
+ (* vmov vmvn *)
+ "neon_vmov", [Dest n3], ALU;
+ (* vaba *)
+ "neon_vaba", [Source_n n2; Source_m n1; Source_d n3; Dest n6], ALU;
+ "neon_vaba_qqq",
+ [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], ALU_2cycle;
+ (* vsma *)
+ "neon_vsma", [Source_m n1; Source_d n3; Dest n6], ALU;
+
+ (* NEON integer multiply instructions. *)
+ (* vmul, vqdmlh, vqrdmlh *)
+ (* vmul, vqdmul, qdd 16/8 long 32/16 long *)
+ "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long", [Source n2; Dest n6], Mul;
+ "neon_mul_qqq_8_16_32_ddd_32", [Source n2; Dest_n_after (1, n6)], Mul_2cycle;
+ (* vmul, vqdmul again *)
+ "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar",
+ [Source_n n2; Source_m n1; Dest_n_after (1, n6)], Mul_2cycle;
+ (* vmla, vmls *)
+ "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long",
+ [Source_n n2; Source_m n2; Source_d n3; Dest n6], Mul;
+ "neon_mla_qqq_8_16",
+ [Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
+ "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long",
+ [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n6)], Mul_2cycle;
+ "neon_mla_qqq_32_qqd_32_scalar",
+ [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (3, n6)], Mul_4cycle;
+ (* vmul, vqdmulh, vqrdmulh *)
+ (* vmul, vqdmul *)
+ "neon_mul_ddd_16_scalar_32_16_long_scalar",
+ [Source_n n2; Source_m n1; Dest n6], Mul;
+ "neon_mul_qqd_32_scalar",
+ [Source_n n2; Source_m n1; Dest_n_after (3, n6)], Mul_4cycle;
+ (* vmla, vmls *)
+ (* vmla, vmla, vqdmla, vqdmls *)
+ "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar",
+ [Source_n n2; Source_m n1; Source_d n3; Dest n6], Mul;
+
+ (* NEON integer shift instructions. *)
+ (* vshr/vshl immediate, vshr_narrow, vshl_vmvh, vsli_vsri_ddd *)
+ "neon_shift_1", [Source n1; Dest n3], Shift;
+ (* vqshl, vrshr immediate; vqshr, vqmov, vrshr, vqrshr narrow;
+ vqshl_vrshl_vqrshl_ddd *)
+ "neon_shift_2", [Source n1; Dest n4], Shift;
+ (* vsli, vsri and vshl for qqq *)
+ "neon_shift_3", [Source n1; Dest_n_after (1, n3)], Shift_2cycle;
+ "neon_vshl_ddd", [Source n1; Dest n1], Shift;
+ "neon_vqshl_vrshl_vqrshl_qqq", [Source n1; Dest_n_after (1, n4)],
+ Shift_2cycle;
+ "neon_vsra_vrsra", [Source_m n1; Source_d n3; Dest n6], Shift;
+
+ (* NEON floating-point instructions. *)
+ (* vadd, vsub, vabd, vmul, vceq, vcge, vcgt, vcage, vcagt, vmax, vmin *)
+ (* vabs, vneg, vceqz, vcgez, vcgtz, vclez, vcltz, vrecpe, vrsqrte, vcvt *)
+ "neon_fp_vadd_ddd_vabs_dd", [Source n2; Dest n5], Fadd;
+ "neon_fp_vadd_qqq_vabs_qq", [Source n2; Dest_n_after (1, n5)],
+ Fadd_2cycle;
+  (* vsum, vfmx, vfmn *)
+ "neon_fp_vsum", [Source n1; Dest n5], Fadd;
+ "neon_fp_vmul_ddd", [Source_n n2; Source_m n1; Dest n5], Fmul;
+ "neon_fp_vmul_qqd", [Source_n n2; Source_m n1; Dest_n_after (1, n5)],
+ Fmul_2cycle;
+ (* vmla, vmls *)
+ "neon_fp_vmla_ddd",
+ [Source_n n2; Source_m n2; Source_d n3; Dest n9], Fmul_then_fadd;
+ "neon_fp_vmla_qqq",
+ [Source_n n2; Source_m n2; Source_d n3; Dest_n_after (1, n9)],
+ Fmul_then_fadd_2;
+ "neon_fp_vmla_ddd_scalar",
+ [Source_n n2; Source_m n1; Source_d n3; Dest n9], Fmul_then_fadd;
+ "neon_fp_vmla_qqq_scalar",
+ [Source_n n2; Source_m n1; Source_d n3; Dest_n_after (1, n9)],
+ Fmul_then_fadd_2;
+ "neon_fp_vrecps_vrsqrts_ddd", [Source n2; Dest n9], Fmul_then_fadd;
+ "neon_fp_vrecps_vrsqrts_qqq", [Source n2; Dest_n_after (1, n9)],
+ Fmul_then_fadd_2;
+
+ (* NEON byte permute instructions. *)
+ (* vmov; vtrn and vswp for dd; vzip for dd; vuzp for dd; vrev; vext for dd *)
+ "neon_bp_simple", [Source n1; Dest n2], Permute 1;
+ (* vswp for qq; vext for qqq; vtbl with {Dn} or {Dn, Dn1};
+ similarly for vtbx *)
+ "neon_bp_2cycle", [Source n1; Dest_n_after (1, n2)], Permute 2;
+ (* all the rest *)
+ "neon_bp_3cycle", [Source n1; Dest_n_after (2, n2)], Permute 3;
+
+ (* NEON load/store instructions. *)
+ "neon_ldr", [Dest n1], Ls 1;
+ "neon_str", [Source n1], Ls 1;
+ "neon_vld1_1_2_regs", [Dest_n_after (1, n1)], Ls 2;
+ "neon_vld1_3_4_regs", [Dest_n_after (2, n1)], Ls 3;
+ "neon_vld2_2_regs_vld1_vld2_all_lanes", [Dest_n_after (1, n2)], Ls 2;
+ "neon_vld2_4_regs", [Dest_n_after (2, n2)], Ls 3;
+ "neon_vld3_vld4", [Dest_n_after (3, n2)], Ls 4;
+ "neon_vst1_1_2_regs_vst2_2_regs", [Source n1], Ls 2;
+ "neon_vst1_3_4_regs", [Source n1], Ls 3;
+ "neon_vst2_4_regs_vst3_vst4", [Source n1], Ls 4;
+ "neon_vst3_vst4", [Source n1], Ls 4;
+ "neon_vld1_vld2_lane", [Source n1; Dest_n_after (2, n2)], Ls 3;
+ "neon_vld3_vld4_lane", [Source n1; Dest_n_after (4, n2)], Ls 5;
+ "neon_vst1_vst2_lane", [Source n1], Ls 2;
+ "neon_vst3_vst4_lane", [Source n1], Ls 3;
+ "neon_vld3_vld4_all_lanes", [Dest_n_after (1, n2)], Ls 3;
+
+ (* NEON register transfer instructions. *)
+ "neon_mcr", [Dest n2], Permute 1;
+ "neon_mcr_2_mcrr", [Dest n2], Permute 2;
+ (* MRC instructions are in the .tpl file. *)
+]
+
+(* Augment the tuples in the availability table with an extra component
+ that describes the earliest stage where a source operand may be
+ required. (It is also possible that an entry in the table has no
+ source requirements.) *)
+let calculate_sources =
+ List.map (fun (name, avail, res) ->
+ let earliest_stage =
+ List.fold_left
+ (fun cur -> fun info ->
+ match info with
+ Source stage
+ | Source_n stage
+ | Source_m stage
+ | Source_d stage ->
+ (match cur with
+ None -> Some stage
+ | Some stage' when stage < stage' -> Some stage
+ | _ -> cur)
+ | _ -> cur) None avail
+ in
+ (name, avail, res, earliest_stage))
+
+(* Find the stage, if any, at the end of which a group produces a result. *)
+let find_dest (attr, avail, _, _) =
+ try
+ find_with_result
+ (fun av -> match av with
+ Dest st -> Some (Some st)
+ | Dest_n_after (after, st) -> Some (Some (after + st))
+ | _ -> None) avail
+ with Not_found -> None
+
+(* Find the worst-case latency between a producer and a consumer. *)
+let worst_case_latency producer (_, _, _, earliest_required) =
+ let dest = find_dest producer in
+ match earliest_required, dest with
+ None, _ ->
+ (* The consumer doesn't have any source requirements. *)
+ None
+ | _, None ->
+ (* The producer doesn't produce any results (e.g. a store insn). *)
+ None
+ | Some consumed, Some produced -> Some (produced - consumed + 1)
+
+(* Helper function for below. *)
+let latency_calc f producer (_, avail, _, _) =
+ try
+ let source_avail = find_with_result f avail in
+ match find_dest producer with
+ None ->
+ (* The producer does not produce a result. *)
+ Some 0
+ | Some produced ->
+ let latency = produced - source_avail + 1 in
+ (* Latencies below zero are raised to zero since we don't have
+ delay slots. *)
+ if latency < 0 then Some 0 else Some latency
+ with Not_found -> None
+
+(* Find any Rm latency between a producer and a consumer. If no
+ Rm source requirement is explicitly specified for the consumer,
+ return "positive infinity". Also return "positive infinity" if
+ the latency matches the supplied worst-case latency for this
+ producer. *)
+let get_m_latency producer consumer =
+ match latency_calc (fun av -> match av with Source_m stage -> Some stage
+ | _ -> None) producer consumer
+ with None -> [] | Some latency -> [(Guard_only_m, latency)]
+
+(* Likewise for Rn. *)
+let get_n_latency producer consumer =
+ match latency_calc (fun av -> match av with Source_n stage -> Some stage
+ | _ -> None) producer consumer
+ with None -> [] | Some latency -> [(Guard_only_n, latency)]
+
+(* Likewise for Rd. *)
+let get_d_latency producer consumer =
+ match
+ latency_calc (fun av -> match av with Source_d stage -> Some stage
+ | _ -> None) producer consumer
+ with None -> [] | Some latency -> [(Guard_only_d, latency)]
+
+(* Given a producer and a consumer, work out the latency of the producer
+ to the consumer in each of the four cases (availability information
+ permitting) identified at the top of this file. Return the
+ consumer, the worst-case unguarded latency and any guarded latencies. *)
+let calculate_latencies producer consumer =
+ let worst = worst_case_latency producer consumer in
+ let m_latency = get_m_latency producer consumer in
+ let n_latency = get_n_latency producer consumer in
+ let d_latency = get_d_latency producer consumer in
+ (consumer, worst, m_latency @ n_latency @ d_latency)
+
+(* Helper function for below. *)
+let pick_latency largest worst guards =
+ let guards =
+ match worst with
+ None -> guards
+ | Some worst -> (Guard_none, worst) :: guards
+ in
+ if List.length guards = 0 then None else
+ let total_latency =
+ List.fold_left (fun acc -> fun (_, latency) -> acc + latency) 0 guards
+ in
+ let average_latency = (float_of_int total_latency) /.
+ (float_of_int (List.length guards)) in
+ let rounded_latency = int_of_float (ceil average_latency) in
+ if rounded_latency = largest then None
+ else Some (Guard_none, rounded_latency)
+
+(* Collate all bypasses for a particular producer as required in
+ worst_case_latencies_and_bypasses. (By this stage there is a maximum
+ of one bypass from this producer to any particular consumer listed
+ in LATENCIES.) Use a hash table to collate bypasses with the
+ same latency and guard. *)
+let collate_bypasses (producer_name, _, _, _) largest latencies =
+ let ht = Hashtbl.create 42 in
+ let keys = ref [] in
+ List.iter (
+ fun ((consumer, _, _, _), worst, guards) ->
+ (* Find out which latency to use. Ignoring latencies that match
+ the *overall* worst-case latency for this producer (which will
+ be in define_insn_reservation), we have to examine:
+ 1. the latency with no guard between this producer and this
+ consumer; and
+ 2. any guarded latency. *)
+ let guard_latency_opt = pick_latency largest worst guards in
+ match guard_latency_opt with
+ None -> ()
+ | Some (guard, latency) ->
+ begin
+ (if (try ignore (Hashtbl.find ht (guard, latency)); false
+ with Not_found -> true) then
+ keys := (guard, latency) :: !keys);
+ Hashtbl.add ht (guard, latency) consumer
+ end
+ ) latencies;
+ (* The hash table now has bypasses collated so that ones with the
+ same latency and guard have the same keys. Walk through all the
+ keys, extract the associated bypasses, and concatenate the names
+ of the consumers for each bypass. *)
+ List.map (
+ fun ((guard, latency) as key) ->
+ let consumers = Hashtbl.find_all ht key in
+ (producer_name,
+ String.concat ",\\\n " consumers,
+ latency,
+ guard)
+ ) !keys
+
+(* For every producer, find the worst-case latency between it and
+ *any* consumer. Also determine (if such a thing exists) the
+ lowest-latency bypass from each producer to each consumer. Group
+ the output in such a way that all bypasses with the same producer
+ and latency are together, and so that bypasses with the worst-case
+ latency are ignored. *)
+let worst_case_latencies_and_bypasses =
+ let rec f (worst_acc, bypasses_acc) prev xs =
+ match xs with
+ [] -> (worst_acc, bypasses_acc)
+ | ((producer_name, producer_avail, res_string, _) as producer)::next ->
+ (* For this particular producer, work out the latencies between
+ it and every consumer. *)
+ let latencies =
+ List.fold_left (fun acc -> fun consumer ->
+ (calculate_latencies producer consumer) :: acc)
+ [] (prev @ xs)
+ in
+ (* Now work out what the overall worst case latency was for this
+ particular producer. *)
+ match latencies with
+ [] -> assert false
+ | _ ->
+ let comp_fn (_, l1, _) (_, l2, _) =
+ if l1 > l2 then -1 else if l1 = l2 then 0 else 1
+ in
+ let largest =
+ match List.hd (List.sort comp_fn latencies) with
+ (_, None, _) -> 0 (* Producer has no consumers. *)
+ | (_, Some worst, _) -> worst
+ in
+ (* Having got the largest latency, collect all bypasses for
+ this producer and filter out those with that larger
+ latency. Record the others for later emission. *)
+ let bypasses = collate_bypasses producer largest latencies in
+ (* Go on to process remaining producers, having noted
+ the result for this one. *)
+ f ((producer_name, producer_avail, largest,
+ res_string) :: worst_acc,
+ bypasses @ bypasses_acc)
+ (prev @ [producer]) next
+ in
+ f ([], []) []
+
+(* Emit a helpful comment for a define_insn_reservation. *)
+let write_comment producer avail =
+ let seen_source = ref false in
+ let describe info =
+ let read = if !seen_source then "" else "read " in
+ match info with
+ Source stage ->
+ seen_source := true;
+ Printf.printf "%stheir source operands at N%d" read stage
+ | Source_n stage ->
+ seen_source := true;
+ Printf.printf "%stheir (D|Q)n operands at N%d" read stage
+ | Source_m stage ->
+ seen_source := true;
+ Printf.printf "%stheir (D|Q)m operands at N%d" read stage
+ | Source_d stage ->
+ Printf.printf "%stheir (D|Q)d operands at N%d" read stage
+ | Dest stage ->
+ Printf.printf "produce a result at N%d" stage
+ | Dest_n_after (after, stage) ->
+ Printf.printf "produce a result at N%d on cycle %d" stage (after + 1)
+ in
+ Printf.printf ";; Instructions using this reservation ";
+ let rec f infos x =
+ let sep = if x mod 2 = 1 then "" else "\n;;" in
+ match infos with
+ [] -> assert false
+ | [info] -> describe info; Printf.printf ".\n"
+ | info::(_::[] as infos) ->
+ describe info; Printf.printf ", and%s " sep; f infos (x+1)
+ | info::infos -> describe info; Printf.printf ",%s " sep; f infos (x+1)
+ in
+ f avail 0
+
+(* Emit a define_insn_reservation for each producer. The latency
+ written in will be its worst-case latency. *)
+let emit_insn_reservations =
+ List.iter (
+ fun (producer, avail, latency, reservation) ->
+ write_comment producer avail;
+ Printf.printf "(define_insn_reservation \"%s\" %d\n" producer latency;
+ Printf.printf " (and (eq_attr \"tune\" \"cortexa8\")\n";
+ Printf.printf " (eq_attr \"neon_type\" \"%s\"))\n" producer;
+ let str =
+ match reservation with
+ Mul -> "dp" | Mul_2cycle -> "dp_2" | Mul_4cycle -> "dp_4"
+ | Shift -> "dp" | Shift_2cycle -> "dp_2"
+ | ALU -> "dp" | ALU_2cycle -> "dp_2"
+ | Fmul -> "dp" | Fmul_2cycle -> "dp_2"
+ | Fadd -> "fadd" | Fadd_2cycle -> "fadd_2"
+ | Ls 1 -> "ls"
+ | Ls n -> "ls_" ^ (string_of_int n)
+ | Permute 1 -> "perm"
+ | Permute n -> "perm_" ^ (string_of_int n)
+ | Fmul_then_fadd -> "fmul_then_fadd"
+ | Fmul_then_fadd_2 -> "fmul_then_fadd_2"
+ in
+ Printf.printf " \"cortex_a8_neon_%s\")\n\n" str
+ )
+
+(* Given a guard description, return the name of the C function to
+ be used as the guard for define_bypass. *)
+let guard_fn g =
+ match g with
+ Guard_only_m -> "arm_neon_only_m_dependency"
+ | Guard_only_n -> "arm_neon_only_n_dependency"
+ | Guard_only_d -> "arm_neon_only_d_dependency"
+ | Guard_none -> assert false
+
+(* Emit a define_bypass for each bypass. *)
+let emit_bypasses =
+ List.iter (
+ fun (producer, consumers, latency, guard) ->
+ Printf.printf "(define_bypass %d \"%s\"\n" latency producer;
+ if guard = Guard_none then
+ Printf.printf " \"%s\")\n\n" consumers
+ else
+ begin
+ Printf.printf " \"%s\"\n" consumers;
+ Printf.printf " \"%s\")\n\n" (guard_fn guard)
+ end
+ )
+
+(* Program entry point. *)
+let main =
+ let table = calculate_sources availability_table in
+ let worst_cases, bypasses = worst_case_latencies_and_bypasses table in
+ emit_insn_reservations (List.rev worst_cases);
+ Printf.printf ";; Exceptions to the default latencies.\n\n";
+ emit_bypasses bypasses
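
The latency computation above reduces to two folds and one subtraction: find
the earliest stage at which the consumer reads any source operand, find the
stage at which the producer's result becomes available (adding the extra
cycles of a Dest_n_after entry), and take produced - consumed + 1 as the
worst-case latency.  Below is a standalone sketch of that arithmetic with a
simplified availability type and two invented table entries.

type availability =
    Source of int
  | Source_m of int
  | Dest of int
  | Dest_n_after of int * int

(* Earliest stage at which any source operand is read (cf. calculate_sources). *)
let earliest_source avail =
  List.fold_left (fun cur info ->
                    match info with
                      Source stage | Source_m stage ->
                        (match cur with
                           Some stage' when stage' <= stage -> cur
                         | _ -> Some stage)
                    | _ -> cur)
                 None avail

(* Stage at which a result becomes available, if any (cf. find_dest). *)
let result_stage avail =
  List.fold_left (fun cur info ->
                    match cur, info with
                      None, Dest stage -> Some stage
                    | None, Dest_n_after (after, stage) -> Some (after + stage)
                    | _ -> cur)
                 None avail

(* Worst-case producer-to-consumer latency (cf. worst_case_latency). *)
let worst_case producer consumer =
  match result_stage producer, earliest_source consumer with
    Some produced, Some consumed -> Some (produced - consumed + 1)
  | _ -> None

let () =
  let producer = [Source 2; Dest_n_after (1, 6)]  (* result treated as stage 1 + 6 = 7 *)
  and consumer = [Source_m 1; Source 2] in        (* earliest source read at stage 1 *)
  match worst_case producer consumer with
    Some l -> Printf.printf "worst-case latency = %d\n" l  (* 7 - 1 + 1 = 7 *)
  | None -> print_endline "no producer/consumer dependency"
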
+
Added: llvm-gcc-4.2/trunk/gcc/config/arm/neon-testgen.ml
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/neon-testgen.ml?rev=76781&view=auto
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/neon-testgen.ml (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/neon-testgen.ml Wed Jul 22 15:36:27 2009
@@ -0,0 +1,274 @@
+(* APPLE LOCAL file v7 support. Merge from Codesourcery *)
+(* Auto-generate ARM Neon intrinsics tests.
+ Copyright (C) 2006 Free Software Foundation, Inc.
+ Contributed by CodeSourcery.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 2, or (at your option) any later
+ version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING. If not, write to the Free
+ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA.
+
+ This is an O'Caml program. The O'Caml compiler is available from:
+
+ http://caml.inria.fr/
+
+ Or from your favourite OS's friendly packaging system. Tested with version
+ 3.09.2, though other versions will probably work too.
+
+ Compile with:
+ ocamlc -c neon.ml
+ ocamlc -o neon-testgen neon.cmo neon-testgen.ml
+*)
+
+open Neon
+
+type c_type_flags = Pointer | Const
+
+(* Open a test source file. *)
+let open_test_file dir name =
+ try
+ open_out (dir ^ "/" ^ name ^ ".c")
+ with Sys_error str ->
+ failwith ("Could not create test source file " ^ name ^ ": " ^ str)
+
+(* Emit prologue code to a test source file. *)
+let emit_prologue chan test_name =
+ Printf.fprintf chan "/* Test the `%s' ARM Neon intrinsic. */\n" test_name;
+ Printf.fprintf chan "/* This file was autogenerated by neon-testgen. */\n\n";
+ Printf.fprintf chan "/* { dg-do assemble } */\n";
+ Printf.fprintf chan "/* { dg-require-effective-target arm_neon_ok } */\n";
+ Printf.fprintf chan
+ "/* { dg-options \"-save-temps -O0 -mfpu=neon -mfloat-abi=softfp\" } */\n";
+ Printf.fprintf chan "\n#include \"arm_neon.h\"\n\n";
+ Printf.fprintf chan "void test_%s (void)\n{\n" test_name
+
+(* Emit declarations of local variables that are going to be passed
+ to an intrinsic, together with one to take a returned value if needed. *)
+let emit_automatics chan c_types =
+ let emit () =
+ ignore (
+ List.fold_left (fun arg_number -> fun (flags, ty) ->
+ let pointer_bit =
+ if List.mem Pointer flags then "*" else ""
+ in
+ (* Const arguments to builtins are directly
+ written in as constants. *)
+ if not (List.mem Const flags) then
+ Printf.fprintf chan " %s %sarg%d_%s;\n"
+ ty pointer_bit arg_number ty;
+ arg_number + 1)
+ 0 (List.tl c_types))
+ in
+ match c_types with
+ (_, return_ty) :: tys ->
+ if return_ty <> "void" then
+ (* The intrinsic returns a value. *)
+ (Printf.fprintf chan " %s out_%s;\n" return_ty return_ty;
+ emit ())
+ else
+ (* The intrinsic does not return a value. *)
+ emit ()
+ | _ -> assert false
+
+(* Emit code to call an intrinsic. *)
+let emit_call chan const_valuator c_types name elt_ty =
+ (if snd (List.hd c_types) <> "void" then
+ Printf.fprintf chan " out_%s = " (snd (List.hd c_types))
+ else
+ Printf.fprintf chan " ");
+ Printf.fprintf chan "%s_%s (" (intrinsic_name name) (string_of_elt elt_ty);
+ let print_arg chan arg_number (flags, ty) =
+ (* If the argument is of const type, then directly write in the
+ constant now. *)
+ if List.mem Const flags then
+ match const_valuator with
+ None ->
+ if List.mem Pointer flags then
+ Printf.fprintf chan "0"
+ else
+ Printf.fprintf chan "1"
+ | Some f -> Printf.fprintf chan "%s" (string_of_int (f arg_number))
+ else
+ Printf.fprintf chan "arg%d_%s" arg_number ty
+ in
+ let rec print_args arg_number tys =
+ match tys with
+ [] -> ()
+ | [ty] -> print_arg chan arg_number ty
+ | ty::tys ->
+ print_arg chan arg_number ty;
+ Printf.fprintf chan ", ";
+ print_args (arg_number + 1) tys
+ in
+ print_args 0 (List.tl c_types);
+ Printf.fprintf chan ");\n"
+
+(* Emit epilogue code to a test source file. *)
+let emit_epilogue chan features regexps =
+ let no_op = List.exists (fun feature -> feature = No_op) features in
+ Printf.fprintf chan "}\n\n";
+ (if not no_op then
+ List.iter (fun regexp ->
+ Printf.fprintf chan
+ "/* { dg-final { scan-assembler \"%s\" } } */\n" regexp)
+ regexps
+ else
+ ()
+ );
+ Printf.fprintf chan "/* { dg-final { cleanup-saved-temps } } */\n"
+
+(* Check a list of C types to determine which ones are pointers and which
+ ones are const. *)
+let check_types tys =
+ let tys' =
+ List.map (fun ty ->
+ let len = String.length ty in
+ if len > 2 && String.get ty (len - 2) = ' '
+ && String.get ty (len - 1) = '*'
+ then ([Pointer], String.sub ty 0 (len - 2))
+ else ([], ty)) tys
+ in
+ List.map (fun (flags, ty) ->
+ if String.length ty > 6 && String.sub ty 0 6 = "const "
+ then (Const :: flags, String.sub ty 6 ((String.length ty) - 6))
+ else (flags, ty)) tys'
+
+(* Given an intrinsic shape, produce a regexp that will match
+ the right-hand sides of instructions generated by an intrinsic of
+ that shape. *)
+let rec analyze_shape shape =
+ let rec n_things n thing =
+ match n with
+ 0 -> []
+ | n -> thing :: (n_things (n - 1) thing)
+ in
+ let rec analyze_shape_elt elt =
+ match elt with
+ Dreg -> "\\[dD\\]\\[0-9\\]+"
+ | Qreg -> "\\[qQ\\]\\[0-9\\]+"
+ | Corereg -> "\\[rR\\]\\[0-9\\]+"
+ | Immed -> "#\\[0-9\\]+"
+ | VecArray (1, elt) ->
+ let elt_regexp = analyze_shape_elt elt in
+ "((\\\\\\{" ^ elt_regexp ^ "\\\\\\})|(" ^ elt_regexp ^ "))"
+ | VecArray (n, elt) ->
+ let elt_regexp = analyze_shape_elt elt in
+ let alt1 = elt_regexp ^ "-" ^ elt_regexp in
+ let alt2 = commas (fun x -> x) (n_things n elt_regexp) "" in
+ "\\\\\\{((" ^ alt1 ^ ")|(" ^ alt2 ^ "))\\\\\\}"
+ | (PtrTo elt | CstPtrTo elt) ->
+ "\\\\\\[" ^ (analyze_shape_elt elt) ^ "\\\\\\]"
+ | Element_of_dreg -> (analyze_shape_elt Dreg) ^ "\\\\\\[\\[0-9\\]+\\\\\\]"
+ | Element_of_qreg -> (analyze_shape_elt Qreg) ^ "\\\\\\[\\[0-9\\]+\\\\\\]"
+ | All_elements_of_dreg -> (analyze_shape_elt Dreg) ^ "\\\\\\[\\\\\\]"
+ in
+ match shape with
+ All (n, elt) -> commas analyze_shape_elt (n_things n elt) ""
+ | Long -> (analyze_shape_elt Qreg) ^ ", " ^ (analyze_shape_elt Dreg) ^
+ ", " ^ (analyze_shape_elt Dreg)
+ | Long_noreg elt -> (analyze_shape_elt elt) ^ ", " ^ (analyze_shape_elt elt)
+ | Wide -> (analyze_shape_elt Qreg) ^ ", " ^ (analyze_shape_elt Qreg) ^
+ ", " ^ (analyze_shape_elt Dreg)
+ | Wide_noreg elt -> analyze_shape (Long_noreg elt)
+ | Narrow -> (analyze_shape_elt Dreg) ^ ", " ^ (analyze_shape_elt Qreg) ^
+ ", " ^ (analyze_shape_elt Qreg)
+ | Use_operands elts -> commas analyze_shape_elt (Array.to_list elts) ""
+ | By_scalar Dreg ->
+ analyze_shape (Use_operands [| Dreg; Dreg; Element_of_dreg |])
+ | By_scalar Qreg ->
+ analyze_shape (Use_operands [| Qreg; Qreg; Element_of_dreg |])
+ | By_scalar _ -> assert false
+ | Wide_lane ->
+ analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |])
+ | Wide_scalar ->
+ analyze_shape (Use_operands [| Qreg; Dreg; Element_of_dreg |])
+ | Pair_result elt ->
+ let elt_regexp = analyze_shape_elt elt in
+ elt_regexp ^ ", " ^ elt_regexp
+ | Unary_scalar _ -> "FIXME Unary_scalar"
+ | Binary_imm elt -> analyze_shape (Use_operands [| elt; elt; Immed |])
+ | Narrow_imm -> analyze_shape (Use_operands [| Dreg; Qreg; Immed |])
+ | Long_imm -> analyze_shape (Use_operands [| Qreg; Dreg; Immed |])
+
+(* Generate tests for one intrinsic. *)
+let test_intrinsic dir opcode features shape name munge elt_ty =
+ (* Open the test source file. *)
+ let test_name = name ^ (string_of_elt elt_ty) in
+ let chan = open_test_file dir test_name in
+ (* Work out what argument and return types the intrinsic has. *)
+ let c_arity, new_elt_ty = munge shape elt_ty in
+ let c_types = check_types (strings_of_arity c_arity) in
+ (* Extract any constant valuator (a function specifying what constant
+ values are to be written into the intrinsic call) from the features
+ list. *)
+ let const_valuator =
+ try
+ match (List.find (fun feature -> match feature with
+ Const_valuator _ -> true
+ | _ -> false) features) with
+ Const_valuator f -> Some f
+ | _ -> assert false
+ with Not_found -> None
+ in
+ (* Work out what instruction name(s) to expect. *)
+ let insns = get_insn_names features name in
+ let no_suffix = (new_elt_ty = NoElts) in
+ let insns =
+ if no_suffix then insns
+ else List.map (fun insn ->
+ let suffix = string_of_elt_dots new_elt_ty in
+ insn ^ "\\." ^ suffix) insns
+ in
+ (* Construct a regexp to match against the expected instruction name(s). *)
+ let insn_regexp =
+ match insns with
+ [] -> assert false
+ | [insn] -> insn
+ | _ ->
+ let rec calc_regexp insns cur_regexp =
+ match insns with
+ [] -> cur_regexp
+ | [insn] -> cur_regexp ^ "(" ^ insn ^ "))"
+ | insn::insns -> calc_regexp insns (cur_regexp ^ "(" ^ insn ^ ")|")
+ in calc_regexp insns "("
+ in
+ (* Construct regexps to match against the instructions that this
+ intrinsic expands to. Watch out for any writeback character and
+ comments after the instruction. *)
+ let regexps = List.map (fun regexp -> insn_regexp ^ "\\[ \t\\]+" ^ regexp ^
+ "!?\\(\\[ \t\\]+@\\[a-zA-Z0-9 \\]+\\)?\\n")
+ (analyze_all_shapes features shape analyze_shape)
+ in
+ (* Emit file and function prologues. *)
+ emit_prologue chan test_name;
+ (* Emit local variable declarations. *)
+ emit_automatics chan c_types;
+ Printf.fprintf chan "\n";
+ (* Emit the call to the intrinsic. *)
+ emit_call chan const_valuator c_types name elt_ty;
+ (* Emit the function epilogue and the DejaGNU scan-assembler directives. *)
+ emit_epilogue chan features regexps;
+ (* Close the test file. *)
+ close_out chan
+
+(* Generate tests for one element of the "ops" table. *)
+let test_intrinsic_group dir (opcode, features, shape, name, munge, types) =
+ List.iter (test_intrinsic dir opcode features shape name munge) types
+
+(* Program entry point. *)
+let _ =
+ let directory = if Array.length Sys.argv <> 1 then Sys.argv.(1) else "." in
+ List.iter (test_intrinsic_group directory) (reinterp @ ops)
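
check_types above classifies the C type strings produced by strings_of_arity
purely by their text: a trailing " *" marks a pointer argument and a leading
"const " marks a constant one, and those flags later decide whether
emit_automatics declares a local variable for an argument or emit_call writes a
literal constant in its place.  A standalone sketch of that classification,
applied to a few invented type strings:

type c_type_flags = Pointer | Const

(* Classify one C type string the same way check_types does. *)
let check_type ty =
  let flags, ty =
    let len = String.length ty in
    if len > 2 && String.get ty (len - 2) = ' '
               && String.get ty (len - 1) = '*'
    then [Pointer], String.sub ty 0 (len - 2)
    else [], ty
  in
  if String.length ty > 6 && String.sub ty 0 6 = "const "
  then Const :: flags, String.sub ty 6 (String.length ty - 6)
  else flags, ty

let () =
  List.iter
    (fun ty ->
       let flags, base = check_type ty in
       Printf.printf "%-16s -> %s%s%s\n" ty base
         (if List.mem Pointer flags then " [pointer]" else "")
         (if List.mem Const flags then " [const]" else ""))
    ["uint8x8_t"; "float32x2_t *"; "const int8_t *"]
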
+
Added: llvm-gcc-4.2/trunk/gcc/config/arm/neon.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/neon.md?rev=76781&view=auto
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/neon.md (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/neon.md Wed Jul 22 15:36:27 2009
@@ -0,0 +1,4917 @@
+;; APPLE LOCAL file v7 support. Merge from Codesourcery
+;; ARM NEON coprocessor Machine Description
+;; Copyright (C) 2006 Free Software Foundation, Inc.
+;; Written by CodeSourcery.
+;;
+;; This file is part of GCC.
+;;
+;; GCC is free software; you can redistribute it and/or modify it
+;; under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 2, or (at your option)
+;; any later version.
+;;
+;; GCC is distributed in the hope that it will be useful, but
+;; WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;; General Public License for more details.
+;;
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING. If not, write to the Free
+;; Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+;; 02110-1301, USA.
+
+;; Constants for unspecs.
+(define_constants
+ [(UNSPEC_VPADD 65)
+ (UNSPEC_VPSMIN 66)
+ (UNSPEC_VPUMIN 67)
+ (UNSPEC_VPSMAX 68)
+ (UNSPEC_VPUMAX 69)
+ (UNSPEC_ASHIFT_SIGNED 70)
+ (UNSPEC_ASHIFT_UNSIGNED 71)
+ (UNSPEC_VADD 72)
+ (UNSPEC_VADDL 73)
+ (UNSPEC_VADDW 74)
+ (UNSPEC_VHADD 75)
+ (UNSPEC_VQADD 76)
+ (UNSPEC_VADDHN 77)
+ (UNSPEC_VABS 78)
+ (UNSPEC_VQABS 79)
+ (UNSPEC_VGET_LANE 80)
+ (UNSPEC_VSET_LANE 81)
+ (UNSPEC_VDUP_N 82)
+ (UNSPEC_VCOMBINE 83)
+ (UNSPEC_VGET_HIGH 84)
+ (UNSPEC_VGET_LOW 85)
+ (UNSPEC_VMOVN 87)
+ (UNSPEC_VQMOVN 88)
+ (UNSPEC_VQMOVUN 89)
+ (UNSPEC_VMOVL 90)
+ (UNSPEC_VMUL_LANE 91)
+ (UNSPEC_VMLA_LANE 92)
+ (UNSPEC_VMLAL_LANE 93)
+ (UNSPEC_VQDMLAL_LANE 94)
+ (UNSPEC_VMUL_N 95)
+ (UNSPEC_VCVT 96)
+ (UNSPEC_VEXT 97)
+ (UNSPEC_VREV64 98)
+ (UNSPEC_VREV32 99)
+ (UNSPEC_VREV16 100)
+ (UNSPEC_VBSL 101)
+ (UNSPEC_VLD1 102)
+ (UNSPEC_VLD1_LANE 103)
+ (UNSPEC_VLD1_DUP 104)
+ (UNSPEC_VST1 105)
+ (UNSPEC_VST1_LANE 106)
+ (UNSPEC_VSTRUCTDUMMY 107)
+ (UNSPEC_VLD2 108)
+ (UNSPEC_VLD2_LANE 109)
+ (UNSPEC_VLD2_DUP 110)
+ (UNSPEC_VST2 111)
+ (UNSPEC_VST2_LANE 112)
+ (UNSPEC_VLD3 113)
+ (UNSPEC_VLD3A 114)
+ (UNSPEC_VLD3B 115)
+ (UNSPEC_VLD3_LANE 116)
+ (UNSPEC_VLD3_DUP 117)
+ (UNSPEC_VST3 118)
+ (UNSPEC_VST3A 119)
+ (UNSPEC_VST3B 120)
+ (UNSPEC_VST3_LANE 121)
+ (UNSPEC_VLD4 122)
+ (UNSPEC_VLD4A 123)
+ (UNSPEC_VLD4B 124)
+ (UNSPEC_VLD4_LANE 125)
+ (UNSPEC_VLD4_DUP 126)
+ (UNSPEC_VST4 127)
+ (UNSPEC_VST4A 128)
+ (UNSPEC_VST4B 129)
+ (UNSPEC_VST4_LANE 130)
+ (UNSPEC_VTRN1 131)
+ (UNSPEC_VTRN2 132)
+ (UNSPEC_VTBL 133)
+ (UNSPEC_VTBX 134)
+ (UNSPEC_VAND 135)
+ (UNSPEC_VORR 136)
+ (UNSPEC_VEOR 137)
+ (UNSPEC_VBIC 138)
+ (UNSPEC_VORN 139)
+ (UNSPEC_VCVT_N 140)
+ (UNSPEC_VQNEG 142)
+ (UNSPEC_VMVN 143)
+ (UNSPEC_VCLS 144)
+ (UNSPEC_VCLZ 145)
+ (UNSPEC_VCNT 146)
+ (UNSPEC_VRECPE 147)
+ (UNSPEC_VRSQRTE 148)
+ (UNSPEC_VMUL 149)
+ (UNSPEC_VMLA 150)
+ (UNSPEC_VMLAL 151)
+ (UNSPEC_VMLS 152)
+ (UNSPEC_VMLSL 153)
+ (UNSPEC_VQDMULH 154)
+ (UNSPEC_VQDMLAL 155)
+ (UNSPEC_VQDMLSL 156)
+ (UNSPEC_VMULL 157)
+ (UNSPEC_VQDMULL 158)
+ (UNSPEC_VMLS_LANE 159)
+ (UNSPEC_VMLSL_LANE 160)
+ (UNSPEC_VQDMLSL_LANE 161)
+ (UNSPEC_VDUP_LANE 162)
+ (UNSPEC_VZIP1 163)
+ (UNSPEC_VZIP2 164)
+ (UNSPEC_VUZP1 165)
+ (UNSPEC_VUZP2 166)
+ (UNSPEC_VSRI 167)
+ (UNSPEC_VSLI 168)
+ (UNSPEC_VSRA_N 169)
+ (UNSPEC_VSHL_N 170)
+ (UNSPEC_VQSHL_N 171)
+ (UNSPEC_VQSHLU_N 172)
+ (UNSPEC_VSHLL_N 173)
+ (UNSPEC_VSHR_N 174)
+ (UNSPEC_VSHRN_N 175)
+ (UNSPEC_VQSHRN_N 176)
+ (UNSPEC_VQSHRUN_N 177)
+ (UNSPEC_VSUB 178)
+ (UNSPEC_VSUBL 179)
+ (UNSPEC_VSUBW 180)
+ (UNSPEC_VQSUB 181)
+ (UNSPEC_VHSUB 182)
+ (UNSPEC_VSUBHN 183)
+ (UNSPEC_VCEQ 184)
+ (UNSPEC_VCGE 185)
+ (UNSPEC_VCGT 186)
+ (UNSPEC_VCAGE 187)
+ (UNSPEC_VCAGT 188)
+ (UNSPEC_VTST 189)
+ (UNSPEC_VABD 190)
+ (UNSPEC_VABDL 191)
+ (UNSPEC_VABA 192)
+ (UNSPEC_VABAL 193)
+ (UNSPEC_VMAX 194)
+ (UNSPEC_VMIN 195)
+ (UNSPEC_VPADDL 196)
+ (UNSPEC_VPADAL 197)
+ (UNSPEC_VSHL 198)
+ (UNSPEC_VQSHL 199)
+ (UNSPEC_VPMAX 200)
+ (UNSPEC_VPMIN 201)
+ (UNSPEC_VRECPS 202)
+ (UNSPEC_VRSQRTS 203)
+ (UNSPEC_VMULL_LANE 204)
+ (UNSPEC_VQDMULL_LANE 205)
+ (UNSPEC_VQDMULH_LANE 206)])
+
+
+;; Double-width vector modes.
+(define_mode_macro VD [V8QI V4HI V2SI V2SF])
+
+;; Double-width vector modes plus 64-bit elements.
+(define_mode_macro VDX [V8QI V4HI V2SI V2SF DI])
+
+;; Same, without floating-point elements.
+(define_mode_macro VDI [V8QI V4HI V2SI])
+
+;; Quad-width vector modes.
+(define_mode_macro VQ [V16QI V8HI V4SI V4SF])
+
+;; Quad-width vector modes plus 64-bit elements.
+(define_mode_macro VQX [V16QI V8HI V4SI V4SF V2DI])
+
+;; Same, without floating-point elements.
+(define_mode_macro VQI [V16QI V8HI V4SI])
+
+;; Same, with TImode added, for moves.
+(define_mode_macro VQXMOV [V16QI V8HI V4SI V4SF V2DI TI])
+
+;; Opaque structure types wider than TImode.
+(define_mode_macro VSTRUCT [EI OI CI XI])
+
+;; Number of instructions needed to load/store struct elements. FIXME!
+(define_mode_attr V_slen [(EI "2") (OI "2") (CI "3") (XI "4")])
+
+;; Opaque structure types used in table lookups (except vtbl1/vtbx1).
+(define_mode_macro VTAB [TI EI OI])
+
+;; vtbl<n> suffix for above modes.
+(define_mode_attr VTAB_n [(TI "2") (EI "3") (OI "4")])
+
+;; Widenable modes.
+(define_mode_macro VW [V8QI V4HI V2SI])
+
+;; Narrowable modes.
+(define_mode_macro VN [V8HI V4SI V2DI])
+
+;; All supported vector modes (except singleton DImode).
+(define_mode_macro VDQ [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF V2DI])
+
+;; All supported vector modes (except those with 64-bit integer elements).
+(define_mode_macro VDQW [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF])
+
+;; Supported integer vector modes (no 64-bit elements).
+(define_mode_macro VDQIW [V8QI V16QI V4HI V8HI V2SI V4SI])
+
+;; Supported integer vector modes (not singleton DI)
+(define_mode_macro VDQI [V8QI V16QI V4HI V8HI V2SI V4SI V2DI])
+
+;; Vector modes, including 64-bit integer elements.
+(define_mode_macro VDQX [V8QI V16QI V4HI V8HI V2SI V4SI V2SF V4SF DI V2DI])
+
+;; Vector modes including 64-bit integer elements, but no floats.
+(define_mode_macro VDQIX [V8QI V16QI V4HI V8HI V2SI V4SI DI V2DI])
+
+;; Vector modes for float->int conversions.
+(define_mode_macro VCVTF [V2SF V4SF])
+
+;; Vector modes for int->float conversions.
+(define_mode_macro VCVTI [V2SI V4SI])
+
+;; Vector modes for doubleword multiply-accumulate, etc. insns.
+(define_mode_macro VMD [V4HI V2SI V2SF])
+
+;; Vector modes for quadword multiply-accumulate, etc. insns.
+(define_mode_macro VMQ [V8HI V4SI V4SF])
+
+;; Above modes combined.
+(define_mode_macro VMDQ [V4HI V2SI V2SF V8HI V4SI V4SF])
+
+;; As VMD, but integer modes only.
+(define_mode_macro VMDI [V4HI V2SI])
+
+;; As VMQ, but integer modes only.
+(define_mode_macro VMQI [V8HI V4SI])
+
+;; Above modes combined.
+(define_mode_macro VMDQI [V4HI V2SI V8HI V4SI])
+
+;; Modes with 8-bit and 16-bit elements.
+(define_mode_macro VX [V8QI V4HI V16QI V8HI])
+
+;; Modes with 8-bit elements.
+(define_mode_macro VE [V8QI V16QI])
+
+;; Modes with 64-bit elements only.
+(define_mode_macro V64 [DI V2DI])
+
+;; Modes with 32-bit elements only.
+(define_mode_macro V32 [V2SI V2SF V4SI V4SF])
+
+;; (Opposite) mode to convert to/from for above conversions.
+(define_mode_attr V_CVTTO [(V2SI "V2SF") (V2SF "V2SI")
+ (V4SI "V4SF") (V4SF "V4SI")])
+
+;; Define element mode for each vector mode.
+(define_mode_attr V_elem [(V8QI "QI") (V16QI "QI")
+ (V4HI "HI") (V8HI "HI")
+ (V2SI "SI") (V4SI "SI")
+ (V2SF "SF") (V4SF "SF")
+ (DI "DI") (V2DI "DI")])
+
+;; Mode of pair of elements for each vector mode, to define transfer
+;; size for structure lane/dup loads and stores.
+(define_mode_attr V_two_elem [(V8QI "HI") (V16QI "HI")
+ (V4HI "SI") (V8HI "SI")
+ (V2SI "V2SI") (V4SI "V2SI")
+ (V2SF "V2SF") (V4SF "V2SF")
+ (DI "V2DI") (V2DI "V2DI")])
+
+;; Similar, for three elements.
+;; ??? Should we define extra modes so that sizes of all three-element
+;; accesses can be accurately represented?
+(define_mode_attr V_three_elem [(V8QI "SI") (V16QI "SI")
+ (V4HI "V4HI") (V8HI "V4HI")
+ (V2SI "V4SI") (V4SI "V4SI")
+ (V2SF "V4SF") (V4SF "V4SF")
+ (DI "EI") (V2DI "EI")])
+
+;; Similar, for four elements.
+(define_mode_attr V_four_elem [(V8QI "SI") (V16QI "SI")
+ (V4HI "V4HI") (V8HI "V4HI")
+ (V2SI "V4SI") (V4SI "V4SI")
+ (V2SF "V4SF") (V4SF "V4SF")
+ (DI "OI") (V2DI "OI")])
+
+;; Register width from element mode
+(define_mode_attr V_reg [(V8QI "P") (V16QI "q")
+ (V4HI "P") (V8HI "q")
+ (V2SI "P") (V4SI "q")
+ (V2SF "P") (V4SF "q")
+ (DI "P") (V2DI "q")])
+
+;; Wider modes with the same number of elements.
+(define_mode_attr V_widen [(V8QI "V8HI") (V4HI "V4SI") (V2SI "V2DI")])
+
+;; Narrower modes with the same number of elements.
+(define_mode_attr V_narrow [(V8HI "V8QI") (V4SI "V4HI") (V2DI "V2SI")])
+
+;; Modes with half the number of equal-sized elements.
+(define_mode_attr V_HALF [(V16QI "V8QI") (V8HI "V4HI")
+ (V4SI "V2SI") (V4SF "V2SF")
+ (V2DI "DI")])
+
+;; Same, but lower-case.
+(define_mode_attr V_half [(V16QI "v8qi") (V8HI "v4hi")
+ (V4SI "v2si") (V4SF "v2sf")
+ (V2DI "di")])
+
+;; Modes with twice the number of equal-sized elements.
+(define_mode_attr V_DOUBLE [(V8QI "V16QI") (V4HI "V8HI")
+ (V2SI "V4SI") (V2SF "V4SF")
+ (DI "V2DI")])
+
+;; Same, but lower-case.
+(define_mode_attr V_double [(V8QI "v16qi") (V4HI "v8hi")
+ (V2SI "v4si") (V2SF "v4sf")
+ (DI "v2di")])
+
+;; Modes with double-width elements.
+(define_mode_attr V_double_width [(V8QI "V4HI") (V16QI "V8HI")
+ (V4HI "V2SI") (V8HI "V4SI")
+ (V2SI "DI") (V4SI "V2DI")])
+
+;; Mode of result of comparison operations (and bit-select operand 1).
+(define_mode_attr V_cmp_result [(V8QI "V8QI") (V16QI "V16QI")
+ (V4HI "V4HI") (V8HI "V8HI")
+ (V2SI "V2SI") (V4SI "V4SI")
+ (V2SF "V2SI") (V4SF "V4SI")
+ (DI "DI") (V2DI "V2DI")])
+
+;; Element type suffix for each vector mode, for operations where we don't
+;; care about signedness.
+(define_mode_attr V_if_elem [(V8QI "i8") (V16QI "i8")
+ (V4HI "i16") (V8HI "i16")
+ (V2SI "i32") (V4SI "i32")
+ (DI "i64") (V2DI "i64")
+ (V2SF "f32") (V4SF "f32")])
+
+;; Same, but for operations which work on signed values.
+(define_mode_attr V_s_elem [(V8QI "s8") (V16QI "s8")
+ (V4HI "s16") (V8HI "s16")
+ (V2SI "s32") (V4SI "s32")
+ (DI "s64") (V2DI "s64")
+ (V2SF "f32") (V4SF "f32")])
+
+;; Same, but for operations which work on unsigned values.
+(define_mode_attr V_u_elem [(V8QI "u8") (V16QI "u8")
+ (V4HI "u16") (V8HI "u16")
+ (V2SI "u32") (V4SI "u32")
+ (DI "u64") (V2DI "u64")
+ (V2SF "f32") (V4SF "f32")])
+
+;; Element types for extraction of unsigned scalars.
+(define_mode_attr V_uf_sclr [(V8QI "u8") (V16QI "u8")
+ (V4HI "u16") (V8HI "u16")
+ (V2SI "32") (V4SI "32")
+ (V2SF "32") (V4SF "32")])
+
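+;; Element size in bits for each vector mode.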
+(define_mode_attr V_sz_elem [(V8QI "8") (V16QI "8")
+ (V4HI "16") (V8HI "16")
+ (V2SI "32") (V4SI "32")
+ (DI "64") (V2DI "64")
+ (V2SF "32") (V4SF "32")])
+
+;; Element sizes for duplicating ARM registers to all elements of a vector.
+(define_mode_attr VD_dup [(V8QI "8") (V4HI "16") (V2SI "32") (V2SF "32")])
+
+;; Opaque integer types for results of pair-forming intrinsics (vtrn, etc.)
+(define_mode_attr V_PAIR [(V8QI "TI") (V16QI "OI")
+ (V4HI "TI") (V8HI "OI")
+ (V2SI "TI") (V4SI "OI")
+ (V2SF "TI") (V4SF "OI")
+ (DI "TI") (V2DI "OI")])
+
+;; Same, but lower-case.
+(define_mode_attr V_pair [(V8QI "ti") (V16QI "oi")
+ (V4HI "ti") (V8HI "oi")
+ (V2SI "ti") (V4SI "oi")
+ (V2SF "ti") (V4SF "oi")
+ (DI "ti") (V2DI "oi")])
+
+;; Operations on two halves of a quadword vector.
+(define_code_macro vqh_ops [plus smin smax umin umax])
+
+;; Same, without unsigned variants (for use with *SFmode pattern).
+(define_code_macro vqhs_ops [plus smin smax])
+
+;; Assembler mnemonics for above codes.
+(define_code_attr VQH_mnem [(plus "vadd") (smin "vmin") (smax "vmax")
+ (umin "vmin") (umax "vmax")])
+
+;; Signs of above, where relevant.
+(define_code_attr VQH_sign [(plus "i") (smin "s") (smax "s") (umin "u")
+ (umax "u")])
+
+;; Extra suffix on some 64-bit insn names (to avoid collision with standard
+;; names which we don't want to define).
+(define_mode_attr V_suf64 [(V8QI "") (V16QI "")
+ (V4HI "") (V8HI "")
+ (V2SI "") (V4SI "")
+ (V2SF "") (V4SF "")
+ (DI "_neon") (V2DI "")])
+
+;; Scalars to be presented to scalar multiplication instructions
+;; must satisfy the following constraints.
+;; 1. If the mode specifies 16-bit elements, the scalar must be in D0-D7.
+;; 2. If the mode specifies 32-bit elements, the scalar must be in D0-D15.
+;; This mode attribute is used to obtain the correct register constraints.
+(define_mode_attr scalar_mul_constraint [(V4HI "x") (V2SI "t") (V2SF "t")
+ (V8HI "x") (V4SI "t") (V4SF "t")])
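+;; For example, a 16-bit multiply-by-scalar such as "vmla.i16 d0, d1, d2[3]"
+;; can only encode d0-d7 as the scalar register, while the 32-bit forms can
+;; encode d0-d15; the "x" and "t" constraints above reflect those hardware
+;; limits (the assembly here is purely illustrative).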
+
+;; Attribute used to permit string comparisons against <VQH_mnem> in
+;; neon_type attribute definitions.
+(define_attr "vqh_mnem" "vadd,vmin,vmax" (const_string "vadd"))
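+;; For example, the quad_halves_<code> patterns later in this file set
+;; (set_attr "vqh_mnem" "<VQH_mnem>") and then use (eq_attr "vqh_mnem" "vadd")
+;; to choose between the neon_int_1 and neon_int_5 scheduling classes.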
+
+;; Classification of NEON instructions for scheduling purposes.
+;; Do not set this attribute and the "type" attribute together in
+;; any one instruction pattern.
+(define_attr "neon_type"
+ "neon_int_1,\
+ neon_int_2,\
+ neon_int_3,\
+ neon_int_4,\
+ neon_int_5,\
+ neon_vqneg_vqabs,\
+ neon_vmov,\
+ neon_vaba,\
+ neon_vsma,\
+ neon_vaba_qqq,\
+ neon_mul_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mul_qqq_8_16_32_ddd_32,\
+ neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar,\
+ neon_mla_ddd_8_16_qdd_16_8_long_32_16_long,\
+ neon_mla_qqq_8_16,\
+ neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long,\
+ neon_mla_qqq_32_qqd_32_scalar,\
+ neon_mul_ddd_16_scalar_32_16_long_scalar,\
+ neon_mul_qqd_32_scalar,\
+ neon_mla_ddd_16_scalar_qdd_32_16_long_scalar,\
+ neon_shift_1,\
+ neon_shift_2,\
+ neon_shift_3,\
+ neon_vshl_ddd,\
+ neon_vqshl_vrshl_vqrshl_qqq,\
+ neon_vsra_vrsra,\
+ neon_fp_vadd_ddd_vabs_dd,\
+ neon_fp_vadd_qqq_vabs_qq,\
+ neon_fp_vsum,\
+ neon_fp_vmul_ddd,\
+ neon_fp_vmul_qqd,\
+ neon_fp_vmla_ddd,\
+ neon_fp_vmla_qqq,\
+ neon_fp_vmla_ddd_scalar,\
+ neon_fp_vmla_qqq_scalar,\
+ neon_fp_vrecps_vrsqrts_ddd,\
+ neon_fp_vrecps_vrsqrts_qqq,\
+ neon_bp_simple,\
+ neon_bp_2cycle,\
+ neon_bp_3cycle,\
+ neon_ldr,\
+ neon_str,\
+ neon_vld1_1_2_regs,\
+ neon_vld1_3_4_regs,\
+ neon_vld2_2_regs_vld1_vld2_all_lanes,\
+ neon_vld2_4_regs,\
+ neon_vld3_vld4,\
+ neon_vst1_1_2_regs_vst2_2_regs,\
+ neon_vst1_3_4_regs,\
+ neon_vst2_4_regs_vst3_vst4,\
+ neon_vst3_vst4,\
+ neon_vld1_vld2_lane,\
+ neon_vld3_vld4_lane,\
+ neon_vst1_vst2_lane,\
+ neon_vst3_vst4_lane,\
+ neon_vld3_vld4_all_lanes,\
+ neon_mcr,\
+ neon_mcr_2_mcrr,\
+ neon_mrc,\
+ neon_mrrc,\
+ neon_ldm_2,\
+ neon_stm_2,\
+ none"
+ (const_string "none"))
+
+;; Predicates used for setting the above attribute.
+
+(define_mode_attr Is_float_mode [(V8QI "false") (V16QI "false")
+ (V4HI "false") (V8HI "false")
+ (V2SI "false") (V4SI "false")
+ (V2SF "true") (V4SF "true")
+ (DI "false") (V2DI "false")])
+
+(define_mode_attr Scalar_mul_8_16 [(V8QI "true") (V16QI "true")
+ (V4HI "true") (V8HI "true")
+ (V2SI "false") (V4SI "false")
+ (V2SF "false") (V4SF "false")
+ (DI "false") (V2DI "false")])
+
+
+(define_mode_attr Is_d_reg [(V8QI "true") (V16QI "false")
+ (V4HI "true") (V8HI "false")
+ (V2SI "true") (V4SI "false")
+ (V2SF "true") (V4SF "false")
+ (DI "true") (V2DI "false")])
+
+(define_mode_attr V_mode_nunits [(V8QI "8") (V16QI "16")
+ (V4HI "4") (V8HI "8")
+ (V2SI "2") (V4SI "4")
+ (V2SF "2") (V4SF "4")
+ (DI "1") (V2DI "2")])
+
+;; FIXME: Attributes are probably borked.
+(define_insn "*neon_mov<mode>"
+ [(set (match_operand:VD 0 "nonimmediate_operand"
+ "=w,Uv,w, w, ?r,?w,?r,?r, ?Us")
+ (match_operand:VD 1 "general_operand"
+ " w,w, Dn,Uvi, w, r, r, Usi,r"))]
+ "TARGET_NEON"
+{
+ if (which_alternative == 2)
+ {
+ int width, is_valid;
+ static char templ[40];
+
+ is_valid = neon_immediate_valid_for_move (operands[1], <MODE>mode,
+ &operands[1], &width);
+
+ gcc_assert (is_valid != 0);
+
+ if (width == 0)
+ return "vmov.f32\t%P0, %1 @ <mode>";
+ else
+ sprintf (templ, "vmov.i%d\t%%P0, %%1 @ <mode>", width);
+
+ return templ;
+ }
+
+ /* FIXME: If the memory layout is changed in big-endian mode, output_move_vfp
+ below must be changed to output_move_neon (which will use the
+ element/structure loads/stores), and the constraint changed to 'Un' instead
+ of 'Uv'. */
+
+ switch (which_alternative)
+ {
+ case 0: return "vmov\t%P0, %P1 @ <mode>";
+ case 1: case 3: return output_move_vfp (operands);
+ case 2: gcc_unreachable ();
+ case 4: return "vmov\t%Q0, %R0, %P1 @ <mode>";
+ case 5: return "vmov\t%P0, %Q1, %R1 @ <mode>";
+ default: return output_move_double (operands);
+ }
+}
+ [(set_attr "neon_type" "neon_int_1,*,neon_vmov,*,neon_mrrc,neon_mcr_2_mcrr,*,*,*")
+ (set_attr "type" "*,f_stored,*,f_loadd,*,*,alu,load2,store2")
+ (set_attr "insn" "*,*,*,*,*,*,mov,*,*")
+ (set_attr "length" "4,4,4,4,4,4,8,8,8")
+ (set_attr "pool_range" "*,*,*,1020,*,*,*,1020,*")
+ (set_attr "neg_pool_range" "*,*,*,1008,*,*,*,1008,*")])
+
+(define_insn "*neon_mov<mode>"
+ [(set (match_operand:VQXMOV 0 "nonimmediate_operand"
+ "=w,Un,w, w, ?r,?w,?r,?r, ?Us")
+ (match_operand:VQXMOV 1 "general_operand"
+ " w,w, Dn,Uni, w, r, r, Usi, r"))]
+ "TARGET_NEON"
+{
+ if (which_alternative == 2)
+ {
+ int width, is_valid;
+ static char templ[40];
+
+ is_valid = neon_immediate_valid_for_move (operands[1], <MODE>mode,
+ &operands[1], &width);
+
+ gcc_assert (is_valid != 0);
+
+ if (width == 0)
+ return "vmov.f32\t%q0, %1 @ <mode>";
+ else
+ sprintf (templ, "vmov.i%d\t%%q0, %%1 @ <mode>", width);
+
+ return templ;
+ }
+
+ switch (which_alternative)
+ {
+ case 0: return "vmov\t%q0, %q1 @ <mode>";
+ case 1: case 3: return output_move_neon (operands);
+ case 2: gcc_unreachable ();
+ case 4: return "vmov\t%Q0, %R0, %e1 @ <mode>\;vmov\t%J0, %K0, %f1";
+ case 5: return "vmov\t%e0, %Q1, %R1 @ <mode>\;vmov\t%f0, %J1, %K1";
+ default: return output_move_quad (operands);
+ }
+}
+ [(set_attr "neon_type" "neon_int_1,neon_stm_2,neon_vmov,neon_ldm_2,\
+ neon_mrrc,neon_mcr_2_mcrr,*,*,*")
+ (set_attr "type" "*,*,*,*,*,*,alu,load4,store4")
+ (set_attr "insn" "*,*,*,*,*,*,mov,*,*")
+ (set_attr "length" "4,8,4,8,8,8,16,8,16")
+ (set_attr "pool_range" "*,*,*,1020,*,*,*,1020,*")
+ (set_attr "neg_pool_range" "*,*,*,1008,*,*,*,1008,*")])
+
+(define_expand "movti"
+ [(set (match_operand:TI 0 "nonimmediate_operand" "")
+ (match_operand:TI 1 "general_operand" ""))]
+ "TARGET_NEON"
+{
+})
+
+(define_expand "mov<mode>"
+ [(set (match_operand:VSTRUCT 0 "nonimmediate_operand" "")
+ (match_operand:VSTRUCT 1 "general_operand" ""))]
+ "TARGET_NEON"
+{
+})
+
+;; APPLE LOCAL begin 6160917
+(define_expand "reload_in<mode>"
+ [(parallel [(match_operand:VDQW 0 "s_register_operand" "=w")
+ (match_operand:VDQW 1 "neon_reload_mem_operand" "m")
+ (match_operand:SI 2 "s_register_operand" "=&r")])]
+ "TARGET_NEON"
+ "
+{
+ neon_reload_in (operands, <MODE>mode);
+ DONE;
+}")
+
+(define_expand "reload_out<mode>"
+ [(parallel [(match_operand:VDQW 0 "neon_reload_mem_operand" "=m")
+ (match_operand:VDQW 1 "s_register_operand" "w")
+ (match_operand:SI 2 "s_register_operand" "=&r")])]
+ "TARGET_NEON"
+ "
+{
+ neon_reload_out (operands, <MODE>mode);
+ DONE;
+}")
+;; APPLE LOCAL end 6160917
+
+(define_insn "*neon_mov<mode>"
+ [(set (match_operand:VSTRUCT 0 "nonimmediate_operand" "=w,Ut,w")
+ (match_operand:VSTRUCT 1 "general_operand" " w,w, Ut"))]
+ "TARGET_NEON"
+{
+ switch (which_alternative)
+ {
+ case 0: return "#";
+ case 1: case 2: return output_move_neon (operands);
+ default: gcc_unreachable ();
+ }
+}
+ [(set_attr "length" "<V_slen>,<V_slen>,<V_slen>")])
+
+(define_split
+ [(set (match_operand:EI 0 "s_register_operand" "")
+ (match_operand:EI 1 "s_register_operand" ""))]
+ "TARGET_NEON && reload_completed"
+ [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))]
+{
+ int rdest = REGNO (operands[0]);
+ int rsrc = REGNO (operands[1]);
+ rtx dest[2], src[2];
+
+ dest[0] = gen_rtx_REG (TImode, rdest);
+ src[0] = gen_rtx_REG (TImode, rsrc);
+ dest[1] = gen_rtx_REG (DImode, rdest + 4);
+ src[1] = gen_rtx_REG (DImode, rsrc + 4);
+
+ neon_disambiguate_copy (operands, dest, src, 2);
+})
+
+(define_split
+ [(set (match_operand:OI 0 "s_register_operand" "")
+ (match_operand:OI 1 "s_register_operand" ""))]
+ "TARGET_NEON && reload_completed"
+ [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))]
+{
+ int rdest = REGNO (operands[0]);
+ int rsrc = REGNO (operands[1]);
+ rtx dest[2], src[2];
+
+ dest[0] = gen_rtx_REG (TImode, rdest);
+ src[0] = gen_rtx_REG (TImode, rsrc);
+ dest[1] = gen_rtx_REG (TImode, rdest + 4);
+ src[1] = gen_rtx_REG (TImode, rsrc + 4);
+
+ neon_disambiguate_copy (operands, dest, src, 2);
+})
+
+(define_split
+ [(set (match_operand:CI 0 "s_register_operand" "")
+ (match_operand:CI 1 "s_register_operand" ""))]
+ "TARGET_NEON && reload_completed"
+ [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))
+ (set (match_dup 4) (match_dup 5))]
+{
+ int rdest = REGNO (operands[0]);
+ int rsrc = REGNO (operands[1]);
+ rtx dest[3], src[3];
+
+ dest[0] = gen_rtx_REG (TImode, rdest);
+ src[0] = gen_rtx_REG (TImode, rsrc);
+ dest[1] = gen_rtx_REG (TImode, rdest + 4);
+ src[1] = gen_rtx_REG (TImode, rsrc + 4);
+ dest[2] = gen_rtx_REG (TImode, rdest + 8);
+ src[2] = gen_rtx_REG (TImode, rsrc + 8);
+
+ neon_disambiguate_copy (operands, dest, src, 3);
+})
+
+(define_split
+ [(set (match_operand:XI 0 "s_register_operand" "")
+ (match_operand:XI 1 "s_register_operand" ""))]
+ "TARGET_NEON && reload_completed"
+ [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))
+ (set (match_dup 4) (match_dup 5))
+ (set (match_dup 6) (match_dup 7))]
+{
+ int rdest = REGNO (operands[0]);
+ int rsrc = REGNO (operands[1]);
+ rtx dest[4], src[4];
+
+ dest[0] = gen_rtx_REG (TImode, rdest);
+ src[0] = gen_rtx_REG (TImode, rsrc);
+ dest[1] = gen_rtx_REG (TImode, rdest + 4);
+ src[1] = gen_rtx_REG (TImode, rsrc + 4);
+ dest[2] = gen_rtx_REG (TImode, rdest + 8);
+ src[2] = gen_rtx_REG (TImode, rsrc + 8);
+ dest[3] = gen_rtx_REG (TImode, rdest + 12);
+ src[3] = gen_rtx_REG (TImode, rsrc + 12);
+
+ neon_disambiguate_copy (operands, dest, src, 4);
+})
+
+; FIXME: Set/extract/init quads.
+
+(define_insn "vec_set<mode>"
+ [(set (match_operand:VD 0 "s_register_operand" "+w")
+ (vec_merge:VD
+ (match_operand:VD 3 "s_register_operand" "0")
+ (vec_duplicate:VD
+ (match_operand:<V_elem> 1 "s_register_operand" "r"))
+ (ashift:SI (const_int 1)
+ (match_operand:SI 2 "immediate_operand" "i"))))]
+ "TARGET_NEON"
+ "vmov%?.<V_uf_sclr>\t%P0[%c2], %1"
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_mcr")]
+)
+
+(define_insn "vec_set<mode>"
+ [(set (match_operand:VQ 0 "s_register_operand" "+w")
+ (vec_merge:VQ
+ (match_operand:VQ 3 "s_register_operand" "0")
+ (vec_duplicate:VQ
+ (match_operand:<V_elem> 1 "s_register_operand" "r"))
+ (ashift:SI (const_int 1)
+ (match_operand:SI 2 "immediate_operand" "i"))))]
+ "TARGET_NEON"
+{
+ int half_elts = GET_MODE_NUNITS (<MODE>mode) / 2;
+ int elt = INTVAL (operands[2]) % half_elts;
+ int hi = (INTVAL (operands[2]) / half_elts) * 2;
+ int regno = REGNO (operands[0]);
+
+ operands[0] = gen_rtx_REG (<V_HALF>mode, regno + hi);
+ operands[2] = GEN_INT (elt);
+
+ return "vmov%?.<V_uf_sclr>\t%P0[%c2], %1";
+}
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_mcr")]
+)
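+;; As a worked example of the index arithmetic above: for V8HImode with
+;; operands[2] == 5, half_elts is 4, so elt becomes 5 % 4 == 1 and hi becomes
+;; (5 / 4) * 2 == 2; element 5 of the quad register is therefore written as
+;; lane 1 of the D register forming its upper half.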
+
+(define_insn "vec_setv2di"
+ [(set (match_operand:V2DI 0 "s_register_operand" "+w")
+ (vec_merge:V2DI
+ (match_operand:V2DI 3 "s_register_operand" "0")
+ (vec_duplicate:V2DI
+ (match_operand:DI 1 "s_register_operand" "r"))
+ (ashift:SI (const_int 1)
+ (match_operand:SI 2 "immediate_operand" "i"))))]
+ "TARGET_NEON"
+{
+ int regno = REGNO (operands[0]) + INTVAL (operands[2]);
+
+ operands[0] = gen_rtx_REG (DImode, regno);
+
+ return "vmov%?.64\t%P0, %Q1, %R1";
+}
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_mcr_2_mcrr")]
+)
+
+(define_insn "vec_extract<mode>"
+ [(set (match_operand:<V_elem> 0 "s_register_operand" "=r")
+ (vec_select:<V_elem>
+ (match_operand:VD 1 "s_register_operand" "w")
+ (parallel [(match_operand:SI 2 "immediate_operand" "i")])))]
+ "TARGET_NEON"
+ "vmov%?.<V_uf_sclr>\t%0, %P1[%c2]"
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "vec_extract<mode>"
+ [(set (match_operand:<V_elem> 0 "s_register_operand" "=r")
+ (vec_select:<V_elem>
+ (match_operand:VQ 1 "s_register_operand" "w")
+ (parallel [(match_operand:SI 2 "immediate_operand" "i")])))]
+ "TARGET_NEON"
+{
+ int half_elts = GET_MODE_NUNITS (<MODE>mode) / 2;
+ int elt = INTVAL (operands[2]) % half_elts;
+ int hi = (INTVAL (operands[2]) / half_elts) * 2;
+ int regno = REGNO (operands[1]);
+
+ operands[1] = gen_rtx_REG (<V_HALF>mode, regno + hi);
+ operands[2] = GEN_INT (elt);
+
+ return "vmov%?.<V_uf_sclr>\t%0, %P1[%c2]";
+}
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "vec_extractv2di"
+ [(set (match_operand:DI 0 "s_register_operand" "=r")
+ (vec_select:DI
+ (match_operand:V2DI 1 "s_register_operand" "w")
+ (parallel [(match_operand:SI 2 "immediate_operand" "i")])))]
+ "TARGET_NEON"
+{
+ int regno = REGNO (operands[1]) + INTVAL (operands[2]);
+
+ operands[1] = gen_rtx_REG (DImode, regno);
+
+ return "vmov%?.64\t%Q0, %R0, %P1";
+}
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_int_1")]
+)
+
+(define_expand "vec_init<mode>"
+ [(match_operand:VDQ 0 "s_register_operand" "")
+ (match_operand 1 "" "")]
+ "TARGET_NEON"
+{
+ neon_expand_vector_init (operands[0], operands[1]);
+ DONE;
+})
+
+;; Doubleword and quadword arithmetic.
+
+;; NOTE: vadd/vsub and some other instructions also support 64-bit integer
+;; element size, which we could potentially use for "long long" operations. We
+;; don't want to do this at present though, because moving values from the
+;; vector unit to the ARM core is currently slow and 64-bit addition (etc.) is
+;; easy to do with ARM instructions anyway.
+
+(define_insn "*add<mode>3_neon"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+ (plus:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+ (match_operand:VDQ 2 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vadd.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_int_1")))]
+)
+
+(define_insn "*sub<mode>3_neon"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+ (minus:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+ (match_operand:VDQ 2 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vsub.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_int_2")))]
+)
+
+(define_insn "*mul<mode>3_neon"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+ (mult:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+ (match_operand:VDQ 2 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vmul.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (if_then_else
+ (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long")
+ (const_string "neon_mul_qqq_8_16_32_ddd_32"))
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mul_qqq_8_16_32_ddd_32")
+ (const_string "neon_mul_qqq_8_16_32_ddd_32")))))]
+)
+
+(define_insn "ior<mode>3"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w,w")
+ (ior:VDQ (match_operand:VDQ 1 "s_register_operand" "w,0")
+ (match_operand:VDQ 2 "neon_logic_op2" "w,Dl")))]
+ "TARGET_NEON"
+{
+ switch (which_alternative)
+ {
+ case 0: return "vorr\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
+ case 1: return neon_output_logic_immediate ("vorr", &operands[2],
+ <MODE>mode, 0, VALID_NEON_QREG_MODE (<MODE>mode));
+ default: gcc_unreachable ();
+ }
+}
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "iordi3_neon"
+ [(set (match_operand:DI 0 "s_register_operand" "=w,w")
+ (unspec:DI [(match_operand:DI 1 "s_register_operand" "w,0")
+ (match_operand:DI 2 "neon_logic_op2" "w,Dl")]
+ UNSPEC_VORR))]
+ "TARGET_NEON"
+{
+ switch (which_alternative)
+ {
+ case 0: return "vorr\t%P0, %P1, %P2";
+ case 1: return neon_output_logic_immediate ("vorr", &operands[2],
+ DImode, 0, VALID_NEON_QREG_MODE (DImode));
+ default: gcc_unreachable ();
+ }
+}
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+;; The concrete forms of the Neon immediate-logic instructions are vbic and
+;; vorr. We support the pseudo-instruction vand instead, because that
+;; corresponds to the canonical form the middle-end expects to use for
+;; immediate bitwise-ANDs.
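+;; For instance, an AND that keeps all but the low byte of each 32-bit lane
+;; (mask 0xffffff00) is matched by the "DL" alternatives below and emitted as
+;; the equivalent vbic of the complemented immediate, i.e. a vbic.i32 with
+;; #0xff replicated per lane.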
+
+(define_insn "and<mode>3"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w,w")
+ (and:VDQ (match_operand:VDQ 1 "s_register_operand" "w,0")
+ (match_operand:VDQ 2 "neon_inv_logic_op2" "w,DL")))]
+ "TARGET_NEON"
+{
+ switch (which_alternative)
+ {
+ case 0: return "vand\t%<V_reg>0, %<V_reg>1, %<V_reg>2";
+ case 1: return neon_output_logic_immediate ("vand", &operands[2],
+ <MODE>mode, 1, VALID_NEON_QREG_MODE (<MODE>mode));
+ default: gcc_unreachable ();
+ }
+}
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "anddi3_neon"
+ [(set (match_operand:DI 0 "s_register_operand" "=w,w")
+ (unspec:DI [(match_operand:DI 1 "s_register_operand" "w,0")
+ (match_operand:DI 2 "neon_inv_logic_op2" "w,DL")]
+ UNSPEC_VAND))]
+ "TARGET_NEON"
+{
+ switch (which_alternative)
+ {
+ case 0: return "vand\t%P0, %P1, %P2";
+ case 1: return neon_output_logic_immediate ("vand", &operands[2],
+ DImode, 1, VALID_NEON_QREG_MODE (DImode));
+ default: gcc_unreachable ();
+ }
+}
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "orn<mode>3_neon"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+ (ior:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+ (not:VDQ (match_operand:VDQ 2 "s_register_operand" "w"))))]
+ "TARGET_NEON"
+ "vorn\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "orndi3_neon"
+ [(set (match_operand:DI 0 "s_register_operand" "=w")
+ (unspec:DI [(match_operand:DI 1 "s_register_operand" "w")
+ (match_operand:DI 2 "s_register_operand" "w")]
+ UNSPEC_VORN))]
+ "TARGET_NEON"
+ "vorn\t%P0, %P1, %P2"
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "bic<mode>3_neon"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+ (and:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+ (not:VDQ (match_operand:VDQ 2 "s_register_operand" "w"))))]
+ "TARGET_NEON"
+ "vbic\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "bicdi3_neon"
+ [(set (match_operand:DI 0 "s_register_operand" "=w")
+ (unspec:DI [(match_operand:DI 1 "s_register_operand" "w")
+ (match_operand:DI 2 "s_register_operand" "w")]
+ UNSPEC_VBIC))]
+ "TARGET_NEON"
+ "vbic\t%P0, %P1, %P2"
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "xor<mode>3"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+ (xor:VDQ (match_operand:VDQ 1 "s_register_operand" "w")
+ (match_operand:VDQ 2 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "veor\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "xordi3_neon"
+ [(set (match_operand:DI 0 "s_register_operand" "=w")
+ (unspec:DI [(match_operand:DI 1 "s_register_operand" "w")
+ (match_operand:DI 2 "s_register_operand" "w")]
+ UNSPEC_VEOR))]
+ "TARGET_NEON"
+ "veor\t%P0, %P1, %P2"
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "one_cmpl<mode>2"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+ (not:VDQ (match_operand:VDQ 1 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vmvn\t%<V_reg>0, %<V_reg>1"
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "abs<mode>2"
+ [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+ (abs:VDQW (match_operand:VDQW 1 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vabs.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_int_3")))]
+)
+
+(define_insn "neg<mode>2"
+ [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+ (neg:VDQW (match_operand:VDQW 1 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vneg.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_int_3")))]
+)
+
+(define_insn "*umin<mode>3_neon"
+ [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+ (umin:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
+ (match_operand:VDQIW 2 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vmin.<V_u_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set_attr "neon_type" "neon_int_5")]
+)
+
+(define_insn "*umax<mode>3_neon"
+ [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+ (umax:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
+ (match_operand:VDQIW 2 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vmax.<V_u_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set_attr "neon_type" "neon_int_5")]
+)
+
+(define_insn "*smin<mode>3_neon"
+ [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+ (smin:VDQW (match_operand:VDQW 1 "s_register_operand" "w")
+ (match_operand:VDQW 2 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vmin.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_int_5")))]
+)
+
+(define_insn "*smax<mode>3_neon"
+ [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+ (smax:VDQW (match_operand:VDQW 1 "s_register_operand" "w")
+ (match_operand:VDQW 2 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vmax.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_int_5")))]
+)
+
+; TODO: V2DI shifts are currently disabled because there are bugs in the
+; generic vectorizer code. It ends up creating a V2DI constructor with
+; SImode elements.
+
+(define_insn "ashl<mode>3"
+ [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+ (ashift:VDQIW (match_operand:VDQIW 1 "s_register_operand" "w")
+ (match_operand:VDQIW 2 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vshl.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_vshl_ddd")
+ (const_string "neon_shift_3")))]
+)
+
+; Used for implementing arithmetic shift-right, which is a left-shift by a
+; negative amount, with signed operands. This is essentially the same as
+; ashl<mode>3 above, but using an unspec in case GCC tries anything tricky
+; with negative shift amounts.
+
+(define_insn "ashl<mode>3_signed"
+ [(set (match_operand:VDQI 0 "s_register_operand" "=w")
+ (unspec:VDQI [(match_operand:VDQI 1 "s_register_operand" "w")
+ (match_operand:VDQI 2 "s_register_operand" "w")]
+ UNSPEC_ASHIFT_SIGNED))]
+ "TARGET_NEON"
+ "vshl.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_vshl_ddd")
+ (const_string "neon_shift_3")))]
+)
+
+; Used for implementing logical shift-right, which is a left-shift by a negative
+; amount, with unsigned operands.
+
+(define_insn "ashl<mode>3_unsigned"
+ [(set (match_operand:VDQI 0 "s_register_operand" "=w")
+ (unspec:VDQI [(match_operand:VDQI 1 "s_register_operand" "w")
+ (match_operand:VDQI 2 "s_register_operand" "w")]
+ UNSPEC_ASHIFT_UNSIGNED))]
+ "TARGET_NEON"
+ "vshl.<V_u_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_vshl_ddd")
+ (const_string "neon_shift_3")))]
+)
+
+(define_expand "ashr<mode>3"
+ [(set (match_operand:VDQIW 0 "s_register_operand" "")
+ (ashiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand" "")
+ (match_operand:VDQIW 2 "s_register_operand" "")))]
+ "TARGET_NEON"
+{
+ rtx neg = gen_reg_rtx (<MODE>mode);
+
+ emit_insn (gen_neg<mode>2 (neg, operands[2]));
+ emit_insn (gen_ashl<mode>3_signed (operands[0], operands[1], neg));
+
+ DONE;
+})
+
+(define_expand "lshr<mode>3"
+ [(set (match_operand:VDQIW 0 "s_register_operand" "")
+ (lshiftrt:VDQIW (match_operand:VDQIW 1 "s_register_operand" "")
+ (match_operand:VDQIW 2 "s_register_operand" "")))]
+ "TARGET_NEON"
+{
+ rtx neg = gen_reg_rtx (<MODE>mode);
+
+ emit_insn (gen_neg<mode>2 (neg, operands[2]));
+ emit_insn (gen_ashl<mode>3_unsigned (operands[0], operands[1], neg));
+
+ DONE;
+})
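+;; The two expanders above implement vector right shifts with NEON's
+;; shift-left-by-signed-vector instruction and a negated count.  Roughly, in
+;; arm_neon.h intrinsics (illustrative function names, not generated by these
+;; patterns):
+;;
+;;   #include <arm_neon.h>
+;;
+;;   int32x4_t shr_s32 (int32x4_t x, int32x4_t count)
+;;   {
+;;     return vshlq_s32 (x, vnegq_s32 (count));   /* arithmetic shift right */
+;;   }
+;;
+;;   uint32x4_t shr_u32 (uint32x4_t x, int32x4_t count)
+;;   {
+;;     return vshlq_u32 (x, vnegq_s32 (count));   /* logical shift right */
+;;   }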
+
+;; Widening operations
+
+;; FIXME: I'm not sure if sign/zero_extend are legal to use on vector modes.
+
+(define_insn "widen_ssum<mode>3"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (plus:<V_widen> (sign_extend:<V_widen>
+ (match_operand:VW 1 "s_register_operand" "%w"))
+ (match_operand:<V_widen> 2 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vaddw.<V_s_elem>\t%q0, %q2, %P1"
+ [(set_attr "neon_type" "neon_int_3")]
+)
+
+(define_insn "widen_usum<mode>3"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (plus:<V_widen> (zero_extend:<V_widen>
+ (match_operand:VW 1 "s_register_operand" "%w"))
+ (match_operand:<V_widen> 2 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vaddw.<V_u_elem>\t%q0, %q2, %P1"
+ [(set_attr "neon_type" "neon_int_3")]
+)
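+;; Roughly, in arm_neon.h intrinsics (illustrative only), the widen_ssum
+;; pattern corresponds to accumulating narrow elements into a wider sum
+;; without intermediate overflow:
+;;
+;;   int16x8_t widen_accumulate (int16x8_t acc, int8x8_t x)
+;;   {
+;;     return vaddw_s8 (acc, x);   /* acc + sign_extend (x) */
+;;   }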
+
+;; VEXT can be used to synthesize coarse whole-vector shifts with 8-bit
+;; shift-count granularity. That's good enough for the middle-end's current
+;; needs.
+
+(define_expand "vec_shr_<mode>"
+ [(match_operand:VDQ 0 "s_register_operand" "")
+ (match_operand:VDQ 1 "s_register_operand" "")
+ (match_operand:SI 2 "const_multiple_of_8_operand" "")]
+ "TARGET_NEON"
+{
+ rtx zero_reg;
+ HOST_WIDE_INT num_bits = INTVAL (operands[2]);
+ const int width = GET_MODE_BITSIZE (<MODE>mode);
+ const enum machine_mode bvecmode = (width == 128) ? V16QImode : V8QImode;
+ rtx (*gen_ext) (rtx, rtx, rtx, rtx) =
+ (width == 128) ? gen_neon_vextv16qi : gen_neon_vextv8qi;
+
+ if (num_bits == width)
+ {
+ emit_move_insn (operands[0], operands[1]);
+ DONE;
+ }
+
+ zero_reg = force_reg (bvecmode, CONST0_RTX (bvecmode));
+ operands[0] = gen_lowpart (bvecmode, operands[0]);
+ operands[1] = gen_lowpart (bvecmode, operands[1]);
+
+ emit_insn (gen_ext (operands[0], operands[1], zero_reg,
+ GEN_INT (num_bits / BITS_PER_UNIT)));
+ DONE;
+})
+
+(define_expand "vec_shl_<mode>"
+ [(match_operand:VDQ 0 "s_register_operand" "")
+ (match_operand:VDQ 1 "s_register_operand" "")
+ (match_operand:SI 2 "const_multiple_of_8_operand" "")]
+ "TARGET_NEON"
+{
+ rtx zero_reg;
+ HOST_WIDE_INT num_bits = INTVAL (operands[2]);
+ const int width = GET_MODE_BITSIZE (<MODE>mode);
+ const enum machine_mode bvecmode = (width == 128) ? V16QImode : V8QImode;
+ rtx (*gen_ext) (rtx, rtx, rtx, rtx) =
+ (width == 128) ? gen_neon_vextv16qi : gen_neon_vextv8qi;
+
+ if (num_bits == 0)
+ {
+ emit_move_insn (operands[0], CONST0_RTX (<MODE>mode));
+ DONE;
+ }
+
+ num_bits = width - num_bits;
+
+ zero_reg = force_reg (bvecmode, CONST0_RTX (bvecmode));
+ operands[0] = gen_lowpart (bvecmode, operands[0]);
+ operands[1] = gen_lowpart (bvecmode, operands[1]);
+
+ emit_insn (gen_ext (operands[0], zero_reg, operands[1],
+ GEN_INT (num_bits / BITS_PER_UNIT)));
+ DONE;
+})
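+;; Roughly, in arm_neon.h intrinsics (illustrative only), shifting a 128-bit
+;; vector right by 32 bits with zero fill mirrors what these expanders
+;; generate:
+;;
+;;   uint8x16_t whole_vector_shr_32 (uint8x16_t x)
+;;   {
+;;     return vextq_u8 (x, vdupq_n_u8 (0), 4);   /* take bytes 4..15 of x,
+;;                                                  then 4 zero bytes */
+;;   }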
+
+;; Helpers for quad-word reduction operations
+
+; Add (or smin, smax...) the low N/2 elements of the N-element vector
+; operand[1] to the high N/2 elements of same. Put the result in operand[0], an
+; N/2-element vector.
+
+(define_insn "quad_halves_<code>v4si"
+ [(set (match_operand:V2SI 0 "s_register_operand" "=w")
+ (vqh_ops:V2SI
+ (vec_select:V2SI (match_operand:V4SI 1 "s_register_operand" "w")
+ (parallel [(const_int 0) (const_int 1)]))
+ (vec_select:V2SI (match_dup 1)
+ (parallel [(const_int 2) (const_int 3)]))))]
+ "TARGET_NEON"
+ "<VQH_mnem>.<VQH_sign>32\t%P0, %e1, %f1"
+ [(set_attr "vqh_mnem" "<VQH_mnem>")
+ (set (attr "neon_type")
+ (if_then_else (eq_attr "vqh_mnem" "vadd")
+ (const_string "neon_int_1") (const_string "neon_int_5")))]
+)
+
+(define_insn "quad_halves_<code>v4sf"
+ [(set (match_operand:V2SF 0 "s_register_operand" "=w")
+ (vqhs_ops:V2SF
+ (vec_select:V2SF (match_operand:V4SF 1 "s_register_operand" "w")
+ (parallel [(const_int 0) (const_int 1)]))
+ (vec_select:V2SF (match_dup 1)
+ (parallel [(const_int 2) (const_int 3)]))))]
+ "TARGET_NEON"
+ "<VQH_mnem>.f32\t%P0, %e1, %f1"
+ [(set_attr "vqh_mnem" "<VQH_mnem>")
+ (set (attr "neon_type")
+ (if_then_else (eq_attr "vqh_mnem" "vadd")
+ (const_string "neon_int_1") (const_string "neon_int_5")))]
+)
+
+(define_insn "quad_halves_<code>v8hi"
+ [(set (match_operand:V4HI 0 "s_register_operand" "+w")
+ (vqh_ops:V4HI
+ (vec_select:V4HI (match_operand:V8HI 1 "s_register_operand" "w")
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)]))
+ (vec_select:V4HI (match_dup 1)
+ (parallel [(const_int 4) (const_int 5)
+ (const_int 6) (const_int 7)]))))]
+ "TARGET_NEON"
+ "<VQH_mnem>.<VQH_sign>16\t%P0, %e1, %f1"
+ [(set_attr "vqh_mnem" "<VQH_mnem>")
+ (set (attr "neon_type")
+ (if_then_else (eq_attr "vqh_mnem" "vadd")
+ (const_string "neon_int_1") (const_string "neon_int_5")))]
+)
+
+(define_insn "quad_halves_<code>v16qi"
+ [(set (match_operand:V8QI 0 "s_register_operand" "+w")
+ (vqh_ops:V8QI
+ (vec_select:V8QI (match_operand:V16QI 1 "s_register_operand" "w")
+ (parallel [(const_int 0) (const_int 1)
+ (const_int 2) (const_int 3)
+ (const_int 4) (const_int 5)
+ (const_int 6) (const_int 7)]))
+ (vec_select:V8QI (match_dup 1)
+ (parallel [(const_int 8) (const_int 9)
+ (const_int 10) (const_int 11)
+ (const_int 12) (const_int 13)
+ (const_int 14) (const_int 15)]))))]
+ "TARGET_NEON"
+ "<VQH_mnem>.<VQH_sign>8\t%P0, %e1, %f1"
+ [(set_attr "vqh_mnem" "<VQH_mnem>")
+ (set (attr "neon_type")
+ (if_then_else (eq_attr "vqh_mnem" "vadd")
+ (const_string "neon_int_1") (const_string "neon_int_5")))]
+)
+
+; FIXME: We wouldn't need the following insns if we could write subregs of
+; vector registers. Make an attempt at removing unnecessary moves, though
+; we're really at the mercy of the register allocator.
+
+(define_insn "move_lo_quad_v4si"
+ [(set (match_operand:V4SI 0 "s_register_operand" "+w")
+ (vec_concat:V4SI
+ (match_operand:V2SI 1 "s_register_operand" "w")
+ (vec_select:V2SI (match_dup 0)
+ (parallel [(const_int 2) (const_int 3)]))))]
+ "TARGET_NEON"
+{
+ int dest = REGNO (operands[0]);
+ int src = REGNO (operands[1]);
+
+ if (dest != src)
+ return "vmov\t%e0, %P1";
+ else
+ return "";
+}
+ [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "move_lo_quad_v4sf"
+ [(set (match_operand:V4SF 0 "s_register_operand" "+w")
+ (vec_concat:V4SF
+ (match_operand:V2SF 1 "s_register_operand" "w")
+ (vec_select:V2SF (match_dup 0)
+ (parallel [(const_int 2) (const_int 3)]))))]
+ "TARGET_NEON"
+{
+ int dest = REGNO (operands[0]);
+ int src = REGNO (operands[1]);
+
+ if (dest != src)
+ return "vmov\t%e0, %P1";
+ else
+ return "";
+}
+ [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "move_lo_quad_v8hi"
+ [(set (match_operand:V8HI 0 "s_register_operand" "+w")
+ (vec_concat:V8HI
+ (match_operand:V4HI 1 "s_register_operand" "w")
+ (vec_select:V4HI (match_dup 0)
+ (parallel [(const_int 4) (const_int 5)
+ (const_int 6) (const_int 7)]))))]
+ "TARGET_NEON"
+{
+ int dest = REGNO (operands[0]);
+ int src = REGNO (operands[1]);
+
+ if (dest != src)
+ return "vmov\t%e0, %P1";
+ else
+ return "";
+}
+ [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "move_lo_quad_v16qi"
+ [(set (match_operand:V16QI 0 "s_register_operand" "+w")
+ (vec_concat:V16QI
+ (match_operand:V8QI 1 "s_register_operand" "w")
+ (vec_select:V8QI (match_dup 0)
+ (parallel [(const_int 8) (const_int 9)
+ (const_int 10) (const_int 11)
+ (const_int 12) (const_int 13)
+ (const_int 14) (const_int 15)]))))]
+ "TARGET_NEON"
+{
+ int dest = REGNO (operands[0]);
+ int src = REGNO (operands[1]);
+
+ if (dest != src)
+ return "vmov\t%e0, %P1";
+ else
+ return "";
+}
+ [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+;; Reduction operations
+
+(define_expand "reduc_splus_<mode>"
+ [(match_operand:VD 0 "s_register_operand" "")
+ (match_operand:VD 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ neon_pairwise_reduce (operands[0], operands[1], <MODE>mode,
+ &gen_neon_vpadd_internal<mode>);
+ DONE;
+})
+
+(define_expand "reduc_splus_<mode>"
+ [(match_operand:VQ 0 "s_register_operand" "")
+ (match_operand:VQ 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ rtx step1 = gen_reg_rtx (<V_HALF>mode);
+ rtx res_d = gen_reg_rtx (<V_HALF>mode);
+
+ emit_insn (gen_quad_halves_plus<mode> (step1, operands[1]));
+ emit_insn (gen_reduc_splus_<V_half> (res_d, step1));
+ emit_insn (gen_move_lo_quad_<mode> (operands[0], res_d));
+
+ DONE;
+})
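+;; Roughly, in arm_neon.h intrinsics (illustrative only), the quad-word
+;; strategy above for the V4SI case is: fold the two D halves together, then
+;; reduce the remaining D register pairwise:
+;;
+;;   int32x2_t sum_v4si (int32x4_t x)
+;;   {
+;;     int32x2_t halves = vadd_s32 (vget_low_s32 (x), vget_high_s32 (x));
+;;     return vpadd_s32 (halves, halves);   /* both lanes hold the total */
+;;   }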
+
+(define_insn "reduc_splus_v2di"
+ [(set (match_operand:V2DI 0 "s_register_operand" "=w")
+ (unspec:V2DI [(match_operand:V2DI 1 "s_register_operand" "w")]
+ UNSPEC_VPADD))]
+ "TARGET_NEON"
+ "vadd.i64\t%e0, %e1, %f1"
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+;; NEON does not distinguish between signed and unsigned addition except on
+;; widening operations.
+(define_expand "reduc_uplus_<mode>"
+ [(match_operand:VDQI 0 "s_register_operand" "")
+ (match_operand:VDQI 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ emit_insn (gen_reduc_splus_<mode> (operands[0], operands[1]));
+ DONE;
+})
+
+(define_expand "reduc_smin_<mode>"
+ [(match_operand:VD 0 "s_register_operand" "")
+ (match_operand:VD 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ neon_pairwise_reduce (operands[0], operands[1], <MODE>mode,
+ &gen_neon_vpsmin<mode>);
+ DONE;
+})
+
+(define_expand "reduc_smin_<mode>"
+ [(match_operand:VQ 0 "s_register_operand" "")
+ (match_operand:VQ 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ rtx step1 = gen_reg_rtx (<V_HALF>mode);
+ rtx res_d = gen_reg_rtx (<V_HALF>mode);
+
+ emit_insn (gen_quad_halves_smin<mode> (step1, operands[1]));
+ emit_insn (gen_reduc_smin_<V_half> (res_d, step1));
+ emit_insn (gen_move_lo_quad_<mode> (operands[0], res_d));
+
+ DONE;
+})
+
+(define_expand "reduc_smax_<mode>"
+ [(match_operand:VD 0 "s_register_operand" "")
+ (match_operand:VD 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ neon_pairwise_reduce (operands[0], operands[1], <MODE>mode,
+ &gen_neon_vpsmax<mode>);
+ DONE;
+})
+
+(define_expand "reduc_smax_<mode>"
+ [(match_operand:VQ 0 "s_register_operand" "")
+ (match_operand:VQ 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ rtx step1 = gen_reg_rtx (<V_HALF>mode);
+ rtx res_d = gen_reg_rtx (<V_HALF>mode);
+
+ emit_insn (gen_quad_halves_smax<mode> (step1, operands[1]));
+ emit_insn (gen_reduc_smax_<V_half> (res_d, step1));
+ emit_insn (gen_move_lo_quad_<mode> (operands[0], res_d));
+
+ DONE;
+})
+
+(define_expand "reduc_umin_<mode>"
+ [(match_operand:VDI 0 "s_register_operand" "")
+ (match_operand:VDI 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ neon_pairwise_reduce (operands[0], operands[1], <MODE>mode,
+ &gen_neon_vpumin<mode>);
+ DONE;
+})
+
+(define_expand "reduc_umin_<mode>"
+ [(match_operand:VQI 0 "s_register_operand" "")
+ (match_operand:VQI 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ rtx step1 = gen_reg_rtx (<V_HALF>mode);
+ rtx res_d = gen_reg_rtx (<V_HALF>mode);
+
+ emit_insn (gen_quad_halves_umin<mode> (step1, operands[1]));
+ emit_insn (gen_reduc_umin_<V_half> (res_d, step1));
+ emit_insn (gen_move_lo_quad_<mode> (operands[0], res_d));
+
+ DONE;
+})
+
+(define_expand "reduc_umax_<mode>"
+ [(match_operand:VDI 0 "s_register_operand" "")
+ (match_operand:VDI 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ neon_pairwise_reduce (operands[0], operands[1], <MODE>mode,
+ &gen_neon_vpumax<mode>);
+ DONE;
+})
+
+(define_expand "reduc_umax_<mode>"
+ [(match_operand:VQI 0 "s_register_operand" "")
+ (match_operand:VQI 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ rtx step1 = gen_reg_rtx (<V_HALF>mode);
+ rtx res_d = gen_reg_rtx (<V_HALF>mode);
+
+ emit_insn (gen_quad_halves_umax<mode> (step1, operands[1]));
+ emit_insn (gen_reduc_umax_<V_half> (res_d, step1));
+ emit_insn (gen_move_lo_quad_<mode> (operands[0], res_d));
+
+ DONE;
+})
+
+(define_insn "neon_vpadd_internal<mode>"
+ [(set (match_operand:VD 0 "s_register_operand" "=w")
+ (unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
+ (match_operand:VD 2 "s_register_operand" "w")]
+ UNSPEC_VPADD))]
+ "TARGET_NEON"
+ "vpadd.<V_if_elem>\t%P0, %P1, %P2"
+ ;; Assume this schedules like vadd.
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_int_1")))]
+)
+
+(define_insn "neon_vpsmin<mode>"
+ [(set (match_operand:VD 0 "s_register_operand" "=w")
+ (unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
+ (match_operand:VD 2 "s_register_operand" "w")]
+ UNSPEC_VPSMIN))]
+ "TARGET_NEON"
+ "vpmin.<V_s_elem>\t%P0, %P1, %P2"
+ ;; Assume this schedules like vmin.
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vpsmax<mode>"
+ [(set (match_operand:VD 0 "s_register_operand" "=w")
+ (unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
+ (match_operand:VD 2 "s_register_operand" "w")]
+ UNSPEC_VPSMAX))]
+ "TARGET_NEON"
+ "vpmax.<V_s_elem>\t%P0, %P1, %P2"
+ ;; Assume this schedules like vmax.
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vpumin<mode>"
+ [(set (match_operand:VDI 0 "s_register_operand" "=w")
+ (unspec:VDI [(match_operand:VDI 1 "s_register_operand" "w")
+ (match_operand:VDI 2 "s_register_operand" "w")]
+ UNSPEC_VPUMIN))]
+ "TARGET_NEON"
+ "vpmin.<V_u_elem>\t%P0, %P1, %P2"
+ ;; Assume this schedules like umin.
+ [(set_attr "neon_type" "neon_int_5")]
+)
+
+(define_insn "neon_vpumax<mode>"
+ [(set (match_operand:VDI 0 "s_register_operand" "=w")
+ (unspec:VDI [(match_operand:VDI 1 "s_register_operand" "w")
+ (match_operand:VDI 2 "s_register_operand" "w")]
+ UNSPEC_VPUMAX))]
+ "TARGET_NEON"
+ "vpmax.<V_u_elem>\t%P0, %P1, %P2"
+ ;; Assume this schedules like umax.
+ [(set_attr "neon_type" "neon_int_5")]
+)
+
+;; Saturating arithmetic
+
+; NOTE: Neon supports many more saturating variants of instructions than the
+; following, but these are all that GCC currently understands.
+; FIXME: Actually, GCC doesn't know how to create saturating add/sub by itself
+; yet either, although these patterns may be used by intrinsics when they're
+; added.
+
+(define_insn "*ss_add<mode>_neon"
+ [(set (match_operand:VD 0 "s_register_operand" "=w")
+ (ss_plus:VD (match_operand:VD 1 "s_register_operand" "w")
+ (match_operand:VD 2 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vqadd.<V_s_elem>\t%P0, %P1, %P2"
+ [(set_attr "neon_type" "neon_int_4")]
+)
+
+(define_insn "*us_add<mode>_neon"
+ [(set (match_operand:VD 0 "s_register_operand" "=w")
+ (us_plus:VD (match_operand:VD 1 "s_register_operand" "w")
+ (match_operand:VD 2 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vqadd.<V_u_elem>\t%P0, %P1, %P2"
+ [(set_attr "neon_type" "neon_int_4")]
+)
+
+(define_insn "*ss_sub<mode>_neon"
+ [(set (match_operand:VD 0 "s_register_operand" "=w")
+ (ss_minus:VD (match_operand:VD 1 "s_register_operand" "w")
+ (match_operand:VD 2 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vqsub.<V_s_elem>\t%P0, %P1, %P2"
+ [(set_attr "neon_type" "neon_int_5")]
+)
+
+(define_insn "*us_sub<mode>_neon"
+ [(set (match_operand:VD 0 "s_register_operand" "=w")
+ (us_minus:VD (match_operand:VD 1 "s_register_operand" "w")
+ (match_operand:VD 2 "s_register_operand" "w")))]
+ "TARGET_NEON"
+ "vqsub.<V_u_elem>\t%P0, %P1, %P2"
+ [(set_attr "neon_type" "neon_int_5")]
+)
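+;; For example (illustrative arm_neon.h intrinsics), a saturating signed byte
+;; add clamps instead of wrapping:
+;;
+;;   int8x8_t saturate_demo (void)
+;;   {
+;;     /* 100 + 100 saturates to 127 in every lane rather than wrapping
+;;        to -56.  */
+;;     return vqadd_s8 (vdup_n_s8 (100), vdup_n_s8 (100));
+;;   }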
+
+; FIXME: These instructions aren't supported in GCC 4.1, but are documented
+; for the current trunk. Uncomment when this code is merged to a GCC version
+; which supports them.
+
+;(define_insn "*ss_neg<mode>_neon"
+; [(set (match_operand:VD 0 "s_register_operand" "=w")
+; (ss_neg:VD 1 (match_operand:VD 1 "s_register_operand" "w")))]
+; "TARGET_NEON"
+; "vqneg.<V_s_elem>\t%P0, %P1")
+
+;(define_insn "*ss_ashift<mode>_neon"
+; [(set (match_operand:VD 0 "s_register_operand" "=w")
+; (ss_ashift:VD (match_operand:VD 1 "s_register_operand" "w")
+; (match_operand:VD 2 "s_register_operand" "w")))]
+; "TARGET_NEON"
+; "vqshl.<V_s_elem>\t%P0, %P1, %P2")
+
+;; Patterns for builtins.
+
+; good for plain vadd, vaddq.
+
+(define_insn "neon_vadd<mode>"
+ [(set (match_operand:VDQX 0 "s_register_operand" "=w")
+ (unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w")
+ (match_operand:VDQX 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VADD))]
+ "TARGET_NEON"
+ "vadd.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_int_1")))]
+)
+
+; operand 3 represents in bits:
+; bit 0: signed (vs unsigned).
+; bit 1: rounding (vs none).
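+; So, for example, an operand 3 value of 3 (both bits set) requests the
+; signed, rounding form of an instruction, while 0 requests the unsigned,
+; non-rounding form.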
+
+(define_insn "neon_vaddl<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:VDI 1 "s_register_operand" "w")
+ (match_operand:VDI 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VADDL))]
+ "TARGET_NEON"
+ "vaddl.%T3%#<V_sz_elem>\t%q0, %P1, %P2"
+ [(set_attr "neon_type" "neon_int_3")]
+)
+
+(define_insn "neon_vaddw<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "w")
+ (match_operand:VDI 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VADDW))]
+ "TARGET_NEON"
+ "vaddw.%T3%#<V_sz_elem>\t%q0, %q1, %P2"
+ [(set_attr "neon_type" "neon_int_2")]
+)
+
+; vhadd and vrhadd.
+
+(define_insn "neon_vhadd<mode>"
+ [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+ (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
+ (match_operand:VDQIW 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VHADD))]
+ "TARGET_NEON"
+ "v%O3hadd.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set_attr "neon_type" "neon_int_4")]
+)
+
+(define_insn "neon_vqadd<mode>"
+ [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+ (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+ (match_operand:VDQIX 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VQADD))]
+ "TARGET_NEON"
+ "vqadd.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set_attr "neon_type" "neon_int_4")]
+)
+
+(define_insn "neon_vaddhn<mode>"
+ [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+ (unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+ (match_operand:VN 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VADDHN))]
+ "TARGET_NEON"
+ "v%O3addhn.<V_if_elem>\t%P0, %q1, %q2"
+ [(set_attr "neon_type" "neon_int_4")]
+)
+
+(define_insn "neon_vmul<mode>"
+ [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+ (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w")
+ (match_operand:VDQW 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VMUL))]
+ "TARGET_NEON"
+ "vmul.%F3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (if_then_else
+ (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long")
+ (const_string "neon_mul_qqq_8_16_32_ddd_32"))
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mul_qqq_8_16_32_ddd_32")
+ (const_string "neon_mul_qqq_8_16_32_ddd_32")))))]
+)
+
+(define_insn "neon_vmla<mode>"
+ [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+ (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")
+ (match_operand:VDQW 2 "s_register_operand" "w")
+ (match_operand:VDQW 3 "s_register_operand" "w")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_VMLA))]
+ "TARGET_NEON"
+ "vmla.<V_if_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vmla_ddd")
+ (const_string "neon_fp_vmla_qqq"))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (if_then_else
+ (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long")
+ (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long"))
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_qqq_8_16")
+ (const_string "neon_mla_qqq_32_qqd_32_scalar")))))]
+)
+
+(define_insn "neon_vmlal<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+ (match_operand:VW 2 "s_register_operand" "w")
+ (match_operand:VW 3 "s_register_operand" "w")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_VMLAL))]
+ "TARGET_NEON"
+ "vmlal.%T4%#<V_sz_elem>\t%q0, %P2, %P3"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long")
+ (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+(define_insn "neon_vmls<mode>"
+ [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+ (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")
+ (match_operand:VDQW 2 "s_register_operand" "w")
+ (match_operand:VDQW 3 "s_register_operand" "w")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_VMLS))]
+ "TARGET_NEON"
+ "vmls.<V_if_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vmla_ddd")
+ (const_string "neon_fp_vmla_qqq"))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (if_then_else
+ (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long")
+ (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long"))
+ (if_then_else
+ (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_qqq_8_16")
+ (const_string "neon_mla_qqq_32_qqd_32_scalar")))))]
+)
+
+(define_insn "neon_vmlsl<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+ (match_operand:VW 2 "s_register_operand" "w")
+ (match_operand:VW 3 "s_register_operand" "w")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_VMLSL))]
+ "TARGET_NEON"
+ "vmlsl.%T4%#<V_sz_elem>\t%q0, %P2, %P3"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long")
+ (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+(define_insn "neon_vqdmulh<mode>"
+ [(set (match_operand:VMDQI 0 "s_register_operand" "=w")
+ (unspec:VMDQI [(match_operand:VMDQI 1 "s_register_operand" "w")
+ (match_operand:VMDQI 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VQDMULH))]
+ "TARGET_NEON"
+ "vq%O3dmulh.<V_s_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long")
+ (const_string "neon_mul_qqq_8_16_32_ddd_32"))
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mul_qqq_8_16_32_ddd_32")
+ (const_string "neon_mul_qqq_8_16_32_ddd_32"))))]
+)
+
+(define_insn "neon_vqdmlal<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+ (match_operand:VMDI 2 "s_register_operand" "w")
+ (match_operand:VMDI 3 "s_register_operand" "w")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_VQDMLAL))]
+ "TARGET_NEON"
+ "vqdmlal.<V_s_elem>\t%q0, %P2, %P3"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long")
+ (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+(define_insn "neon_vqdmlsl<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+ (match_operand:VMDI 2 "s_register_operand" "w")
+ (match_operand:VMDI 3 "s_register_operand" "w")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_VQDMLSL))]
+ "TARGET_NEON"
+ "vqdmlsl.<V_s_elem>\t%q0, %P2, %P3"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_ddd_8_16_qdd_16_8_long_32_16_long")
+ (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+(define_insn "neon_vmull<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:VW 1 "s_register_operand" "w")
+ (match_operand:VW 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VMULL))]
+ "TARGET_NEON"
+ "vmull.%T3%#<V_sz_elem>\t%q0, %P1, %P2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long")
+ (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))]
+)
+
+(define_insn "neon_vqdmull<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:VMDI 1 "s_register_operand" "w")
+ (match_operand:VMDI 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VQDMULL))]
+ "TARGET_NEON"
+ "vqdmull.<V_s_elem>\t%q0, %P1, %P2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mul_ddd_8_16_qdd_16_8_long_32_16_long")
+ (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))]
+)
+
+(define_insn "neon_vsub<mode>"
+ [(set (match_operand:VDQX 0 "s_register_operand" "=w")
+ (unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w")
+ (match_operand:VDQX 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VSUB))]
+ "TARGET_NEON"
+ "vsub.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_int_2")))]
+)
+
+(define_insn "neon_vsubl<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:VDI 1 "s_register_operand" "w")
+ (match_operand:VDI 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VSUBL))]
+ "TARGET_NEON"
+ "vsubl.%T3%#<V_sz_elem>\t%q0, %P1, %P2"
+ [(set_attr "neon_type" "neon_int_2")]
+)
+
+(define_insn "neon_vsubw<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "w")
+ (match_operand:VDI 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VSUBW))]
+ "TARGET_NEON"
+ "vsubw.%T3%#<V_sz_elem>\t%q0, %q1, %P2"
+ [(set_attr "neon_type" "neon_int_2")]
+)
+
+(define_insn "neon_vqsub<mode>"
+ [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+ (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+ (match_operand:VDQIX 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VQSUB))]
+ "TARGET_NEON"
+ "vqsub.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set_attr "neon_type" "neon_int_5")]
+)
+
+(define_insn "neon_vhsub<mode>"
+ [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+ (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
+ (match_operand:VDQIW 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VHSUB))]
+ "TARGET_NEON"
+ "vhsub.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set_attr "neon_type" "neon_int_5")]
+)
+
+(define_insn "neon_vsubhn<mode>"
+ [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+ (unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+ (match_operand:VN 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VSUBHN))]
+ "TARGET_NEON"
+ "v%O3subhn.<V_if_elem>\t%P0, %q1, %q2"
+ [(set_attr "neon_type" "neon_int_4")]
+)
+
+(define_insn "neon_vceq<mode>"
+ [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
+ (unspec:<V_cmp_result> [(match_operand:VDQW 1 "s_register_operand" "w")
+ (match_operand:VDQW 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VCEQ))]
+ "TARGET_NEON"
+ "vceq.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vcge<mode>"
+ [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
+ (unspec:<V_cmp_result> [(match_operand:VDQW 1 "s_register_operand" "w")
+ (match_operand:VDQW 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VCGE))]
+ "TARGET_NEON"
+ "vcge.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vcgt<mode>"
+ [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
+ (unspec:<V_cmp_result> [(match_operand:VDQW 1 "s_register_operand" "w")
+ (match_operand:VDQW 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VCGT))]
+ "TARGET_NEON"
+ "vcgt.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vcage<mode>"
+ [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
+ (unspec:<V_cmp_result> [(match_operand:VCVTF 1 "s_register_operand" "w")
+ (match_operand:VCVTF 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VCAGE))]
+ "TARGET_NEON"
+ "vacge.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_insn "neon_vcagt<mode>"
+ [(set (match_operand:<V_cmp_result> 0 "s_register_operand" "=w")
+ (unspec:<V_cmp_result> [(match_operand:VCVTF 1 "s_register_operand" "w")
+ (match_operand:VCVTF 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VCAGT))]
+ "TARGET_NEON"
+ "vacgt.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_insn "neon_vtst<mode>"
+ [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+ (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
+ (match_operand:VDQIW 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VTST))]
+ "TARGET_NEON"
+ "vtst.<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set_attr "neon_type" "neon_int_4")]
+)
+
+(define_insn "neon_vabd<mode>"
+ [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+ (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w")
+ (match_operand:VDQW 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VABD))]
+ "TARGET_NEON"
+ "vabd.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vabdl<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:VW 1 "s_register_operand" "w")
+ (match_operand:VW 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VABDL))]
+ "TARGET_NEON"
+ "vabdl.%T3%#<V_sz_elem>\t%q0, %P1, %P2"
+ [(set_attr "neon_type" "neon_int_5")]
+)
+
+(define_insn "neon_vaba<mode>"
+ [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+ (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "0")
+ (match_operand:VDQIW 2 "s_register_operand" "w")
+ (match_operand:VDQIW 3 "s_register_operand" "w")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_VABA))]
+ "TARGET_NEON"
+ "vaba.%T4%#<V_sz_elem>\t%<V_reg>0, %<V_reg>2, %<V_reg>3"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_vaba") (const_string "neon_vaba_qqq")))]
+)
+
+(define_insn "neon_vabal<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+ (match_operand:VW 2 "s_register_operand" "w")
+ (match_operand:VW 3 "s_register_operand" "w")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_VABAL))]
+ "TARGET_NEON"
+ "vabal.%T4%#<V_sz_elem>\t%q0, %P2, %P3"
+ [(set_attr "neon_type" "neon_vaba")]
+)
+
+(define_insn "neon_vmax<mode>"
+ [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+ (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w")
+ (match_operand:VDQW 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VMAX))]
+ "TARGET_NEON"
+ "vmax.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vmin<mode>"
+ [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+ (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w")
+ (match_operand:VDQW 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VMIN))]
+ "TARGET_NEON"
+ "vmin.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_int_5")))]
+)
+
+(define_expand "neon_vpadd<mode>"
+ [(match_operand:VD 0 "s_register_operand" "=w")
+ (match_operand:VD 1 "s_register_operand" "w")
+ (match_operand:VD 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ "TARGET_NEON"
+{
+ emit_insn (gen_neon_vpadd_internal<mode> (operands[0], operands[1],
+ operands[2]));
+ DONE;
+})
+
+(define_insn "neon_vpaddl<mode>"
+ [(set (match_operand:<V_double_width> 0 "s_register_operand" "=w")
+ (unspec:<V_double_width> [(match_operand:VDQIW 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VPADDL))]
+ "TARGET_NEON"
+ "vpaddl.%T2%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1"
+ ;; Assume this schedules like vaddl.
+ [(set_attr "neon_type" "neon_int_3")]
+)
+
+(define_insn "neon_vpadal<mode>"
+ [(set (match_operand:<V_double_width> 0 "s_register_operand" "=w")
+ (unspec:<V_double_width> [(match_operand:<V_double_width> 1 "s_register_operand" "0")
+ (match_operand:VDQIW 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VPADAL))]
+ "TARGET_NEON"
+ "vpadal.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
+ ;; Assume this schedules like vpadd.
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "neon_vpmax<mode>"
+ [(set (match_operand:VD 0 "s_register_operand" "=w")
+ (unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
+ (match_operand:VD 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VPMAX))]
+ "TARGET_NEON"
+ "vpmax.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ ;; Assume this schedules like vmax.
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vpmin<mode>"
+ [(set (match_operand:VD 0 "s_register_operand" "=w")
+ (unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
+ (match_operand:VD 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VPMIN))]
+ "TARGET_NEON"
+ "vpmin.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ ;; Assume this schedules like vmin.
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_int_5")))]
+)
+
+(define_insn "neon_vrecps<mode>"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+ (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
+ (match_operand:VCVTF 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VRECPS))]
+ "TARGET_NEON"
+ "vrecps.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vrecps_vrsqrts_ddd")
+ (const_string "neon_fp_vrecps_vrsqrts_qqq")))]
+)
+
+(define_insn "neon_vrsqrts<mode>"
+ [(set (match_operand:VCVTF 0 "s_register_operand" "=w")
+ (unspec:VCVTF [(match_operand:VCVTF 1 "s_register_operand" "w")
+ (match_operand:VCVTF 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VRSQRTS))]
+ "TARGET_NEON"
+ "vrsqrts.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vrecps_vrsqrts_ddd")
+ (const_string "neon_fp_vrecps_vrsqrts_qqq")))]
+)
+
+(define_insn "neon_vabs<mode>"
+ [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+ (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VABS))]
+ "TARGET_NEON"
+ "vabs.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
+ [(set (attr "neon_type")
+ (if_then_else (ior (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (ne (symbol_ref "<Is_float_mode>") (const_int 0)))
+ (if_then_else
+ (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq"))
+ (const_string "neon_vqneg_vqabs")))]
+)
+
+(define_insn "neon_vqabs<mode>"
+ [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+ (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VQABS))]
+ "TARGET_NEON"
+ "vqabs.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
+ [(set_attr "neon_type" "neon_vqneg_vqabs")]
+)
+
+(define_expand "neon_vneg<mode>"
+ [(match_operand:VDQW 0 "s_register_operand" "")
+ (match_operand:VDQW 1 "s_register_operand" "")
+ (match_operand:SI 2 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ emit_insn (gen_neg<mode>2 (operands[0], operands[1]));
+ DONE;
+})
+
+(define_insn "neon_vqneg<mode>"
+ [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+ (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VQNEG))]
+ "TARGET_NEON"
+ "vqneg.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
+ [(set_attr "neon_type" "neon_vqneg_vqabs")]
+)
+
+(define_insn "neon_vcls<mode>"
+ [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+ (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VCLS))]
+ "TARGET_NEON"
+ "vcls.<V_s_elem>\t%<V_reg>0, %<V_reg>1"
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "neon_vclz<mode>"
+ [(set (match_operand:VDQIW 0 "s_register_operand" "=w")
+ (unspec:VDQIW [(match_operand:VDQIW 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VCLZ))]
+ "TARGET_NEON"
+ "vclz.<V_if_elem>\t%<V_reg>0, %<V_reg>1"
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "neon_vcnt<mode>"
+ [(set (match_operand:VE 0 "s_register_operand" "=w")
+ (unspec:VE [(match_operand:VE 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VCNT))]
+ "TARGET_NEON"
+ "vcnt.<V_sz_elem>\t%<V_reg>0, %<V_reg>1"
+ [(set_attr "neon_type" "neon_int_1")]
+)
+
+(define_insn "neon_vrecpe<mode>"
+ [(set (match_operand:V32 0 "s_register_operand" "=w")
+ (unspec:V32 [(match_operand:V32 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VRECPE))]
+ "TARGET_NEON"
+ "vrecpe.<V_u_elem>\t%<V_reg>0, %<V_reg>1"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_insn "neon_vrsqrte<mode>"
+ [(set (match_operand:V32 0 "s_register_operand" "=w")
+ (unspec:V32 [(match_operand:V32 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VRSQRTE))]
+ "TARGET_NEON"
+ "vrsqrte.<V_u_elem>\t%<V_reg>0, %<V_reg>1"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_expand "neon_vmvn<mode>"
+ [(match_operand:VDQIW 0 "s_register_operand" "")
+ (match_operand:VDQIW 1 "s_register_operand" "")
+ (match_operand:SI 2 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ emit_insn (gen_one_cmpl<mode>2 (operands[0], operands[1]));
+ DONE;
+})
+
+;; FIXME: 32-bit element sizes are a bit funky (should be output as .32 not
+;; .u32), but the assembler should cope with that.
+
+(define_insn "neon_vget_lane<mode>"
+ [(set (match_operand:<V_elem> 0 "s_register_operand" "=r")
+ (unspec:<V_elem> [(match_operand:VD 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VGET_LANE))]
+ "TARGET_NEON"
+ "vmov%?.%t3%#<V_sz_elem>\t%0, %P1[%c2]"
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_bp_simple")]
+)
+
+; Operand 2 (lane number) is ignored because we can only extract the zeroth lane
+; with this insn. Operand 3 (info word) is ignored because it does nothing
+; useful with 64-bit elements.
+
+(define_insn "neon_vget_lanedi"
+ [(set (match_operand:DI 0 "s_register_operand" "=r")
+ (unspec:DI [(match_operand:DI 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VGET_LANE))]
+ "TARGET_NEON"
+ "vmov%?\t%Q0, %R0, %P1 @ di"
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vget_lane<mode>"
+ [(set (match_operand:<V_elem> 0 "s_register_operand" "=r")
+ (unspec:<V_elem> [(match_operand:VQ 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VGET_LANE))]
+ "TARGET_NEON"
+{
+ rtx ops[4];
+ int regno = REGNO (operands[1]);
+ unsigned int halfelts = GET_MODE_NUNITS (<MODE>mode) / 2;
+ unsigned int elt = INTVAL (operands[2]);
+
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (<V_HALF>mode, regno + 2 * (elt / halfelts));
+ ops[2] = GEN_INT (elt % halfelts);
+ ops[3] = operands[3];
+ output_asm_insn ("vmov%?.%t3%#<V_sz_elem>\t%0, %P1[%c2]", ops);
+
+ return "";
+}
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_bp_simple")]
+)
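
The output code in the pattern above splits a lane access on a Q register into an access on one of its two D-register halves. As a hedged illustration of that index arithmetic only (the variable names, the V8HI example and the printout are mine, not part of the patch), the following standalone C sketch reproduces the same computation:

    #include <stdio.h>

    /* Sketch of the lane-splitting arithmetic in the output template
       above, for a Q register holding 8 x 16-bit lanes.  */
    int main (void)
    {
      unsigned int nunits = 8;             /* GET_MODE_NUNITS for V8HImode  */
      unsigned int halfelts = nunits / 2;  /* lanes per D register          */
      unsigned int elt = 5;                /* requested lane                */
      unsigned int d_off = 2 * (elt / halfelts);  /* 0 = low D, 2 = high D  */
      unsigned int lane = elt % halfelts;         /* lane within that D reg */
      printf ("D register offset %u, lane %u\n", d_off, lane);  /* -> 2, 1 */
      return 0;
    }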
+
+(define_insn "neon_vget_lanev2di"
+ [(set (match_operand:DI 0 "s_register_operand" "=r")
+ (unspec:DI [(match_operand:V2DI 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VGET_LANE))]
+ "TARGET_NEON"
+{
+ rtx ops[2];
+ unsigned int regno = REGNO (operands[1]);
+ unsigned int elt = INTVAL (operands[2]);
+
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (DImode, regno + 2 * elt);
+ output_asm_insn ("vmov%?\t%Q0, %R0, %P1 @ v2di", ops);
+
+ return "";
+}
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vset_lane<mode>"
+ [(set (match_operand:VD 0 "s_register_operand" "=w")
+ (unspec:VD [(match_operand:<V_elem> 1 "s_register_operand" "r")
+ (match_operand:VD 2 "s_register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VSET_LANE))]
+ "TARGET_NEON"
+ "vmov%?.<V_sz_elem>\t%P0[%c3], %1"
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_bp_simple")]
+)
+
+; See neon_vget_lanedi comment for reasons operands 2 & 3 are ignored.
+
+(define_insn "neon_vset_lanedi"
+ [(set (match_operand:DI 0 "s_register_operand" "=w")
+ (unspec:DI [(match_operand:DI 1 "s_register_operand" "r")
+ (match_operand:DI 2 "s_register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VSET_LANE))]
+ "TARGET_NEON"
+ "vmov%?\t%P0, %Q1, %R1 @ di"
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vset_lane<mode>"
+ [(set (match_operand:VQ 0 "s_register_operand" "=w")
+ (unspec:VQ [(match_operand:<V_elem> 1 "s_register_operand" "r")
+ (match_operand:VQ 2 "s_register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VSET_LANE))]
+ "TARGET_NEON"
+{
+ rtx ops[4];
+ unsigned int regno = REGNO (operands[0]);
+ unsigned int halfelts = GET_MODE_NUNITS (<MODE>mode) / 2;
+ unsigned int elt = INTVAL (operands[3]);
+
+ ops[0] = gen_rtx_REG (<V_HALF>mode, regno + 2 * (elt / halfelts));
+ ops[1] = operands[1];
+ ops[2] = GEN_INT (elt % halfelts);
+ output_asm_insn ("vmov%?.<V_sz_elem>\t%P0[%c2], %1", ops);
+
+ return "";
+}
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vset_lanev2di"
+ [(set (match_operand:V2DI 0 "s_register_operand" "=w")
+ (unspec:V2DI [(match_operand:DI 1 "s_register_operand" "r")
+ (match_operand:V2DI 2 "s_register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VSET_LANE))]
+ "TARGET_NEON"
+{
+ rtx ops[2];
+ unsigned int regno = REGNO (operands[0]);
+ unsigned int elt = INTVAL (operands[3]);
+
+ ops[0] = gen_rtx_REG (DImode, regno + 2 * elt);
+ ops[1] = operands[1];
+ output_asm_insn ("vmov%?\t%P0, %Q1, %R1 @ v2di", ops);
+
+ return "";
+}
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_expand "neon_vcreate<mode>"
+ [(match_operand:VDX 0 "s_register_operand" "")
+ (match_operand:DI 1 "general_operand" "")]
+ "TARGET_NEON"
+{
+ rtx src = gen_lowpart (<MODE>mode, operands[1]);
+ emit_move_insn (operands[0], src);
+ DONE;
+})
+
+(define_insn "neon_vdup_n<mode>"
+ [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+ (unspec:VDQW [(match_operand:<V_elem> 1 "s_register_operand" "r")]
+ UNSPEC_VDUP_N))]
+ "TARGET_NEON"
+ "vdup%?.<V_sz_elem>\t%<V_reg>0, %1"
+ ;; Assume this schedules like vmov.
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vdup_ndi"
+ [(set (match_operand:DI 0 "s_register_operand" "=w")
+ (unspec:DI [(match_operand:DI 1 "s_register_operand" "r")]
+ UNSPEC_VDUP_N))]
+ "TARGET_NEON"
+ "vmov%?\t%P0, %Q1, %R1"
+ [(set_attr "predicable" "yes")
+ (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vdup_nv2di"
+ [(set (match_operand:V2DI 0 "s_register_operand" "=w")
+ (unspec:V2DI [(match_operand:DI 1 "s_register_operand" "r")]
+ UNSPEC_VDUP_N))]
+ "TARGET_NEON"
+ "vmov%?\t%e0, %Q1, %R1\;vmov%?\t%f0, %Q1, %R1"
+ [(set_attr "predicable" "yes")
+ (set_attr "length" "8")
+ (set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vdup_lane<mode>"
+ [(set (match_operand:VD 0 "s_register_operand" "=w")
+ (unspec:VD [(match_operand:VD 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VDUP_LANE))]
+ "TARGET_NEON"
+ "vdup.<V_sz_elem>\t%P0, %P1[%c2]"
+ ;; Assume this schedules like vmov.
+ [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vdup_lane<mode>"
+ [(set (match_operand:VQ 0 "s_register_operand" "=w")
+ (unspec:VQ [(match_operand:<V_HALF> 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VDUP_LANE))]
+ "TARGET_NEON"
+ "vdup.<V_sz_elem>\t%q0, %P1[%c2]"
+ ;; Assume this schedules like vmov.
+ [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+; Scalar index is ignored, since only zero is valid here.
+(define_expand "neon_vdup_lanedi"
+ [(set (match_operand:DI 0 "s_register_operand" "=w")
+ (unspec:DI [(match_operand:DI 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VDUP_LANE))]
+ "TARGET_NEON"
+{
+ emit_move_insn (operands[0], operands[1]);
+ DONE;
+})
+
+; Likewise.
+(define_insn "neon_vdup_lanev2di"
+ [(set (match_operand:V2DI 0 "s_register_operand" "=w")
+ (unspec:V2DI [(match_operand:DI 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VDUP_LANE))]
+ "TARGET_NEON"
+ "vmov\t%e0, %P1\;vmov\t%f0, %P1"
+ [(set_attr "length" "8")
+ (set_attr "neon_type" "neon_bp_simple")]
+)
+
+;; In this insn, operand 1 should be the low part, and operand 2 the high
+;; part, of the dest vector.
+;; FIXME: A different implementation of this builtin could make it much
+;; more likely that we wouldn't actually need to output anything (we could make
+;; it so that the reg allocator puts things in the right places magically
+;; instead). Lack of subregs for vectors makes that tricky though, I think.
+
+(define_insn "neon_vcombine<mode>"
+ [(set (match_operand:<V_DOUBLE> 0 "s_register_operand" "=w")
+ (unspec:<V_DOUBLE> [(match_operand:VDX 1 "s_register_operand" "w")
+ (match_operand:VDX 2 "s_register_operand" "w")]
+ UNSPEC_VCOMBINE))]
+ "TARGET_NEON"
+{
+ int dest = REGNO (operands[0]);
+ int src1 = REGNO (operands[1]);
+ int src2 = REGNO (operands[2]);
+ rtx destlo;
+
+ if (src1 == dest && src2 == dest + 2)
+ return "";
+ else if (src2 == dest && src1 == dest + 2)
+ /* Special case of reversed high/low parts. */
+ return "vswp\t%P1, %P2";
+
+ destlo = gen_rtx_REG (<MODE>mode, dest);
+
+ if (!reg_overlap_mentioned_p (operands[2], destlo))
+ {
+ /* Try to avoid unnecessary moves if part of the result is in the right
+ place already. */
+ if (src1 != dest)
+ output_asm_insn ("vmov\t%e0, %P1", operands);
+ if (src2 != dest + 2)
+ output_asm_insn ("vmov\t%f0, %P2", operands);
+ }
+ else
+ {
+ if (src2 != dest + 2)
+ output_asm_insn ("vmov\t%f0, %P2", operands);
+ if (src1 != dest)
+ output_asm_insn ("vmov\t%e0, %P1", operands);
+ }
+
+ return "";
+}
+ ;; We set the neon_type attribute based on the vmov instructions above.
+ [(set_attr "length" "8")
+ (set_attr "neon_type" "neon_bp_simple")]
+)
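
At the intrinsics level this pattern implements vcombine: as the comment above says, the first operand lands in the low D half and the second in the high D half of the Q-register result, and the vmov moves are only emitted when the register allocator has not already placed the sources in the destination's D halves. A minimal, hedged C usage sketch (the function name combine_example is illustrative, not part of the patch):

    #include <arm_neon.h>

    /* lo becomes the low half and hi the high half of the result.  */
    int16x8_t combine_example (int16x4_t lo, int16x4_t hi)
    {
      int16x8_t r = vcombine_s16 (lo, hi);
      /* vget_low_s16 (r) equals lo, and vget_high_s16 (r) equals hi.  */
      return r;
    }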
+
+(define_insn "neon_vget_high<mode>"
+ [(set (match_operand:<V_HALF> 0 "s_register_operand" "=w")
+ (unspec:<V_HALF> [(match_operand:VQX 1 "s_register_operand" "w")]
+ UNSPEC_VGET_HIGH))]
+ "TARGET_NEON"
+{
+ int dest = REGNO (operands[0]);
+ int src = REGNO (operands[1]);
+
+ if (dest != src + 2)
+ return "vmov\t%P0, %f1";
+ else
+ return "";
+}
+ [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vget_low<mode>"
+ [(set (match_operand:<V_HALF> 0 "s_register_operand" "=w")
+ (unspec:<V_HALF> [(match_operand:VQX 1 "s_register_operand" "w")]
+ UNSPEC_VGET_LOW))]
+ "TARGET_NEON"
+{
+ int dest = REGNO (operands[0]);
+ int src = REGNO (operands[1]);
+
+ if (dest != src)
+ return "vmov\t%P0, %e1";
+ else
+ return "";
+}
+ [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vcvt<mode>"
+ [(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
+ (unspec:<V_CVTTO> [(match_operand:VCVTF 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VCVT))]
+ "TARGET_NEON"
+ "vcvt.%T2%#32.f32\t%<V_reg>0, %<V_reg>1"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_insn "neon_vcvt<mode>"
+ [(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
+ (unspec:<V_CVTTO> [(match_operand:VCVTI 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VCVT))]
+ "TARGET_NEON"
+ "vcvt.f32.%T2%#32\t%<V_reg>0, %<V_reg>1"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_insn "neon_vcvt_n<mode>"
+ [(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
+ (unspec:<V_CVTTO> [(match_operand:VCVTF 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VCVT_N))]
+ "TARGET_NEON"
+ "vcvt.%T3%#32.f32\t%<V_reg>0, %<V_reg>1, %2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_insn "neon_vcvt_n<mode>"
+ [(set (match_operand:<V_CVTTO> 0 "s_register_operand" "=w")
+ (unspec:<V_CVTTO> [(match_operand:VCVTI 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VCVT_N))]
+ "TARGET_NEON"
+ "vcvt.f32.%T3%#32\t%<V_reg>0, %<V_reg>1, %2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_fp_vadd_ddd_vabs_dd")
+ (const_string "neon_fp_vadd_qqq_vabs_qq")))]
+)
+
+(define_insn "neon_vmovn<mode>"
+ [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+ (unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VMOVN))]
+ "TARGET_NEON"
+ "vmovn.<V_if_elem>\t%P0, %q1"
+ [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vqmovn<mode>"
+ [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+ (unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VQMOVN))]
+ "TARGET_NEON"
+ "vqmovn.%T2%#<V_sz_elem>\t%P0, %q1"
+ [(set_attr "neon_type" "neon_shift_2")]
+)
+
+(define_insn "neon_vqmovun<mode>"
+ [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+ (unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VQMOVUN))]
+ "TARGET_NEON"
+ "vqmovun.<V_s_elem>\t%P0, %q1"
+ [(set_attr "neon_type" "neon_shift_2")]
+)
+
+(define_insn "neon_vmovl<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:VW 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VMOVL))]
+ "TARGET_NEON"
+ "vmovl.%T2%#<V_sz_elem>\t%q0, %P1"
+ [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_insn "neon_vmul_lane<mode>"
+ [(set (match_operand:VMD 0 "s_register_operand" "=w")
+ (unspec:VMD [(match_operand:VMD 1 "s_register_operand" "w")
+ (match_operand:VMD 2 "s_register_operand"
+ "<scalar_mul_constraint>")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_VMUL_LANE))]
+ "TARGET_NEON"
+ "vmul.<V_if_elem>\t%P0, %P1, %P2[%c3]"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (const_string "neon_fp_vmul_ddd")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mul_ddd_16_scalar_32_16_long_scalar")
+ (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar"))))]
+)
+
+(define_insn "neon_vmul_lane<mode>"
+ [(set (match_operand:VMQ 0 "s_register_operand" "=w")
+ (unspec:VMQ [(match_operand:VMQ 1 "s_register_operand" "w")
+ (match_operand:<V_HALF> 2 "s_register_operand"
+ "<scalar_mul_constraint>")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_VMUL_LANE))]
+ "TARGET_NEON"
+ "vmul.<V_if_elem>\t%q0, %q1, %P2[%c3]"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (const_string "neon_fp_vmul_qqd")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")
+ (const_string "neon_mul_qqd_32_scalar"))))]
+)
+
+(define_insn "neon_vmull_lane<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:VMDI 1 "s_register_operand" "w")
+ (match_operand:VMDI 2 "s_register_operand"
+ "<scalar_mul_constraint>")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_VMULL_LANE))]
+ "TARGET_NEON"
+ "vmull.%T4%#<V_sz_elem>\t%q0, %P1, %P2[%c3]"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mul_ddd_16_scalar_32_16_long_scalar")
+ (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))]
+)
+
+(define_insn "neon_vqdmull_lane<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:VMDI 1 "s_register_operand" "w")
+ (match_operand:VMDI 2 "s_register_operand"
+ "<scalar_mul_constraint>")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_VQDMULL_LANE))]
+ "TARGET_NEON"
+ "vqdmull.<V_s_elem>\t%q0, %P1, %P2[%c3]"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mul_ddd_16_scalar_32_16_long_scalar")
+ (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))]
+)
+
+(define_insn "neon_vqdmulh_lane<mode>"
+ [(set (match_operand:VMQI 0 "s_register_operand" "=w")
+ (unspec:VMQI [(match_operand:VMQI 1 "s_register_operand" "w")
+ (match_operand:<V_HALF> 2 "s_register_operand"
+ "<scalar_mul_constraint>")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_VQDMULH_LANE))]
+ "TARGET_NEON"
+ "vq%O4dmulh.%T4%#<V_sz_elem>\t%q0, %q1, %P2[%c3]"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")
+ (const_string "neon_mul_qqd_32_scalar")))]
+)
+
+(define_insn "neon_vqdmulh_lane<mode>"
+ [(set (match_operand:VMDI 0 "s_register_operand" "=w")
+ (unspec:VMDI [(match_operand:VMDI 1 "s_register_operand" "w")
+ (match_operand:VMDI 2 "s_register_operand"
+ "<scalar_mul_constraint>")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_VQDMULH_LANE))]
+ "TARGET_NEON"
+ "vq%O4dmulh.%T4%#<V_sz_elem>\t%P0, %P1, %P2[%c3]"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mul_ddd_16_scalar_32_16_long_scalar")
+ (const_string "neon_mul_qdd_64_32_long_qqd_16_ddd_32_scalar_64_32_long_scalar")))]
+)
+
+(define_insn "neon_vmla_lane<mode>"
+ [(set (match_operand:VMD 0 "s_register_operand" "=w")
+ (unspec:VMD [(match_operand:VMD 1 "s_register_operand" "0")
+ (match_operand:VMD 2 "s_register_operand" "w")
+ (match_operand:VMD 3 "s_register_operand"
+ "<scalar_mul_constraint>")
+ (match_operand:SI 4 "immediate_operand" "i")
+ (match_operand:SI 5 "immediate_operand" "i")]
+ UNSPEC_VMLA_LANE))]
+ "TARGET_NEON"
+ "vmla.<V_if_elem>\t%P0, %P2, %P3[%c4]"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (const_string "neon_fp_vmla_ddd_scalar")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar")
+ (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long"))))]
+)
+
+(define_insn "neon_vmla_lane<mode>"
+ [(set (match_operand:VMQ 0 "s_register_operand" "=w")
+ (unspec:VMQ [(match_operand:VMQ 1 "s_register_operand" "0")
+ (match_operand:VMQ 2 "s_register_operand" "w")
+ (match_operand:<V_HALF> 3 "s_register_operand"
+ "<scalar_mul_constraint>")
+ (match_operand:SI 4 "immediate_operand" "i")
+ (match_operand:SI 5 "immediate_operand" "i")]
+ UNSPEC_VMLA_LANE))]
+ "TARGET_NEON"
+ "vmla.<V_if_elem>\t%q0, %q2, %P3[%c4]"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (const_string "neon_fp_vmla_qqq_scalar")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")
+ (const_string "neon_mla_qqq_32_qqd_32_scalar"))))]
+)
+
+(define_insn "neon_vmlal_lane<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+ (match_operand:VMDI 2 "s_register_operand" "w")
+ (match_operand:VMDI 3 "s_register_operand"
+ "<scalar_mul_constraint>")
+ (match_operand:SI 4 "immediate_operand" "i")
+ (match_operand:SI 5 "immediate_operand" "i")]
+ UNSPEC_VMLAL_LANE))]
+ "TARGET_NEON"
+ "vmlal.%T5%#<V_sz_elem>\t%q0, %P2, %P3[%c4]"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar")
+ (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+(define_insn "neon_vqdmlal_lane<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+ (match_operand:VMDI 2 "s_register_operand" "w")
+ (match_operand:VMDI 3 "s_register_operand"
+ "<scalar_mul_constraint>")
+ (match_operand:SI 4 "immediate_operand" "i")
+ (match_operand:SI 5 "immediate_operand" "i")]
+ UNSPEC_VQDMLAL_LANE))]
+ "TARGET_NEON"
+ "vqdmlal.<V_s_elem>\t%q0, %P2, %P3[%c4]"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar")
+ (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+(define_insn "neon_vmls_lane<mode>"
+ [(set (match_operand:VMD 0 "s_register_operand" "=w")
+ (unspec:VMD [(match_operand:VMD 1 "s_register_operand" "0")
+ (match_operand:VMD 2 "s_register_operand" "w")
+ (match_operand:VMD 3 "s_register_operand"
+ "<scalar_mul_constraint>")
+ (match_operand:SI 4 "immediate_operand" "i")
+ (match_operand:SI 5 "immediate_operand" "i")]
+ UNSPEC_VMLS_LANE))]
+ "TARGET_NEON"
+ "vmls.<V_if_elem>\t%P0, %P2, %P3[%c4]"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (const_string "neon_fp_vmla_ddd_scalar")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar")
+ (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long"))))]
+)
+
+(define_insn "neon_vmls_lane<mode>"
+ [(set (match_operand:VMQ 0 "s_register_operand" "=w")
+ (unspec:VMQ [(match_operand:VMQ 1 "s_register_operand" "0")
+ (match_operand:VMQ 2 "s_register_operand" "w")
+ (match_operand:<V_HALF> 3 "s_register_operand"
+ "<scalar_mul_constraint>")
+ (match_operand:SI 4 "immediate_operand" "i")
+ (match_operand:SI 5 "immediate_operand" "i")]
+ UNSPEC_VMLS_LANE))]
+ "TARGET_NEON"
+ "vmls.<V_if_elem>\t%q0, %q2, %P3[%c4]"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_float_mode>") (const_int 0))
+ (const_string "neon_fp_vmla_qqq_scalar")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")
+ (const_string "neon_mla_qqq_32_qqd_32_scalar"))))]
+)
+
+(define_insn "neon_vmlsl_lane<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+ (match_operand:VMDI 2 "s_register_operand" "w")
+ (match_operand:VMDI 3 "s_register_operand"
+ "<scalar_mul_constraint>")
+ (match_operand:SI 4 "immediate_operand" "i")
+ (match_operand:SI 5 "immediate_operand" "i")]
+ UNSPEC_VMLSL_LANE))]
+ "TARGET_NEON"
+ "vmlsl.%T5%#<V_sz_elem>\t%q0, %P2, %P3[%c4]"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar")
+ (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+(define_insn "neon_vqdmlsl_lane<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:<V_widen> 1 "s_register_operand" "0")
+ (match_operand:VMDI 2 "s_register_operand" "w")
+ (match_operand:VMDI 3 "s_register_operand"
+ "<scalar_mul_constraint>")
+ (match_operand:SI 4 "immediate_operand" "i")
+ (match_operand:SI 5 "immediate_operand" "i")]
+ UNSPEC_VQDMLSL_LANE))]
+ "TARGET_NEON"
+ "vqdmlsl.<V_s_elem>\t%q0, %P2, %P3[%c4]"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Scalar_mul_8_16>") (const_int 0))
+ (const_string "neon_mla_ddd_16_scalar_qdd_32_16_long_scalar")
+ (const_string "neon_mla_ddd_32_qqd_16_ddd_32_scalar_qdd_64_32_long_scalar_qdd_64_32_long")))]
+)
+
+; FIXME: For the "_n" multiply/multiply-accumulate insns, we copy a value in a
+; core register into a temp register, then use a scalar taken from that. This
+; isn't an optimal solution if e.g. the scalar has just been read from memory
+; or extracted from another vector. In the latter case it's currently better
+; to use the "_lane" variant, and the former case can probably be implemented
+; using vld1_lane, but that hasn't been done yet.
+
+(define_expand "neon_vmul_n<mode>"
+ [(match_operand:VMD 0 "s_register_operand" "")
+ (match_operand:VMD 1 "s_register_operand" "")
+ (match_operand:<V_elem> 2 "s_register_operand" "")
+ (match_operand:SI 3 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neon_vset_lane<mode> (tmp, operands[2], tmp, const0_rtx));
+ emit_insn (gen_neon_vmul_lane<mode> (operands[0], operands[1], tmp,
+ const0_rtx, const0_rtx));
+ DONE;
+})
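
This expander follows the strategy described in the FIXME comment above: the scalar is inserted into lane 0 of a scratch D register, and the existing "_lane" pattern is then reused with lane index 0. A hedged intrinsic-level sketch of the same rewrite for the V2SF instance (the function name mul_by_scalar and the zero-initialised scratch are illustrative only; the generated RTL leaves the scratch uninitialised):

    #include <arm_neon.h>

    /* Roughly what the V2SF instance of the expander above does.  */
    float32x2_t mul_by_scalar (float32x2_t a, float32_t s)
    {
      float32x2_t tmp = vdup_n_f32 (0.0f);   /* scratch register           */
      tmp = vset_lane_f32 (s, tmp, 0);       /* place s in lane 0          */
      return vmul_lane_f32 (a, tmp, 0);      /* same result as vmul_n_f32  */
    }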
+
+(define_expand "neon_vmul_n<mode>"
+ [(match_operand:VMQ 0 "s_register_operand" "")
+ (match_operand:VMQ 1 "s_register_operand" "")
+ (match_operand:<V_elem> 2 "s_register_operand" "")
+ (match_operand:SI 3 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ rtx tmp = gen_reg_rtx (<V_HALF>mode);
+ emit_insn (gen_neon_vset_lane<V_half> (tmp, operands[2], tmp, const0_rtx));
+ emit_insn (gen_neon_vmul_lane<mode> (operands[0], operands[1], tmp,
+ const0_rtx, const0_rtx));
+ DONE;
+})
+
+(define_expand "neon_vmull_n<mode>"
+ [(match_operand:<V_widen> 0 "s_register_operand" "")
+ (match_operand:VMDI 1 "s_register_operand" "")
+ (match_operand:<V_elem> 2 "s_register_operand" "")
+ (match_operand:SI 3 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neon_vset_lane<mode> (tmp, operands[2], tmp, const0_rtx));
+ emit_insn (gen_neon_vmull_lane<mode> (operands[0], operands[1], tmp,
+ const0_rtx, operands[3]));
+ DONE;
+})
+
+(define_expand "neon_vqdmull_n<mode>"
+ [(match_operand:<V_widen> 0 "s_register_operand" "")
+ (match_operand:VMDI 1 "s_register_operand" "")
+ (match_operand:<V_elem> 2 "s_register_operand" "")
+ (match_operand:SI 3 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neon_vset_lane<mode> (tmp, operands[2], tmp, const0_rtx));
+ emit_insn (gen_neon_vqdmull_lane<mode> (operands[0], operands[1], tmp,
+ const0_rtx, const0_rtx));
+ DONE;
+})
+
+(define_expand "neon_vqdmulh_n<mode>"
+ [(match_operand:VMDI 0 "s_register_operand" "")
+ (match_operand:VMDI 1 "s_register_operand" "")
+ (match_operand:<V_elem> 2 "s_register_operand" "")
+ (match_operand:SI 3 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neon_vset_lane<mode> (tmp, operands[2], tmp, const0_rtx));
+ emit_insn (gen_neon_vqdmulh_lane<mode> (operands[0], operands[1], tmp,
+ const0_rtx, operands[3]));
+ DONE;
+})
+
+(define_expand "neon_vqdmulh_n<mode>"
+ [(match_operand:VMQI 0 "s_register_operand" "")
+ (match_operand:VMQI 1 "s_register_operand" "")
+ (match_operand:<V_elem> 2 "s_register_operand" "")
+ (match_operand:SI 3 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ rtx tmp = gen_reg_rtx (<V_HALF>mode);
+ emit_insn (gen_neon_vset_lane<V_half> (tmp, operands[2], tmp, const0_rtx));
+ emit_insn (gen_neon_vqdmulh_lane<mode> (operands[0], operands[1], tmp,
+ const0_rtx, operands[3]));
+ DONE;
+})
+
+(define_expand "neon_vmla_n<mode>"
+ [(match_operand:VMD 0 "s_register_operand" "")
+ (match_operand:VMD 1 "s_register_operand" "")
+ (match_operand:VMD 2 "s_register_operand" "")
+ (match_operand:<V_elem> 3 "s_register_operand" "")
+ (match_operand:SI 4 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neon_vset_lane<mode> (tmp, operands[3], tmp, const0_rtx));
+ emit_insn (gen_neon_vmla_lane<mode> (operands[0], operands[1], operands[2],
+ tmp, const0_rtx, operands[4]));
+ DONE;
+})
+
+(define_expand "neon_vmla_n<mode>"
+ [(match_operand:VMQ 0 "s_register_operand" "")
+ (match_operand:VMQ 1 "s_register_operand" "")
+ (match_operand:VMQ 2 "s_register_operand" "")
+ (match_operand:<V_elem> 3 "s_register_operand" "")
+ (match_operand:SI 4 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ rtx tmp = gen_reg_rtx (<V_HALF>mode);
+ emit_insn (gen_neon_vset_lane<V_half> (tmp, operands[3], tmp, const0_rtx));
+ emit_insn (gen_neon_vmla_lane<mode> (operands[0], operands[1], operands[2],
+ tmp, const0_rtx, operands[4]));
+ DONE;
+})
+
+(define_expand "neon_vmlal_n<mode>"
+ [(match_operand:<V_widen> 0 "s_register_operand" "")
+ (match_operand:<V_widen> 1 "s_register_operand" "")
+ (match_operand:VMDI 2 "s_register_operand" "")
+ (match_operand:<V_elem> 3 "s_register_operand" "")
+ (match_operand:SI 4 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neon_vset_lane<mode> (tmp, operands[3], tmp, const0_rtx));
+ emit_insn (gen_neon_vmlal_lane<mode> (operands[0], operands[1], operands[2],
+ tmp, const0_rtx, operands[4]));
+ DONE;
+})
+
+(define_expand "neon_vqdmlal_n<mode>"
+ [(match_operand:<V_widen> 0 "s_register_operand" "")
+ (match_operand:<V_widen> 1 "s_register_operand" "")
+ (match_operand:VMDI 2 "s_register_operand" "")
+ (match_operand:<V_elem> 3 "s_register_operand" "")
+ (match_operand:SI 4 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neon_vset_lane<mode> (tmp, operands[3], tmp, const0_rtx));
+ emit_insn (gen_neon_vqdmlal_lane<mode> (operands[0], operands[1], operands[2],
+ tmp, const0_rtx, operands[4]));
+ DONE;
+})
+
+(define_expand "neon_vmls_n<mode>"
+ [(match_operand:VMD 0 "s_register_operand" "")
+ (match_operand:VMD 1 "s_register_operand" "")
+ (match_operand:VMD 2 "s_register_operand" "")
+ (match_operand:<V_elem> 3 "s_register_operand" "")
+ (match_operand:SI 4 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neon_vset_lane<mode> (tmp, operands[3], tmp, const0_rtx));
+ emit_insn (gen_neon_vmls_lane<mode> (operands[0], operands[1], operands[2],
+ tmp, const0_rtx, operands[4]));
+ DONE;
+})
+
+(define_expand "neon_vmls_n<mode>"
+ [(match_operand:VMQ 0 "s_register_operand" "")
+ (match_operand:VMQ 1 "s_register_operand" "")
+ (match_operand:VMQ 2 "s_register_operand" "")
+ (match_operand:<V_elem> 3 "s_register_operand" "")
+ (match_operand:SI 4 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ rtx tmp = gen_reg_rtx (<V_HALF>mode);
+ emit_insn (gen_neon_vset_lane<V_half> (tmp, operands[3], tmp, const0_rtx));
+ emit_insn (gen_neon_vmls_lane<mode> (operands[0], operands[1], operands[2],
+ tmp, const0_rtx, operands[4]));
+ DONE;
+})
+
+(define_expand "neon_vmlsl_n<mode>"
+ [(match_operand:<V_widen> 0 "s_register_operand" "")
+ (match_operand:<V_widen> 1 "s_register_operand" "")
+ (match_operand:VMDI 2 "s_register_operand" "")
+ (match_operand:<V_elem> 3 "s_register_operand" "")
+ (match_operand:SI 4 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neon_vset_lane<mode> (tmp, operands[3], tmp, const0_rtx));
+ emit_insn (gen_neon_vmlsl_lane<mode> (operands[0], operands[1], operands[2],
+ tmp, const0_rtx, operands[4]));
+ DONE;
+})
+
+(define_expand "neon_vqdmlsl_n<mode>"
+ [(match_operand:<V_widen> 0 "s_register_operand" "")
+ (match_operand:<V_widen> 1 "s_register_operand" "")
+ (match_operand:VMDI 2 "s_register_operand" "")
+ (match_operand:<V_elem> 3 "s_register_operand" "")
+ (match_operand:SI 4 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ rtx tmp = gen_reg_rtx (<MODE>mode);
+ emit_insn (gen_neon_vset_lane<mode> (tmp, operands[3], tmp, const0_rtx));
+ emit_insn (gen_neon_vqdmlsl_lane<mode> (operands[0], operands[1], operands[2],
+ tmp, const0_rtx, operands[4]));
+ DONE;
+})
+
+(define_insn "neon_vext<mode>"
+ [(set (match_operand:VDQX 0 "s_register_operand" "=w")
+ (unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w")
+ (match_operand:VDQX 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VEXT))]
+ "TARGET_NEON"
+ "vext.<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2, %3"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_bp_simple")
+ (const_string "neon_bp_2cycle")))]
+)
+
+(define_insn "neon_vrev64<mode>"
+ [(set (match_operand:VDQ 0 "s_register_operand" "=w")
+ (unspec:VDQ [(match_operand:VDQ 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VREV64))]
+ "TARGET_NEON"
+ "vrev64.<V_sz_elem>\t%<V_reg>0, %<V_reg>1"
+ [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vrev32<mode>"
+ [(set (match_operand:VX 0 "s_register_operand" "=w")
+ (unspec:VX [(match_operand:VX 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VREV32))]
+ "TARGET_NEON"
+ "vrev32.<V_sz_elem>\t%<V_reg>0, %<V_reg>1"
+ [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+(define_insn "neon_vrev16<mode>"
+ [(set (match_operand:VE 0 "s_register_operand" "=w")
+ (unspec:VE [(match_operand:VE 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_VREV16))]
+ "TARGET_NEON"
+ "vrev16.<V_sz_elem>\t%<V_reg>0, %<V_reg>1"
+ [(set_attr "neon_type" "neon_bp_simple")]
+)
+
+; vbsl_* intrinsics may compile to any of vbsl/vbif/vbit depending on register
+; allocation. For an intrinsic of the form:
+; rD = vbsl_* (rS, rN, rM)
+; We can use any of:
+; vbsl rS, rN, rM (if D = S)
+; vbit rD, rN, rS (if D = M, so 1-bits in rS choose bits from rN, else rM)
+; vbif rD, rM, rS (if D = N, so 0-bits in rS choose bits from rM, else rN)
+
+(define_insn "neon_vbsl<mode>_internal"
+ [(set (match_operand:VDQX 0 "s_register_operand" "=w,w,w")
+ (unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" " 0,w,w")
+ (match_operand:VDQX 2 "s_register_operand" " w,w,0")
+ (match_operand:VDQX 3 "s_register_operand" " w,0,w")]
+ UNSPEC_VBSL))]
+ "TARGET_NEON"
+ "@
+ vbsl\t%<V_reg>0, %<V_reg>2, %<V_reg>3
+ vbit\t%<V_reg>0, %<V_reg>2, %<V_reg>1
+ vbif\t%<V_reg>0, %<V_reg>3, %<V_reg>1"
+ [(set_attr "neon_type" "neon_int_1")]
+)
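
All three alternatives above compute the same bitwise select and differ only in which of the three inputs shares a register with the destination, which is why the choice is left to register allocation as the preceding comment explains. A hedged scalar C model of the per-bit operation (bsl_model and the 64-bit operand width are illustrative only):

    #include <stdint.h>

    /* Each result bit comes from n where the corresponding bit of sel is 1,
       and from m where it is 0; vbsl, vbit and vbif all compute this.  */
    static inline uint64_t bsl_model (uint64_t sel, uint64_t n, uint64_t m)
    {
      return (sel & n) | (~sel & m);
    }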
+
+(define_expand "neon_vbsl<mode>"
+ [(set (match_operand:VDQX 0 "s_register_operand" "")
+ (unspec:VDQX [(match_operand:<V_cmp_result> 1 "s_register_operand" "")
+ (match_operand:VDQX 2 "s_register_operand" "")
+ (match_operand:VDQX 3 "s_register_operand" "")]
+ UNSPEC_VBSL))]
+ "TARGET_NEON"
+{
+ /* We can't alias operands together if they have different modes. */
+ operands[1] = gen_lowpart (<MODE>mode, operands[1]);
+})
+
+(define_insn "neon_vshl<mode>"
+ [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+ (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+ (match_operand:VDQIX 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VSHL))]
+ "TARGET_NEON"
+ "v%O3shl.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_vshl_ddd")
+ (const_string "neon_shift_3")))]
+)
+
+(define_insn "neon_vqshl<mode>"
+ [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+ (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+ (match_operand:VDQIX 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VQSHL))]
+ "TARGET_NEON"
+ "vq%O3shl.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_shift_2")
+ (const_string "neon_vqshl_vrshl_vqrshl_qqq")))]
+)
+
+(define_insn "neon_vshr_n<mode>"
+ [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+ (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VSHR_N))]
+ "TARGET_NEON"
+ "v%O3shr.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %2"
+ [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_insn "neon_vshrn_n<mode>"
+ [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+ (unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VSHRN_N))]
+ "TARGET_NEON"
+ "v%O3shrn.<V_if_elem>\t%P0, %q1, %2"
+ [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_insn "neon_vqshrn_n<mode>"
+ [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+ (unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VQSHRN_N))]
+ "TARGET_NEON"
+ "vq%O3shrn.%T3%#<V_sz_elem>\t%P0, %q1, %2"
+ [(set_attr "neon_type" "neon_shift_2")]
+)
+
+(define_insn "neon_vqshrun_n<mode>"
+ [(set (match_operand:<V_narrow> 0 "s_register_operand" "=w")
+ (unspec:<V_narrow> [(match_operand:VN 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VQSHRUN_N))]
+ "TARGET_NEON"
+ "vq%O3shrun.%T3%#<V_sz_elem>\t%P0, %q1, %2"
+ [(set_attr "neon_type" "neon_shift_2")]
+)
+
+(define_insn "neon_vshl_n<mode>"
+ [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+ (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VSHL_N))]
+ "TARGET_NEON"
+ "vshl.<V_if_elem>\t%<V_reg>0, %<V_reg>1, %2"
+ [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_insn "neon_vqshl_n<mode>"
+ [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+ (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VQSHL_N))]
+ "TARGET_NEON"
+ "vqshl.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %2"
+ [(set_attr "neon_type" "neon_shift_2")]
+)
+
+(define_insn "neon_vqshlu_n<mode>"
+ [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+ (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VQSHLU_N))]
+ "TARGET_NEON"
+ "vqshlu.%T3%#<V_sz_elem>\t%<V_reg>0, %<V_reg>1, %2"
+ [(set_attr "neon_type" "neon_shift_2")]
+)
+
+(define_insn "neon_vshll_n<mode>"
+ [(set (match_operand:<V_widen> 0 "s_register_operand" "=w")
+ (unspec:<V_widen> [(match_operand:VW 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VSHLL_N))]
+ "TARGET_NEON"
+ "vshll.%T3%#<V_sz_elem>\t%q0, %P1, %2"
+ [(set_attr "neon_type" "neon_shift_1")]
+)
+
+(define_insn "neon_vsra_n<mode>"
+ [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+ (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "0")
+ (match_operand:VDQIX 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (match_operand:SI 4 "immediate_operand" "i")]
+ UNSPEC_VSRA_N))]
+ "TARGET_NEON"
+ "v%O4sra.%T4%#<V_sz_elem>\t%<V_reg>0, %<V_reg>2, %3"
+ [(set_attr "neon_type" "neon_vsra_vrsra")]
+)
+
+(define_insn "neon_vsri_n<mode>"
+ [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+ (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "0")
+ (match_operand:VDQIX 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VSRI))]
+ "TARGET_NEON"
+ "vsri.<V_sz_elem>\t%<V_reg>0, %<V_reg>2, %3"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_shift_1")
+ (const_string "neon_shift_3")))]
+)
+
+(define_insn "neon_vsli_n<mode>"
+ [(set (match_operand:VDQIX 0 "s_register_operand" "=w")
+ (unspec:VDQIX [(match_operand:VDQIX 1 "s_register_operand" "0")
+ (match_operand:VDQIX 2 "s_register_operand" "w")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VSLI))]
+ "TARGET_NEON"
+ "vsli.<V_sz_elem>\t%<V_reg>0, %<V_reg>2, %3"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_shift_1")
+ (const_string "neon_shift_3")))]
+)
+
+(define_insn "neon_vtbl1v8qi"
+ [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+ (unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "w")
+ (match_operand:V8QI 2 "s_register_operand" "w")]
+ UNSPEC_VTBL))]
+ "TARGET_NEON"
+ "vtbl.8\t%P0, {%P1}, %P2"
+ [(set_attr "neon_type" "neon_bp_2cycle")]
+)
+
+(define_insn "neon_vtbl2v8qi"
+ [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+ (unspec:V8QI [(match_operand:TI 1 "s_register_operand" "w")
+ (match_operand:V8QI 2 "s_register_operand" "w")]
+ UNSPEC_VTBL))]
+ "TARGET_NEON"
+{
+ rtx ops[4];
+ int tabbase = REGNO (operands[1]);
+
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (V8QImode, tabbase);
+ ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
+ ops[3] = operands[2];
+ output_asm_insn ("vtbl.8\t%P0, {%P1, %P2}, %P3", ops);
+
+ return "";
+}
+ [(set_attr "neon_type" "neon_bp_2cycle")]
+)
+
+(define_insn "neon_vtbl3v8qi"
+ [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+ (unspec:V8QI [(match_operand:EI 1 "s_register_operand" "w")
+ (match_operand:V8QI 2 "s_register_operand" "w")]
+ UNSPEC_VTBL))]
+ "TARGET_NEON"
+{
+ rtx ops[5];
+ int tabbase = REGNO (operands[1]);
+
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (V8QImode, tabbase);
+ ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
+ ops[3] = gen_rtx_REG (V8QImode, tabbase + 4);
+ ops[4] = operands[2];
+ output_asm_insn ("vtbl.8\t%P0, {%P1, %P2, %P3}, %P4", ops);
+
+ return "";
+}
+ [(set_attr "neon_type" "neon_bp_3cycle")]
+)
+
+(define_insn "neon_vtbl4v8qi"
+ [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+ (unspec:V8QI [(match_operand:OI 1 "s_register_operand" "w")
+ (match_operand:V8QI 2 "s_register_operand" "w")]
+ UNSPEC_VTBL))]
+ "TARGET_NEON"
+{
+ rtx ops[6];
+ int tabbase = REGNO (operands[1]);
+
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (V8QImode, tabbase);
+ ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
+ ops[3] = gen_rtx_REG (V8QImode, tabbase + 4);
+ ops[4] = gen_rtx_REG (V8QImode, tabbase + 6);
+ ops[5] = operands[2];
+ output_asm_insn ("vtbl.8\t%P0, {%P1, %P2, %P3, %P4}, %P5", ops);
+
+ return "";
+}
+ [(set_attr "neon_type" "neon_bp_3cycle")]
+)
+
+(define_insn "neon_vtbx1v8qi"
+ [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+ (unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0")
+ (match_operand:V8QI 2 "s_register_operand" "w")
+ (match_operand:V8QI 3 "s_register_operand" "w")]
+ UNSPEC_VTBX))]
+ "TARGET_NEON"
+ "vtbx.8\t%P0, {%P2}, %P3"
+ [(set_attr "neon_type" "neon_bp_2cycle")]
+)
+
+(define_insn "neon_vtbx2v8qi"
+ [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+ (unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0")
+ (match_operand:TI 2 "s_register_operand" "w")
+ (match_operand:V8QI 3 "s_register_operand" "w")]
+ UNSPEC_VTBX))]
+ "TARGET_NEON"
+{
+ rtx ops[4];
+ int tabbase = REGNO (operands[2]);
+
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (V8QImode, tabbase);
+ ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
+ ops[3] = operands[3];
+ output_asm_insn ("vtbx.8\t%P0, {%P1, %P2}, %P3", ops);
+
+ return "";
+}
+ [(set_attr "neon_type" "neon_bp_2cycle")]
+)
+
+(define_insn "neon_vtbx3v8qi"
+ [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+ (unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0")
+ (match_operand:EI 2 "s_register_operand" "w")
+ (match_operand:V8QI 3 "s_register_operand" "w")]
+ UNSPEC_VTBX))]
+ "TARGET_NEON"
+{
+ rtx ops[5];
+ int tabbase = REGNO (operands[2]);
+
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (V8QImode, tabbase);
+ ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
+ ops[3] = gen_rtx_REG (V8QImode, tabbase + 4);
+ ops[4] = operands[3];
+ output_asm_insn ("vtbx.8\t%P0, {%P1, %P2, %P3}, %P4", ops);
+
+ return "";
+}
+ [(set_attr "neon_type" "neon_bp_3cycle")]
+)
+
+(define_insn "neon_vtbx4v8qi"
+ [(set (match_operand:V8QI 0 "s_register_operand" "=w")
+ (unspec:V8QI [(match_operand:V8QI 1 "s_register_operand" "0")
+ (match_operand:OI 2 "s_register_operand" "w")
+ (match_operand:V8QI 3 "s_register_operand" "w")]
+ UNSPEC_VTBX))]
+ "TARGET_NEON"
+{
+ rtx ops[6];
+ int tabbase = REGNO (operands[2]);
+
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (V8QImode, tabbase);
+ ops[2] = gen_rtx_REG (V8QImode, tabbase + 2);
+ ops[3] = gen_rtx_REG (V8QImode, tabbase + 4);
+ ops[4] = gen_rtx_REG (V8QImode, tabbase + 6);
+ ops[5] = operands[3];
+ output_asm_insn ("vtbx.8\t%P0, {%P1, %P2, %P3, %P4}, %P5", ops);
+
+ return "";
+}
+ [(set_attr "neon_type" "neon_bp_3cycle")]
+)
+
+(define_insn "neon_vtrn<mode>_internal"
+ [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+ (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")]
+ UNSPEC_VTRN1))
+ (set (match_operand:VDQW 2 "s_register_operand" "=w")
+ (unspec:VDQW [(match_operand:VDQW 3 "s_register_operand" "2")]
+ UNSPEC_VTRN2))]
+ "TARGET_NEON"
+ "vtrn.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_bp_simple")
+ (const_string "neon_bp_3cycle")))]
+)
+
+(define_expand "neon_vtrn<mode>"
+ [(match_operand:SI 0 "s_register_operand" "r")
+ (match_operand:VDQW 1 "s_register_operand" "w")
+ (match_operand:VDQW 2 "s_register_operand" "w")]
+ "TARGET_NEON"
+{
+ neon_emit_pair_result_insn (<MODE>mode, gen_neon_vtrn<mode>_internal,
+ operands[0], operands[1], operands[2]);
+ DONE;
+})
+
+(define_insn "neon_vzip<mode>_internal"
+ [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+ (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")]
+ UNSPEC_VZIP1))
+ (set (match_operand:VDQW 2 "s_register_operand" "=w")
+ (unspec:VDQW [(match_operand:VDQW 3 "s_register_operand" "2")]
+ UNSPEC_VZIP2))]
+ "TARGET_NEON"
+ "vzip.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_bp_simple")
+ (const_string "neon_bp_3cycle")))]
+)
+
+(define_expand "neon_vzip<mode>"
+ [(match_operand:SI 0 "s_register_operand" "r")
+ (match_operand:VDQW 1 "s_register_operand" "w")
+ (match_operand:VDQW 2 "s_register_operand" "w")]
+ "TARGET_NEON"
+{
+ neon_emit_pair_result_insn (<MODE>mode, gen_neon_vzip<mode>_internal,
+ operands[0], operands[1], operands[2]);
+ DONE;
+})
+
+(define_insn "neon_vuzp<mode>_internal"
+ [(set (match_operand:VDQW 0 "s_register_operand" "=w")
+ (unspec:VDQW [(match_operand:VDQW 1 "s_register_operand" "0")]
+ UNSPEC_VUZP1))
+ (set (match_operand:VDQW 2 "s_register_operand" "=w")
+ (unspec:VDQW [(match_operand:VDQW 3 "s_register_operand" "2")]
+ UNSPEC_VUZP2))]
+ "TARGET_NEON"
+ "vuzp.<V_sz_elem>\t%<V_reg>0, %<V_reg>2"
+ [(set (attr "neon_type")
+ (if_then_else (ne (symbol_ref "<Is_d_reg>") (const_int 0))
+ (const_string "neon_bp_simple")
+ (const_string "neon_bp_3cycle")))]
+)
+
+(define_expand "neon_vuzp<mode>"
+ [(match_operand:SI 0 "s_register_operand" "r")
+ (match_operand:VDQW 1 "s_register_operand" "w")
+ (match_operand:VDQW 2 "s_register_operand" "w")]
+ "TARGET_NEON"
+{
+ neon_emit_pair_result_insn (<MODE>mode, gen_neon_vuzp<mode>_internal,
+ operands[0], operands[1], operands[2]);
+ DONE;
+})
+
+(define_expand "neon_vreinterpretv8qi<mode>"
+ [(match_operand:V8QI 0 "s_register_operand" "")
+ (match_operand:VDX 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ neon_reinterpret (operands[0], operands[1]);
+ DONE;
+})
+
+(define_expand "neon_vreinterpretv4hi<mode>"
+ [(match_operand:V4HI 0 "s_register_operand" "")
+ (match_operand:VDX 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ neon_reinterpret (operands[0], operands[1]);
+ DONE;
+})
+
+(define_expand "neon_vreinterpretv2si<mode>"
+ [(match_operand:V2SI 0 "s_register_operand" "")
+ (match_operand:VDX 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ neon_reinterpret (operands[0], operands[1]);
+ DONE;
+})
+
+(define_expand "neon_vreinterpretv2sf<mode>"
+ [(match_operand:V2SF 0 "s_register_operand" "")
+ (match_operand:VDX 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ neon_reinterpret (operands[0], operands[1]);
+ DONE;
+})
+
+(define_expand "neon_vreinterpretdi<mode>"
+ [(match_operand:DI 0 "s_register_operand" "")
+ (match_operand:VDX 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ neon_reinterpret (operands[0], operands[1]);
+ DONE;
+})
+
+(define_expand "neon_vreinterpretv16qi<mode>"
+ [(match_operand:V16QI 0 "s_register_operand" "")
+ (match_operand:VQX 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ neon_reinterpret (operands[0], operands[1]);
+ DONE;
+})
+
+(define_expand "neon_vreinterpretv8hi<mode>"
+ [(match_operand:V8HI 0 "s_register_operand" "")
+ (match_operand:VQX 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ neon_reinterpret (operands[0], operands[1]);
+ DONE;
+})
+
+(define_expand "neon_vreinterpretv4si<mode>"
+ [(match_operand:V4SI 0 "s_register_operand" "")
+ (match_operand:VQX 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ neon_reinterpret (operands[0], operands[1]);
+ DONE;
+})
+
+(define_expand "neon_vreinterpretv4sf<mode>"
+ [(match_operand:V4SF 0 "s_register_operand" "")
+ (match_operand:VQX 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ neon_reinterpret (operands[0], operands[1]);
+ DONE;
+})
+
+(define_expand "neon_vreinterpretv2di<mode>"
+ [(match_operand:V2DI 0 "s_register_operand" "")
+ (match_operand:VQX 1 "s_register_operand" "")]
+ "TARGET_NEON"
+{
+ neon_reinterpret (operands[0], operands[1]);
+ DONE;
+})
+
+(define_insn "neon_vld1<mode>"
+ [(set (match_operand:VDQX 0 "s_register_operand" "=w")
+ (unspec:VDQX [(mem:VDQX (match_operand:SI 1 "s_register_operand" "r"))]
+ UNSPEC_VLD1))]
+ "TARGET_NEON"
+ "vld1.<V_sz_elem>\t%h0, [%1]"
+ [(set_attr "neon_type" "neon_vld1_1_2_regs")]
+)
+
+(define_insn "neon_vld1_lane<mode>"
+ [(set (match_operand:VDX 0 "s_register_operand" "=w")
+ (unspec:VDX [(mem:<V_elem> (match_operand:SI 1 "s_register_operand" "r"))
+ (match_operand:VDX 2 "s_register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VLD1_LANE))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[3]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ if (max == 1)
+ return "vld1.<V_sz_elem>\t%P0, [%1]";
+ else
+ return "vld1.<V_sz_elem>\t{%P0[%c3]}, [%1]";
+}
+ [(set (attr "neon_type")
+ (if_then_else (eq (const_string "<V_mode_nunits>") (const_int 2))
+ (const_string "neon_vld1_1_2_regs")
+ (const_string "neon_vld1_vld2_lane")))]
+)
+
+(define_insn "neon_vld1_lane<mode>"
+ [(set (match_operand:VQX 0 "s_register_operand" "=w")
+ (unspec:VQX [(mem:<V_elem> (match_operand:SI 1 "s_register_operand" "r"))
+ (match_operand:VQX 2 "s_register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")]
+ UNSPEC_VLD1_LANE))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[3]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ int regno = REGNO (operands[0]);
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ else if (lane >= max / 2)
+ {
+ lane -= max / 2;
+ regno += 2;
+ operands[3] = GEN_INT (lane);
+ }
+ operands[0] = gen_rtx_REG (<V_HALF>mode, regno);
+ if (max == 2)
+ return "vld1.<V_sz_elem>\t%P0, [%1]";
+ else
+ return "vld1.<V_sz_elem>\t{%P0[%c3]}, [%1]";
+}
+ [(set (attr "neon_type")
+ (if_then_else (eq (const_string "<V_mode_nunits>") (const_int 2))
+ (const_string "neon_vld1_1_2_regs")
+ (const_string "neon_vld1_vld2_lane")))]
+)
+
+(define_insn "neon_vld1_dup<mode>"
+ [(set (match_operand:VDX 0 "s_register_operand" "=w")
+ (unspec:VDX [(mem:<V_elem> (match_operand:SI 1 "s_register_operand" "r"))]
+ UNSPEC_VLD1_DUP))]
+ "TARGET_NEON"
+{
+ if (GET_MODE_NUNITS (<MODE>mode) > 1)
+ return "vld1.<V_sz_elem>\t{%P0[]}, [%1]";
+ else
+ return "vld1.<V_sz_elem>\t%h0, [%1]";
+}
+ [(set (attr "neon_type")
+ (if_then_else (gt (const_string "<V_mode_nunits>") (const_string "1"))
+ (const_string "neon_vld2_2_regs_vld1_vld2_all_lanes")
+ (const_string "neon_vld1_1_2_regs")))]
+)
+
+(define_insn "neon_vld1_dup<mode>"
+ [(set (match_operand:VQX 0 "s_register_operand" "=w")
+ (unspec:VQX [(mem:<V_elem> (match_operand:SI 1 "s_register_operand" "r"))]
+ UNSPEC_VLD1_DUP))]
+ "TARGET_NEON"
+{
+ if (GET_MODE_NUNITS (<MODE>mode) > 2)
+ return "vld1.<V_sz_elem>\t{%e0[], %f0[]}, [%1]";
+ else
+ return "vld1.<V_sz_elem>\t%h0, [%1]";
+}
+ [(set (attr "neon_type")
+ (if_then_else (gt (const_string "<V_mode_nunits>") (const_string "1"))
+ (const_string "neon_vld2_2_regs_vld1_vld2_all_lanes")
+ (const_string "neon_vld1_1_2_regs")))]
+)
+
+(define_insn "neon_vst1<mode>"
+ [(set (mem:VDQX (match_operand:SI 0 "s_register_operand" "r"))
+ (unspec:VDQX [(match_operand:VDQX 1 "s_register_operand" "w")]
+ UNSPEC_VST1))]
+ "TARGET_NEON"
+ "vst1.<V_sz_elem>\t%h1, [%0]"
+ [(set_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs")])
+
+(define_insn "neon_vst1_lane<mode>"
+ [(set (mem:<V_elem> (match_operand:SI 0 "s_register_operand" "r"))
+ (vec_select:<V_elem>
+ (match_operand:VDX 1 "s_register_operand" "w")
+ (parallel [(match_operand:SI 2 "neon_lane_number" "i")])))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[2]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ if (max == 1)
+ return "vst1.<V_sz_elem>\t{%P1}, [%0]";
+ else
+ return "vst1.<V_sz_elem>\t{%P1[%c2]}, [%0]";
+}
+ [(set (attr "neon_type")
+ (if_then_else (eq (const_string "<V_mode_nunits>") (const_int 1))
+ (const_string "neon_vst1_1_2_regs_vst2_2_regs")
+ (const_string "neon_vst1_vst2_lane")))])
+
+(define_insn "neon_vst1_lane<mode>"
+ [(set (mem:<V_elem> (match_operand:SI 0 "s_register_operand" "r"))
+ (vec_select:<V_elem>
+ (match_operand:VQX 1 "s_register_operand" "w")
+ (parallel [(match_operand:SI 2 "neon_lane_number" "i")])))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[2]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ int regno = REGNO (operands[1]);
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ else if (lane >= max / 2)
+ {
+ lane -= max / 2;
+ regno += 2;
+ operands[2] = GEN_INT (lane);
+ }
+ operands[1] = gen_rtx_REG (<V_HALF>mode, regno);
+ if (max == 2)
+ return "vst1.<V_sz_elem>\t{%P1}, [%0]";
+ else
+ return "vst1.<V_sz_elem>\t{%P1[%c2]}, [%0]";
+}
+ [(set_attr "neon_type" "neon_vst1_vst2_lane")]
+)
+
+(define_insn "neon_vld2<mode>"
+ [(set (match_operand:TI 0 "s_register_operand" "=w")
+ (unspec:TI [(mem:TI (match_operand:SI 1 "s_register_operand" "r"))
+ (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD2))]
+ "TARGET_NEON"
+{
+ if (<V_sz_elem> == 64)
+ return "vld1.64\t%h0, [%1]";
+ else
+ return "vld2.<V_sz_elem>\t%h0, [%1]";
+}
+ [(set (attr "neon_type")
+ (if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
+ (const_string "neon_vld1_1_2_regs")
+ (const_string "neon_vld2_2_regs_vld1_vld2_all_lanes")))]
+)
+
+(define_insn "neon_vld2<mode>"
+ [(set (match_operand:OI 0 "s_register_operand" "=w")
+ (unspec:OI [(mem:OI (match_operand:SI 1 "s_register_operand" "r"))
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD2))]
+ "TARGET_NEON"
+ "vld2.<V_sz_elem>\t%h0, [%1]"
+ [(set_attr "neon_type" "neon_vld2_2_regs_vld1_vld2_all_lanes")])
+
+(define_insn "neon_vld2_lane<mode>"
+ [(set (match_operand:TI 0 "s_register_operand" "=w")
+ (unspec:TI [(mem:<V_two_elem> (match_operand:SI 1 "s_register_operand" "r"))
+ (match_operand:TI 2 "s_register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD2_LANE))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[3]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ int regno = REGNO (operands[0]);
+ rtx ops[4];
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ ops[0] = gen_rtx_REG (DImode, regno);
+ ops[1] = gen_rtx_REG (DImode, regno + 2);
+ ops[2] = operands[1];
+ ops[3] = operands[3];
+ output_asm_insn ("vld2.<V_sz_elem>\t{%P0[%c3], %P1[%c3]}, [%2]", ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vld1_vld2_lane")]
+)
+
+(define_insn "neon_vld2_lane<mode>"
+ [(set (match_operand:OI 0 "s_register_operand" "=w")
+ (unspec:OI [(mem:<V_two_elem> (match_operand:SI 1 "s_register_operand" "r"))
+ (match_operand:OI 2 "s_register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD2_LANE))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[3]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ int regno = REGNO (operands[0]);
+ rtx ops[4];
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ else if (lane >= max / 2)
+ {
+ lane -= max / 2;
+ regno += 2;
+ }
+ ops[0] = gen_rtx_REG (DImode, regno);
+ ops[1] = gen_rtx_REG (DImode, regno + 4);
+ ops[2] = operands[1];
+ ops[3] = GEN_INT (lane);
+ output_asm_insn ("vld2.<V_sz_elem>\t{%P0[%c3], %P1[%c3]}, [%2]", ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vld1_vld2_lane")]
+)
+
+(define_insn "neon_vld2_dup<mode>"
+ [(set (match_operand:TI 0 "s_register_operand" "=w")
+ (unspec:TI [(mem:<V_two_elem> (match_operand:SI 1 "s_register_operand" "r"))
+ (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD2_DUP))]
+ "TARGET_NEON"
+{
+ if (GET_MODE_NUNITS (<MODE>mode) > 1)
+ return "vld2.<V_sz_elem>\t{%e0[], %f0[]}, [%1]";
+ else
+ return "vld1.<V_sz_elem>\t%h0, [%1]";
+}
+ [(set (attr "neon_type")
+ (if_then_else (gt (const_string "<V_mode_nunits>") (const_string "1"))
+ (const_string "neon_vld2_2_regs_vld1_vld2_all_lanes")
+ (const_string "neon_vld1_1_2_regs")))]
+)
+
+(define_insn "neon_vst2<mode>"
+ [(set (mem:TI (match_operand:SI 0 "s_register_operand" "r"))
+ (unspec:TI [(match_operand:TI 1 "s_register_operand" "w")
+ (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST2))]
+ "TARGET_NEON"
+{
+ if (<V_sz_elem> == 64)
+ return "vst1.64\t%h1, [%0]";
+ else
+ return "vst2.<V_sz_elem>\t%h1, [%0]";
+}
+ [(set (attr "neon_type")
+ (if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
+ (const_string "neon_vst1_1_2_regs_vst2_2_regs")
+ (const_string "neon_vst1_1_2_regs_vst2_2_regs")))]
+)
+
+(define_insn "neon_vst2<mode>"
+ [(set (mem:OI (match_operand:SI 0 "s_register_operand" "r"))
+ (unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST2))]
+ "TARGET_NEON"
+ "vst2.<V_sz_elem>\t%h1, [%0]"
+ [(set_attr "neon_type" "neon_vst1_1_2_regs_vst2_2_regs")]
+)
+
+(define_insn "neon_vst2_lane<mode>"
+ [(set (mem:<V_two_elem> (match_operand:SI 0 "s_register_operand" "r"))
+ (unspec:<V_two_elem>
+ [(match_operand:TI 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST2_LANE))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[2]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ int regno = REGNO (operands[1]);
+ rtx ops[4];
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (DImode, regno);
+ ops[2] = gen_rtx_REG (DImode, regno + 2);
+ ops[3] = operands[2];
+ output_asm_insn ("vst2.<V_sz_elem>\t{%P1[%c3], %P2[%c3]}, [%0]", ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vst1_vst2_lane")]
+)
+
+(define_insn "neon_vst2_lane<mode>"
+ [(set (mem:<V_two_elem> (match_operand:SI 0 "s_register_operand" "r"))
+ (unspec:<V_two_elem>
+ [(match_operand:OI 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST2_LANE))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[2]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ int regno = REGNO (operands[1]);
+ rtx ops[4];
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ else if (lane >= max / 2)
+ {
+ lane -= max / 2;
+ regno += 2;
+ }
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (DImode, regno);
+ ops[2] = gen_rtx_REG (DImode, regno + 4);
+ ops[3] = GEN_INT (lane);
+ output_asm_insn ("vst2.<V_sz_elem>\t{%P1[%c3], %P2[%c3]}, [%0]", ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vst1_vst2_lane")]
+)
+
+(define_insn "neon_vld3<mode>"
+ [(set (match_operand:EI 0 "s_register_operand" "=w")
+ (unspec:EI [(mem:EI (match_operand:SI 1 "s_register_operand" "r"))
+ (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD3))]
+ "TARGET_NEON"
+{
+ if (<V_sz_elem> == 64)
+ return "vld1.64\t%h0, [%1]";
+ else
+ return "vld3.<V_sz_elem>\t%h0, [%1]";
+}
+ [(set (attr "neon_type")
+ (if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
+ (const_string "neon_vld1_1_2_regs")
+ (const_string "neon_vld3_vld4")))]
+)
+
+(define_expand "neon_vld3<mode>"
+ [(match_operand:CI 0 "s_register_operand" "=w")
+ (match_operand:SI 1 "s_register_operand" "+r")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_NEON"
+{
+ emit_insn (gen_neon_vld3qa<mode> (operands[0], operands[0],
+ operands[1], operands[1]));
+ emit_insn (gen_neon_vld3qb<mode> (operands[0], operands[0],
+ operands[1], operands[1]));
+ DONE;
+})
+
+(define_insn "neon_vld3qa<mode>"
+ [(set (match_operand:CI 0 "s_register_operand" "=w")
+ (unspec:CI [(mem:CI (match_operand:SI 3 "s_register_operand" "2"))
+ (match_operand:CI 1 "s_register_operand" "0")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD3A))
+ (set (match_operand:SI 2 "s_register_operand" "=r")
+ (plus:SI (match_dup 3)
+ (const_int 24)))]
+ "TARGET_NEON"
+{
+ int regno = REGNO (operands[0]);
+ rtx ops[4];
+ ops[0] = gen_rtx_REG (DImode, regno);
+ ops[1] = gen_rtx_REG (DImode, regno + 4);
+ ops[2] = gen_rtx_REG (DImode, regno + 8);
+ ops[3] = operands[2];
+ output_asm_insn ("vld3.<V_sz_elem>\t{%P0, %P1, %P2}, [%3]!", ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vld3_vld4")]
+)
+
+(define_insn "neon_vld3qb<mode>"
+ [(set (match_operand:CI 0 "s_register_operand" "=w")
+ (unspec:CI [(mem:CI (match_operand:SI 3 "s_register_operand" "2"))
+ (match_operand:CI 1 "s_register_operand" "0")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD3B))
+ (set (match_operand:SI 2 "s_register_operand" "=r")
+ (plus:SI (match_dup 3)
+ (const_int 24)))]
+ "TARGET_NEON"
+{
+ int regno = REGNO (operands[0]);
+ rtx ops[4];
+ ops[0] = gen_rtx_REG (DImode, regno + 2);
+ ops[1] = gen_rtx_REG (DImode, regno + 6);
+ ops[2] = gen_rtx_REG (DImode, regno + 10);
+ ops[3] = operands[2];
+ output_asm_insn ("vld3.<V_sz_elem>\t{%P0, %P1, %P2}, [%3]!", ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vld3_vld4")]
+)
+
+(define_insn "neon_vld3_lane<mode>"
+ [(set (match_operand:EI 0 "s_register_operand" "=w")
+ (unspec:EI [(mem:<V_three_elem> (match_operand:SI 1 "s_register_operand" "r"))
+ (match_operand:EI 2 "s_register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD3_LANE))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[3]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ int regno = REGNO (operands[0]);
+ rtx ops[5];
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ ops[0] = gen_rtx_REG (DImode, regno);
+ ops[1] = gen_rtx_REG (DImode, regno + 2);
+ ops[2] = gen_rtx_REG (DImode, regno + 4);
+ ops[3] = operands[1];
+ ops[4] = operands[3];
+ output_asm_insn ("vld3.<V_sz_elem>\t{%P0[%c4], %P1[%c4], %P2[%c4]}, [%3]",
+ ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vld3_vld4_lane")]
+)
+
+(define_insn "neon_vld3_lane<mode>"
+ [(set (match_operand:CI 0 "s_register_operand" "=w")
+ (unspec:CI [(mem:<V_three_elem> (match_operand:SI 1 "s_register_operand" "r"))
+ (match_operand:CI 2 "s_register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD3_LANE))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[3]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ int regno = REGNO (operands[0]);
+ rtx ops[5];
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ else if (lane >= max / 2)
+ {
+ lane -= max / 2;
+ regno += 2;
+ }
+ ops[0] = gen_rtx_REG (DImode, regno);
+ ops[1] = gen_rtx_REG (DImode, regno + 4);
+ ops[2] = gen_rtx_REG (DImode, regno + 8);
+ ops[3] = operands[1];
+ ops[4] = GEN_INT (lane);
+ output_asm_insn ("vld3.<V_sz_elem>\t{%P0[%c4], %P1[%c4], %P2[%c4]}, [%3]",
+ ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vld3_vld4_lane")]
+)
+
+(define_insn "neon_vld3_dup<mode>"
+ [(set (match_operand:EI 0 "s_register_operand" "=w")
+ (unspec:EI [(mem:<V_three_elem> (match_operand:SI 1 "s_register_operand" "r"))
+ (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD3_DUP))]
+ "TARGET_NEON"
+{
+ if (GET_MODE_NUNITS (<MODE>mode) > 1)
+ {
+ int regno = REGNO (operands[0]);
+ rtx ops[4];
+ ops[0] = gen_rtx_REG (DImode, regno);
+ ops[1] = gen_rtx_REG (DImode, regno + 2);
+ ops[2] = gen_rtx_REG (DImode, regno + 4);
+ ops[3] = operands[1];
+ output_asm_insn ("vld3.<V_sz_elem>\t{%P0[], %P1[], %P2[]}, [%3]", ops);
+ return "";
+ }
+ else
+ return "vld1.<V_sz_elem>\t%h0, [%1]";
+}
+ [(set (attr "neon_type")
+ (if_then_else (gt (const_string "<V_mode_nunits>") (const_string "1"))
+ (const_string "neon_vld3_vld4_all_lanes")
+ (const_string "neon_vld1_1_2_regs")))])
+
+(define_insn "neon_vst3<mode>"
+ [(set (mem:EI (match_operand:SI 0 "s_register_operand" "r"))
+ (unspec:EI [(match_operand:EI 1 "s_register_operand" "w")
+ (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST3))]
+ "TARGET_NEON"
+{
+ if (<V_sz_elem> == 64)
+ return "vst1.64\t%h1, [%0]";
+ else
+ return "vst3.<V_sz_elem>\t%h1, [%0]";
+}
+ [(set (attr "neon_type")
+ (if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
+ (const_string "neon_vst1_1_2_regs_vst2_2_regs")
+ (const_string "neon_vst2_4_regs_vst3_vst4")))])
+
+(define_expand "neon_vst3<mode>"
+ [(match_operand:SI 0 "s_register_operand" "+r")
+ (match_operand:CI 1 "s_register_operand" "w")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_NEON"
+{
+ emit_insn (gen_neon_vst3qa<mode> (operands[0], operands[0], operands[1]));
+ emit_insn (gen_neon_vst3qb<mode> (operands[0], operands[0], operands[1]));
+ DONE;
+})
+
+(define_insn "neon_vst3qa<mode>"
+ [(set (mem:EI (match_operand:SI 1 "s_register_operand" "0"))
+ (unspec:EI [(match_operand:CI 2 "s_register_operand" "w")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST3A))
+ (set (match_operand:SI 0 "s_register_operand" "=r")
+ (plus:SI (match_dup 1)
+ (const_int 24)))]
+ "TARGET_NEON"
+{
+ int regno = REGNO (operands[2]);
+ rtx ops[4];
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (DImode, regno);
+ ops[2] = gen_rtx_REG (DImode, regno + 4);
+ ops[3] = gen_rtx_REG (DImode, regno + 8);
+ output_asm_insn ("vst3.<V_sz_elem>\t{%P1, %P2, %P3}, [%0]!", ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vst2_4_regs_vst3_vst4")]
+)
+
+(define_insn "neon_vst3qb<mode>"
+ [(set (mem:EI (match_operand:SI 1 "s_register_operand" "0"))
+ (unspec:EI [(match_operand:CI 2 "s_register_operand" "w")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST3B))
+ (set (match_operand:SI 0 "s_register_operand" "=r")
+ (plus:SI (match_dup 1)
+ (const_int 24)))]
+ "TARGET_NEON"
+{
+ int regno = REGNO (operands[2]);
+ rtx ops[4];
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (DImode, regno + 2);
+ ops[2] = gen_rtx_REG (DImode, regno + 6);
+ ops[3] = gen_rtx_REG (DImode, regno + 10);
+ output_asm_insn ("vst3.<V_sz_elem>\t{%P1, %P2, %P3}, [%0]!", ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vst2_4_regs_vst3_vst4")]
+)
+
+(define_insn "neon_vst3_lane<mode>"
+ [(set (mem:<V_three_elem> (match_operand:SI 0 "s_register_operand" "r"))
+ (unspec:<V_three_elem>
+ [(match_operand:EI 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST3_LANE))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[2]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ int regno = REGNO (operands[1]);
+ rtx ops[5];
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (DImode, regno);
+ ops[2] = gen_rtx_REG (DImode, regno + 2);
+ ops[3] = gen_rtx_REG (DImode, regno + 4);
+ ops[4] = operands[2];
+ output_asm_insn ("vst3.<V_sz_elem>\t{%P1[%c4], %P2[%c4], %P3[%c4]}, [%0]",
+ ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vst3_vst4_lane")]
+)
+
+(define_insn "neon_vst3_lane<mode>"
+ [(set (mem:<V_three_elem> (match_operand:SI 0 "s_register_operand" "r"))
+ (unspec:<V_three_elem>
+ [(match_operand:CI 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST3_LANE))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[2]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ int regno = REGNO (operands[1]);
+ rtx ops[5];
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ else if (lane >= max / 2)
+ {
+ lane -= max / 2;
+ regno += 2;
+ }
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (DImode, regno);
+ ops[2] = gen_rtx_REG (DImode, regno + 4);
+ ops[3] = gen_rtx_REG (DImode, regno + 8);
+ ops[4] = GEN_INT (lane);
+ output_asm_insn ("vst3.<V_sz_elem>\t{%P1[%c4], %P2[%c4], %P3[%c4]}, [%0]",
+ ops);
+ return "";
+}
+[(set_attr "neon_type" "neon_vst3_vst4_lane")])
+
+(define_insn "neon_vld4<mode>"
+ [(set (match_operand:OI 0 "s_register_operand" "=w")
+ (unspec:OI [(mem:OI (match_operand:SI 1 "s_register_operand" "r"))
+ (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD4))]
+ "TARGET_NEON"
+{
+ if (<V_sz_elem> == 64)
+ return "vld1.64\t%h0, [%1]";
+ else
+ return "vld4.<V_sz_elem>\t%h0, [%1]";
+}
+ [(set (attr "neon_type")
+ (if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
+ (const_string "neon_vld1_1_2_regs")
+ (const_string "neon_vld3_vld4")))]
+)
+
+(define_expand "neon_vld4<mode>"
+ [(match_operand:XI 0 "s_register_operand" "=w")
+ (match_operand:SI 1 "s_register_operand" "+r")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_NEON"
+{
+ emit_insn (gen_neon_vld4qa<mode> (operands[0], operands[0],
+ operands[1], operands[1]));
+ emit_insn (gen_neon_vld4qb<mode> (operands[0], operands[0],
+ operands[1], operands[1]));
+ DONE;
+})
+
+(define_insn "neon_vld4qa<mode>"
+ [(set (match_operand:XI 0 "s_register_operand" "=w")
+ (unspec:XI [(mem:XI (match_operand:SI 3 "s_register_operand" "2"))
+ (match_operand:XI 1 "s_register_operand" "0")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD4A))
+ (set (match_operand:SI 2 "s_register_operand" "=r")
+ (plus:SI (match_dup 3)
+ (const_int 32)))]
+ "TARGET_NEON"
+{
+ int regno = REGNO (operands[0]);
+ rtx ops[5];
+ ops[0] = gen_rtx_REG (DImode, regno);
+ ops[1] = gen_rtx_REG (DImode, regno + 4);
+ ops[2] = gen_rtx_REG (DImode, regno + 8);
+ ops[3] = gen_rtx_REG (DImode, regno + 12);
+ ops[4] = operands[2];
+ output_asm_insn ("vld4.<V_sz_elem>\t{%P0, %P1, %P2, %P3}, [%4]!", ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vld3_vld4")]
+)
+
+(define_insn "neon_vld4qb<mode>"
+ [(set (match_operand:XI 0 "s_register_operand" "=w")
+ (unspec:XI [(mem:XI (match_operand:SI 3 "s_register_operand" "2"))
+ (match_operand:XI 1 "s_register_operand" "0")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD4B))
+ (set (match_operand:SI 2 "s_register_operand" "=r")
+ (plus:SI (match_dup 3)
+ (const_int 32)))]
+ "TARGET_NEON"
+{
+ int regno = REGNO (operands[0]);
+ rtx ops[5];
+ ops[0] = gen_rtx_REG (DImode, regno + 2);
+ ops[1] = gen_rtx_REG (DImode, regno + 6);
+ ops[2] = gen_rtx_REG (DImode, regno + 10);
+ ops[3] = gen_rtx_REG (DImode, regno + 14);
+ ops[4] = operands[2];
+ output_asm_insn ("vld4.<V_sz_elem>\t{%P0, %P1, %P2, %P3}, [%4]!", ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vld3_vld4")]
+)
+
+(define_insn "neon_vld4_lane<mode>"
+ [(set (match_operand:OI 0 "s_register_operand" "=w")
+ (unspec:OI [(mem:<V_four_elem> (match_operand:SI 1 "s_register_operand" "r"))
+ (match_operand:OI 2 "s_register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD4_LANE))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[3]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ int regno = REGNO (operands[0]);
+ rtx ops[6];
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ ops[0] = gen_rtx_REG (DImode, regno);
+ ops[1] = gen_rtx_REG (DImode, regno + 2);
+ ops[2] = gen_rtx_REG (DImode, regno + 4);
+ ops[3] = gen_rtx_REG (DImode, regno + 6);
+ ops[4] = operands[1];
+ ops[5] = operands[3];
+ output_asm_insn ("vld4.<V_sz_elem>\t{%P0[%c5], %P1[%c5], %P2[%c5], %P3[%c5]}, [%4]",
+ ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vld3_vld4_lane")]
+)
+
+(define_insn "neon_vld4_lane<mode>"
+ [(set (match_operand:XI 0 "s_register_operand" "=w")
+ (unspec:XI [(mem:<V_four_elem> (match_operand:SI 1 "s_register_operand" "r"))
+ (match_operand:XI 2 "s_register_operand" "0")
+ (match_operand:SI 3 "immediate_operand" "i")
+ (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD4_LANE))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[3]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ int regno = REGNO (operands[0]);
+ rtx ops[6];
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ else if (lane >= max / 2)
+ {
+ lane -= max / 2;
+ regno += 2;
+ }
+ ops[0] = gen_rtx_REG (DImode, regno);
+ ops[1] = gen_rtx_REG (DImode, regno + 4);
+ ops[2] = gen_rtx_REG (DImode, regno + 8);
+ ops[3] = gen_rtx_REG (DImode, regno + 12);
+ ops[4] = operands[1];
+ ops[5] = GEN_INT (lane);
+ output_asm_insn ("vld4.<V_sz_elem>\t{%P0[%c5], %P1[%c5], %P2[%c5], %P3[%c5]}, [%4]",
+ ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vld3_vld4_lane")]
+)
+
+(define_insn "neon_vld4_dup<mode>"
+ [(set (match_operand:OI 0 "s_register_operand" "=w")
+ (unspec:OI [(mem:<V_four_elem> (match_operand:SI 1 "s_register_operand" "r"))
+ (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VLD4_DUP))]
+ "TARGET_NEON"
+{
+ if (GET_MODE_NUNITS (<MODE>mode) > 1)
+ {
+ int regno = REGNO (operands[0]);
+ rtx ops[5];
+ ops[0] = gen_rtx_REG (DImode, regno);
+ ops[1] = gen_rtx_REG (DImode, regno + 2);
+ ops[2] = gen_rtx_REG (DImode, regno + 4);
+ ops[3] = gen_rtx_REG (DImode, regno + 6);
+ ops[4] = operands[1];
+ output_asm_insn ("vld4.<V_sz_elem>\t{%P0[], %P1[], %P2[], %P3[]}, [%4]",
+ ops);
+ return "";
+ }
+ else
+ return "vld1.<V_sz_elem>\t%h0, [%1]";
+}
+ [(set (attr "neon_type")
+ (if_then_else (gt (const_string "<V_mode_nunits>") (const_string "1"))
+ (const_string "neon_vld3_vld4_all_lanes")
+ (const_string "neon_vld1_1_2_regs")))]
+)
+
+(define_insn "neon_vst4<mode>"
+ [(set (mem:OI (match_operand:SI 0 "s_register_operand" "r"))
+ (unspec:OI [(match_operand:OI 1 "s_register_operand" "w")
+ (unspec:VDX [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST4))]
+ "TARGET_NEON"
+{
+ if (<V_sz_elem> == 64)
+ return "vst1.64\t%h1, [%0]";
+ else
+ return "vst4.<V_sz_elem>\t%h1, [%0]";
+}
+ [(set (attr "neon_type")
+ (if_then_else (eq (const_string "<V_sz_elem>") (const_string "64"))
+ (const_string "neon_vst1_1_2_regs_vst2_2_regs")
+ (const_string "neon_vst2_4_regs_vst3_vst4")))]
+)
+
+(define_expand "neon_vst4<mode>"
+ [(match_operand:SI 0 "s_register_operand" "+r")
+ (match_operand:XI 1 "s_register_operand" "w")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ "TARGET_NEON"
+{
+ emit_insn (gen_neon_vst4qa<mode> (operands[0], operands[0], operands[1]));
+ emit_insn (gen_neon_vst4qb<mode> (operands[0], operands[0], operands[1]));
+ DONE;
+})
+
+(define_insn "neon_vst4qa<mode>"
+ [(set (mem:OI (match_operand:SI 1 "s_register_operand" "0"))
+ (unspec:OI [(match_operand:XI 2 "s_register_operand" "w")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST4A))
+ (set (match_operand:SI 0 "s_register_operand" "=r")
+ (plus:SI (match_dup 1)
+ (const_int 32)))]
+ "TARGET_NEON"
+{
+ int regno = REGNO (operands[2]);
+ rtx ops[5];
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (DImode, regno);
+ ops[2] = gen_rtx_REG (DImode, regno + 4);
+ ops[3] = gen_rtx_REG (DImode, regno + 8);
+ ops[4] = gen_rtx_REG (DImode, regno + 12);
+ output_asm_insn ("vst4.<V_sz_elem>\t{%P1, %P2, %P3, %P4}, [%0]!", ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vst2_4_regs_vst3_vst4")]
+)
+
+(define_insn "neon_vst4qb<mode>"
+ [(set (mem:OI (match_operand:SI 1 "s_register_operand" "0"))
+ (unspec:OI [(match_operand:XI 2 "s_register_operand" "w")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST4B))
+ (set (match_operand:SI 0 "s_register_operand" "=r")
+ (plus:SI (match_dup 1)
+ (const_int 32)))]
+ "TARGET_NEON"
+{
+ int regno = REGNO (operands[2]);
+ rtx ops[5];
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (DImode, regno + 2);
+ ops[2] = gen_rtx_REG (DImode, regno + 6);
+ ops[3] = gen_rtx_REG (DImode, regno + 10);
+ ops[4] = gen_rtx_REG (DImode, regno + 14);
+ output_asm_insn ("vst4.<V_sz_elem>\t{%P1, %P2, %P3, %P4}, [%0]!", ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vst2_4_regs_vst3_vst4")]
+)
+
+(define_insn "neon_vst4_lane<mode>"
+ [(set (mem:<V_four_elem> (match_operand:SI 0 "s_register_operand" "r"))
+ (unspec:<V_four_elem>
+ [(match_operand:OI 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (unspec:VD [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST4_LANE))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[2]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ int regno = REGNO (operands[1]);
+ rtx ops[6];
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (DImode, regno);
+ ops[2] = gen_rtx_REG (DImode, regno + 2);
+ ops[3] = gen_rtx_REG (DImode, regno + 4);
+ ops[4] = gen_rtx_REG (DImode, regno + 6);
+ ops[5] = operands[2];
+ output_asm_insn ("vst4.<V_sz_elem>\t{%P1[%c5], %P2[%c5], %P3[%c5], %P4[%c5]}, [%0]",
+ ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vst3_vst4_lane")]
+)
+
+(define_insn "neon_vst4_lane<mode>"
+ [(set (mem:<V_four_elem> (match_operand:SI 0 "s_register_operand" "r"))
+ (unspec:<V_four_elem>
+ [(match_operand:XI 1 "s_register_operand" "w")
+ (match_operand:SI 2 "immediate_operand" "i")
+ (unspec:VMQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)]
+ UNSPEC_VST4_LANE))]
+ "TARGET_NEON"
+{
+ HOST_WIDE_INT lane = INTVAL (operands[2]);
+ HOST_WIDE_INT max = GET_MODE_NUNITS (<MODE>mode);
+ int regno = REGNO (operands[1]);
+ rtx ops[6];
+ if (lane < 0 || lane >= max)
+ error ("lane out of range");
+ else if (lane >= max / 2)
+ {
+ lane -= max / 2;
+ regno += 2;
+ }
+ ops[0] = operands[0];
+ ops[1] = gen_rtx_REG (DImode, regno);
+ ops[2] = gen_rtx_REG (DImode, regno + 4);
+ ops[3] = gen_rtx_REG (DImode, regno + 8);
+ ops[4] = gen_rtx_REG (DImode, regno + 12);
+ ops[5] = GEN_INT (lane);
+ output_asm_insn ("vst4.<V_sz_elem>\t{%P1[%c5], %P2[%c5], %P3[%c5], %P4[%c5]}, [%0]",
+ ops);
+ return "";
+}
+ [(set_attr "neon_type" "neon_vst3_vst4_lane")]
+)
+
+(define_expand "neon_vand<mode>"
+ [(match_operand:VDQX 0 "s_register_operand" "")
+ (match_operand:VDQX 1 "s_register_operand" "")
+ (match_operand:VDQX 2 "neon_inv_logic_op2" "")
+ (match_operand:SI 3 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ emit_insn (gen_and<mode>3<V_suf64> (operands[0], operands[1], operands[2]));
+ DONE;
+})
+
+(define_expand "neon_vorr<mode>"
+ [(match_operand:VDQX 0 "s_register_operand" "")
+ (match_operand:VDQX 1 "s_register_operand" "")
+ (match_operand:VDQX 2 "neon_logic_op2" "")
+ (match_operand:SI 3 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ emit_insn (gen_ior<mode>3<V_suf64> (operands[0], operands[1], operands[2]));
+ DONE;
+})
+
+(define_expand "neon_veor<mode>"
+ [(match_operand:VDQX 0 "s_register_operand" "")
+ (match_operand:VDQX 1 "s_register_operand" "")
+ (match_operand:VDQX 2 "s_register_operand" "")
+ (match_operand:SI 3 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ emit_insn (gen_xor<mode>3<V_suf64> (operands[0], operands[1], operands[2]));
+ DONE;
+})
+
+(define_expand "neon_vbic<mode>"
+ [(match_operand:VDQX 0 "s_register_operand" "")
+ (match_operand:VDQX 1 "s_register_operand" "")
+ (match_operand:VDQX 2 "neon_logic_op2" "")
+ (match_operand:SI 3 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ emit_insn (gen_bic<mode>3_neon (operands[0], operands[1], operands[2]));
+ DONE;
+})
+
+(define_expand "neon_vorn<mode>"
+ [(match_operand:VDQX 0 "s_register_operand" "")
+ (match_operand:VDQX 1 "s_register_operand" "")
+ (match_operand:VDQX 2 "neon_inv_logic_op2" "")
+ (match_operand:SI 3 "immediate_operand" "")]
+ "TARGET_NEON"
+{
+ emit_insn (gen_orn<mode>3_neon (operands[0], operands[1], operands[2]));
+ DONE;
+})
+
+;; APPLE LOCAL 6150859 begin use NEON instructions for SF math
+;; When possible, use the NEON instructions for single precision floating
+;; point operations. On NEON CPUs, the VFP instructions are not scoreboarded,
+;; so they perform poorly compared to the NEON ones. We use 32x2 vector
+;; instructions and just ignore the upper values.
+
+(define_insn "*addsf3_neon"
+ [(set (match_operand:SF 0 "s_register_operand" "=t")
+ (plus:SF (match_operand:SF 1 "s_register_operand" "t")
+ (match_operand:SF 2 "s_register_operand" "t")))]
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_NEON"
+ "vadd.f32\\t%p0, %p1, %p2"
+ [(set_attr "neon_type" "neon_fp_vadd_ddd_vabs_dd")]
+)
+
+(define_insn "*subsf3_neon"
+ [(set (match_operand:SF 0 "s_register_operand" "=t")
+ (minus:SF (match_operand:SF 1 "s_register_operand" "t")
+ (match_operand:SF 2 "s_register_operand" "t")))]
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_NEON"
+ "vsub.f32\\t%p0, %p1, %p2"
+ [(set_attr "neon_type" "neon_fp_vadd_ddd_vabs_dd")]
+)
+
+(define_insn "*mulsf3_neon"
+ [(set (match_operand:SF 0 "s_register_operand" "+t")
+ (mult:SF (match_operand:SF 1 "s_register_operand" "t")
+ (match_operand:SF 2 "s_register_operand" "t")))]
+ "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_NEON"
+ "vmul.f32\\t%p0, %p1, %p2"
+ [(set_attr "neon_type" "neon_fp_vadd_ddd_vabs_dd")]
+)
+
+;; APPLE LOCAL begin 6197406 disable vmla.f32 and vmls.f32
+;; The multiply-accumulate (vmla.f32) and multiply-subtract (vmls.f32)
+;; instructions cause a pipeline flush, so they are not useful in
+;; general. They are disabled for now.
+;; Multiply-accumulate insns
+;; 0 = 1 * 2 + 0
+; (define_insn "*mulsf3addsf_neon"
+; [(set (match_operand:SF 0 "s_register_operand" "=t")
+; (plus:SF (mult:SF (match_operand:SF 2 "s_register_operand" "t")
+; (match_operand:SF 3 "s_register_operand" "t"))
+; (match_operand:SF 1 "s_register_operand" "0")))]
+; "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_NEON"
+; "vmla.f32\\t%p0, %p2, %p3"
+; [(set_attr "neon_type" "neon_fp_vmla_ddd")]
+; )
+
+;; APPLE LOCAL begin 6251664 reversed operands for vmls.f32
+;; 0 = 0 - (1 * 2)
+; (define_insn "*mulsf3subsf_neon"
+; [(set (match_operand:SF 0 "s_register_operand" "=t")
+; (minus:SF (match_operand:SF 1 "s_register_operand" "0")
+; (mult:SF (match_operand:SF 2 "s_register_operand" "t")
+; (match_operand:SF 3 "s_register_operand" "t"))))]
+; "TARGET_32BIT && TARGET_HARD_FLOAT && TARGET_NEON"
+; "vmls.f32\\t%p0, %p2, %p3"
+; [(set_attr "neon_type" "neon_fp_vmla_ddd")]
+; )
+;; APPLE LOCAL end 6251664 reversed operands for vmls.f32
+;; APPLE LOCAL end 6197406 disable vmla.f32 and vmls.f32
+;; APPLE LOCAL 6150859 end use NEON instructions for SF math
+
Added: llvm-gcc-4.2/trunk/gcc/config/arm/neon.ml
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/neon.ml?rev=76781&view=auto
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/neon.ml (added)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/neon.ml Wed Jul 22 15:36:27 2009
@@ -0,0 +1,1827 @@
+(* APPLE LOCAL file v7 support. Merge from Codesourcery *)
+(* Common code for ARM NEON header file, documentation and test case
+ generators.
+
+ Copyright (C) 2006 Free Software Foundation, Inc.
+ Contributed by CodeSourcery.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify it under
+ the terms of the GNU General Public License as published by the Free
+ Software Foundation; either version 2, or (at your option) any later
+ version.
+
+ GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+ WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with GCC; see the file COPYING. If not, write to the Free
+ Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
+ 02110-1301, USA. *)
+
+(* Shorthand types for vector elements. *)
+type elts = S8 | S16 | S32 | S64 | F32 | U8 | U16 | U32 | U64 | P8 | P16
+ | I8 | I16 | I32 | I64 | B8 | B16 | B32 | B64 | Conv of elts * elts
+ | Cast of elts * elts | NoElts
+
+type eltclass = Signed | Unsigned | Float | Poly | Int | Bits
+ | ConvClass of eltclass * eltclass | NoType
+
+(* These vector types correspond directly to C types. *)
+type vectype = T_int8x8 | T_int8x16
+ | T_int16x4 | T_int16x8
+ | T_int32x2 | T_int32x4
+ | T_int64x1 | T_int64x2
+ | T_uint8x8 | T_uint8x16
+ | T_uint16x4 | T_uint16x8
+ | T_uint32x2 | T_uint32x4
+ | T_uint64x1 | T_uint64x2
+ | T_float32x2 | T_float32x4
+ | T_poly8x8 | T_poly8x16
+ | T_poly16x4 | T_poly16x8
+ | T_immediate of int * int
+ | T_int8 | T_int16
+ | T_int32 | T_int64
+ | T_uint8 | T_uint16
+ | T_uint32 | T_uint64
+ | T_poly8 | T_poly16
+ | T_float32 | T_arrayof of int * vectype
+ | T_ptrto of vectype | T_const of vectype
+ | T_void | T_intQI
+ | T_intHI | T_intSI
+ | T_intDI
+
+(* The meanings of the following are:
+ TImode : "Tetra", two registers (four words).
+ EImode : "hExa", three registers (six words).
+ OImode : "Octa", four registers (eight words).
+ CImode : "dodeCa", six registers (twelve words).
+ XImode : "heXadeca", eight registers (sixteen words).
+*)
+
+type inttype = B_TImode | B_EImode | B_OImode | B_CImode | B_XImode
+
+type shape_elt = Dreg | Qreg | Corereg | Immed | VecArray of int * shape_elt
+ | PtrTo of shape_elt | CstPtrTo of shape_elt
+ (* These next ones are used only in the test generator. *)
+ | Element_of_dreg (* Used for "lane" variants. *)
+ | Element_of_qreg (* Likewise. *)
+ | All_elements_of_dreg (* Used for "dup" variants. *)
+
+type shape_form = All of int * shape_elt
+ | Long
+ | Long_noreg of shape_elt
+ | Wide
+ | Wide_noreg of shape_elt
+ | Narrow
+ | Long_imm
+ | Narrow_imm
+ | Binary_imm of shape_elt
+ | Use_operands of shape_elt array
+ | By_scalar of shape_elt
+ | Unary_scalar of shape_elt
+ | Wide_lane
+ | Wide_scalar
+ | Pair_result of shape_elt
+
+type arity = Arity0 of vectype
+ | Arity1 of vectype * vectype
+ | Arity2 of vectype * vectype * vectype
+ | Arity3 of vectype * vectype * vectype * vectype
+ | Arity4 of vectype * vectype * vectype * vectype * vectype
+
+type vecmode = V8QI | V4HI | V2SI | V2SF | DI
+ | V16QI | V8HI | V4SI | V4SF | V2DI
+ | QI | HI | SI | SF
+
+type opcode =
+ (* Binary ops. *)
+ Vadd
+ | Vmul
+ | Vmla
+ | Vmls
+ | Vsub
+ | Vceq
+ | Vcge
+ | Vcgt
+ | Vcle
+ | Vclt
+ | Vcage
+ | Vcagt
+ | Vcale
+ | Vcalt
+ | Vtst
+ | Vabd
+ | Vaba
+ | Vmax
+ | Vmin
+ | Vpadd
+ | Vpada
+ | Vpmax
+ | Vpmin
+ | Vrecps
+ | Vrsqrts
+ | Vshl
+ | Vshr_n
+ | Vshl_n
+ | Vsra_n
+ | Vsri
+ | Vsli
+ (* Logic binops. *)
+ | Vand
+ | Vorr
+ | Veor
+ | Vbic
+ | Vorn
+ | Vbsl
+ (* Ops with scalar. *)
+ | Vmul_lane
+ | Vmla_lane
+ | Vmls_lane
+ | Vmul_n
+ | Vmla_n
+ | Vmls_n
+ | Vmull_n
+ | Vmull_lane
+ | Vqdmull_n
+ | Vqdmull_lane
+ | Vqdmulh_n
+ | Vqdmulh_lane
+ (* Unary ops. *)
+ | Vabs
+ | Vneg
+ | Vcls
+ | Vclz
+ | Vcnt
+ | Vrecpe
+ | Vrsqrte
+ | Vmvn
+ (* Vector extract. *)
+ | Vext
+ (* Reverse elements. *)
+ | Vrev64
+ | Vrev32
+ | Vrev16
+ (* Transposition ops. *)
+ | Vtrn
+ | Vzip
+ | Vuzp
+ (* Loads and stores (VLD1/VST1/VLD2...), elements and structures. *)
+ | Vldx of int
+ | Vstx of int
+ | Vldx_lane of int
+ | Vldx_dup of int
+ | Vstx_lane of int
+ (* Set/extract lanes from a vector. *)
+ | Vget_lane
+ | Vset_lane
+ (* Initialise vector from bit pattern. *)
+ | Vcreate
+ (* Set all lanes to same value. *)
+ | Vdup_n
+ | Vmov_n (* Is this the same? *)
+ (* Duplicate scalar to all lanes of vector. *)
+ | Vdup_lane
+ (* Combine vectors. *)
+ | Vcombine
+ (* Get quadword high/low parts. *)
+ | Vget_high
+ | Vget_low
+ (* Convert vectors. *)
+ | Vcvt
+ | Vcvt_n
+ (* Narrow/lengthen vectors. *)
+ | Vmovn
+ | Vmovl
+ (* Table lookup. *)
+ | Vtbl of int
+ | Vtbx of int
+ (* Reinterpret casts. *)
+ | Vreinterp
+
+(* Features used for documentation, to distinguish between some instruction
+ variants, and to signal special requirements (e.g. swapping arguments). *)
+
+type features =
+ Halving
+ | Rounding
+ | Saturating
+ | Dst_unsign
+ | High_half
+ | Doubling
+ | Flipped of string (* Builtin name to use with flipped arguments. *)
+ | InfoWord (* Pass an extra word for signage/rounding etc. (always passed
+ for All _, Long, Wide, Narrow shape_forms). *)
+ | ReturnPtr (* Pass explicit pointer to return value as first argument. *)
+ (* A specification as to the shape of instruction expected upon
+ disassembly, used if it differs from the shape used to build the
+ intrinsic prototype. Multiple entries in the constructor's argument
+ indicate that the intrinsic expands to more than one assembly
+ instruction, each with a corresponding shape specified here. *)
+ | Disassembles_as of shape_form list
+ | Builtin_name of string (* Override the name of the builtin. *)
+ (* Override the name of the instruction. If more than one name
+ is specified, it means that the instruction can have any of those
+ names. *)
+ | Instruction_name of string list
+ (* Mark that the intrinsic yields no instructions, or expands to yield
+ behaviour that the test generator cannot test. *)
+ | No_op
+ (* Mark that the intrinsic has constant arguments that cannot be set
+ to the defaults (zero for pointers and one otherwise) in the test
+ cases. The function supplied must return the integer to be written
+ into the testcase for the argument number (0-based) supplied to it. *)
+ | Const_valuator of (int -> int)
+
+exception MixedMode of elts * elts
+
+let rec elt_width = function
+ S8 | U8 | P8 | I8 | B8 -> 8
+ | S16 | U16 | P16 | I16 | B16 -> 16
+ | S32 | F32 | U32 | I32 | B32 -> 32
+ | S64 | U64 | I64 | B64 -> 64
+ | Conv (a, b) ->
+ let wa = elt_width a and wb = elt_width b in
+ if wa = wb then wa else failwith "element width?"
+ | Cast (a, b) -> raise (MixedMode (a, b))
+ | NoElts -> failwith "No elts"
+
+let rec elt_class = function
+ S8 | S16 | S32 | S64 -> Signed
+ | U8 | U16 | U32 | U64 -> Unsigned
+ | P8 | P16 -> Poly
+ | F32 -> Float
+ | I8 | I16 | I32 | I64 -> Int
+ | B8 | B16 | B32 | B64 -> Bits
+ | Conv (a, b) | Cast (a, b) -> ConvClass (elt_class a, elt_class b)
+ | NoElts -> NoType
+
+let elt_of_class_width c w =
+ match c, w with
+ Signed, 8 -> S8
+ | Signed, 16 -> S16
+ | Signed, 32 -> S32
+ | Signed, 64 -> S64
+ | Float, 32 -> F32
+ | Unsigned, 8 -> U8
+ | Unsigned, 16 -> U16
+ | Unsigned, 32 -> U32
+ | Unsigned, 64 -> U64
+ | Poly, 8 -> P8
+ | Poly, 16 -> P16
+ | Int, 8 -> I8
+ | Int, 16 -> I16
+ | Int, 32 -> I32
+ | Int, 64 -> I64
+ | Bits, 8 -> B8
+ | Bits, 16 -> B16
+ | Bits, 32 -> B32
+ | Bits, 64 -> B64
+ | _ -> failwith "Bad element type"
+
+(* Return unsigned integer element the same width as argument. *)
+let unsigned_of_elt elt =
+ elt_of_class_width Unsigned (elt_width elt)
+
+let signed_of_elt elt =
+ elt_of_class_width Signed (elt_width elt)
+
+(* Return untyped bits element the same width as argument. *)
+let bits_of_elt elt =
+ elt_of_class_width Bits (elt_width elt)
+
+let non_signed_variant = function
+ S8 -> I8
+ | S16 -> I16
+ | S32 -> I32
+ | S64 -> I64
+ | U8 -> I8
+ | U16 -> I16
+ | U32 -> I32
+ | U64 -> I64
+ | x -> x
+
+let poly_unsigned_variant v =
+ let elclass = match elt_class v with
+ Poly -> Unsigned
+ | x -> x in
+ elt_of_class_width elclass (elt_width v)
+
+let widen_elt elt =
+ let w = elt_width elt
+ and c = elt_class elt in
+ elt_of_class_width c (w * 2)
+
+let narrow_elt elt =
+ let w = elt_width elt
+ and c = elt_class elt in
+ elt_of_class_width c (w / 2)
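[Illustration, not part of the committed file: the element algebra used by widen_elt and narrow_elt, with bindings invented for the example.]

    let _w = widen_elt S16        (* S32: same class, double the width *)
    let _n = narrow_elt U64       (* U32: same class, half the width *)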
+
+(* If we're trying to find a mode from a "Use_operands" instruction, use the
+ last vector operand as the dominant mode used to invoke the correct builtin.
+ We must stick to this rule in neon.md. *)
+let find_key_operand operands =
+ let rec scan opno =
+ match operands.(opno) with
+ Qreg -> Qreg
+ | Dreg -> Dreg
+ | VecArray (_, Qreg) -> Qreg
+ | VecArray (_, Dreg) -> Dreg
+ | _ -> scan (opno-1)
+ in
+ scan ((Array.length operands) - 1)
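[Illustration, not part of the committed file: a minimal sketch of the scan rule, using a made-up operand array. The scan starts at the last operand and walks backwards until it finds a D or Q register (or an array of them), so a trailing immediate is skipped.]

    let _key = find_key_operand [| Dreg; Qreg; Immed |]   (* = Qreg *)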
+
+let rec mode_of_elt elt shape =
+ let flt = match elt_class elt with
+ Float | ConvClass(_, Float) -> true | _ -> false in
+ let idx =
+ match elt_width elt with
+ 8 -> 0 | 16 -> 1 | 32 -> 2 | 64 -> 3
+ | _ -> failwith "Bad element width"
+ in match shape with
+ All (_, Dreg) | By_scalar Dreg | Pair_result Dreg | Unary_scalar Dreg
+ | Binary_imm Dreg | Long_noreg Dreg | Wide_noreg Dreg ->
+ [| V8QI; V4HI; if flt then V2SF else V2SI; DI |].(idx)
+ | All (_, Qreg) | By_scalar Qreg | Pair_result Qreg | Unary_scalar Qreg
+ | Binary_imm Qreg | Long_noreg Qreg | Wide_noreg Qreg ->
+ [| V16QI; V8HI; if flt then V4SF else V4SI; V2DI |].(idx)
+ | All (_, (Corereg | PtrTo _ | CstPtrTo _)) ->
+ [| QI; HI; if flt then SF else SI; DI |].(idx)
+ | Long | Wide | Wide_lane | Wide_scalar
+ | Long_imm ->
+ [| V8QI; V4HI; V2SI; DI |].(idx)
+ | Narrow | Narrow_imm -> [| V16QI; V8HI; V4SI; V2DI |].(idx)
+ | Use_operands ops -> mode_of_elt elt (All (0, (find_key_operand ops)))
+ | _ -> failwith "invalid shape"
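[Illustration, not part of the committed file: a few worked lookups through mode_of_elt, with binding names invented for the example. A 32-bit float element in a D-register shape picks V2SF, a 16-bit element in a Q-register shape picks V8HI, and a Narrow shape keys off the wider source vector, so an 8-bit element gives V16QI.]

    let _m1 = mode_of_elt F32 (All (2, Dreg))   (* = V2SF *)
    let _m2 = mode_of_elt S16 (All (8, Qreg))   (* = V8HI *)
    let _m3 = mode_of_elt U8 Narrow             (* = V16QI *)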
+
+(* Modify an element type dependent on the shape of the instruction and the
+ operand number. *)
+
+let shapemap shape no =
+ let ident = fun x -> x in
+ match shape with
+ All _ | Use_operands _ | By_scalar _ | Pair_result _ | Unary_scalar _
+ | Binary_imm _ -> ident
+ | Long | Long_noreg _ | Wide_scalar | Long_imm ->
+ [| widen_elt; ident; ident |].(no)
+ | Wide | Wide_noreg _ -> [| widen_elt; widen_elt; ident |].(no)
+ | Wide_lane -> [| widen_elt; ident; ident; ident |].(no)
+ | Narrow | Narrow_imm -> [| narrow_elt; ident; ident |].(no)
+
+(* Register type (D/Q) of an operand, based on shape and operand number. *)
+
+let regmap shape no =
+ match shape with
+ All (_, reg) | Long_noreg reg | Wide_noreg reg -> reg
+ | Long -> [| Qreg; Dreg; Dreg |].(no)
+ | Wide -> [| Qreg; Qreg; Dreg |].(no)
+ | Narrow -> [| Dreg; Qreg; Qreg |].(no)
+ | Wide_lane -> [| Qreg; Dreg; Dreg; Immed |].(no)
+ | Wide_scalar -> [| Qreg; Dreg; Corereg |].(no)
+ | By_scalar reg -> [| reg; reg; Dreg; Immed |].(no)
+ | Unary_scalar reg -> [| reg; Dreg; Immed |].(no)
+ | Pair_result reg -> [| VecArray (2, reg); reg; reg |].(no)
+ | Binary_imm reg -> [| reg; reg; Immed |].(no)
+ | Long_imm -> [| Qreg; Dreg; Immed |].(no)
+ | Narrow_imm -> [| Dreg; Qreg; Immed |].(no)
+ | Use_operands these -> these.(no)
+
+let type_for_elt shape elt no =
+ let elt = (shapemap shape no) elt in
+ let reg = regmap shape no in
+ let rec type_for_reg_elt reg elt =
+ match reg with
+ Dreg ->
+ begin match elt with
+ S8 -> T_int8x8
+ | S16 -> T_int16x4
+ | S32 -> T_int32x2
+ | S64 -> T_int64x1
+ | U8 -> T_uint8x8
+ | U16 -> T_uint16x4
+ | U32 -> T_uint32x2
+ | U64 -> T_uint64x1
+ | F32 -> T_float32x2
+ | P8 -> T_poly8x8
+ | P16 -> T_poly16x4
+ | _ -> failwith "Bad elt type"
+ end
+ | Qreg ->
+ begin match elt with
+ S8 -> T_int8x16
+ | S16 -> T_int16x8
+ | S32 -> T_int32x4
+ | S64 -> T_int64x2
+ | U8 -> T_uint8x16
+ | U16 -> T_uint16x8
+ | U32 -> T_uint32x4
+ | U64 -> T_uint64x2
+ | F32 -> T_float32x4
+ | P8 -> T_poly8x16
+ | P16 -> T_poly16x8
+ | _ -> failwith "Bad elt type"
+ end
+ | Corereg ->
+ begin match elt with
+ S8 -> T_int8
+ | S16 -> T_int16
+ | S32 -> T_int32
+ | S64 -> T_int64
+ | U8 -> T_uint8
+ | U16 -> T_uint16
+ | U32 -> T_uint32
+ | U64 -> T_uint64
+ | P8 -> T_poly8
+ | P16 -> T_poly16
+ | F32 -> T_float32
+ | _ -> failwith "Bad elt type"
+ end
+ | Immed ->
+ T_immediate (0, 0)
+ | VecArray (num, sub) ->
+ T_arrayof (num, type_for_reg_elt sub elt)
+ | PtrTo x ->
+ T_ptrto (type_for_reg_elt x elt)
+ | CstPtrTo x ->
+ T_ptrto (T_const (type_for_reg_elt x elt))
+ (* Anything else is solely for the use of the test generator. *)
+ | _ -> assert false
+ in
+ type_for_reg_elt reg elt
+
+(* Return size of a vector type, in bits. *)
+let vectype_size = function
+ T_int8x8 | T_int16x4 | T_int32x2 | T_int64x1
+ | T_uint8x8 | T_uint16x4 | T_uint32x2 | T_uint64x1
+ | T_float32x2 | T_poly8x8 | T_poly16x4 -> 64
+ | T_int8x16 | T_int16x8 | T_int32x4 | T_int64x2
+ | T_uint8x16 | T_uint16x8 | T_uint32x4 | T_uint64x2
+ | T_float32x4 | T_poly8x16 | T_poly16x8 -> 128
+ | _ -> raise Not_found
+
+let inttype_for_array num elttype =
+ let eltsize = vectype_size elttype in
+ let numwords = (num * eltsize) / 32 in
+ match numwords with
+ 4 -> B_TImode
+ | 6 -> B_EImode
+ | 8 -> B_OImode
+ | 12 -> B_CImode
+ | 16 -> B_XImode
+ | _ -> failwith ("no int type for size " ^ string_of_int numwords)
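For illustration (not part of the patch), the word counts work out as follows when
evaluated with the definitions above in scope:

let _ =
  (* Two 64-bit vectors occupy 4 words (TImode); four 128-bit vectors occupy
     16 words (XImode). *)
  assert (inttype_for_array 2 T_int8x8 = B_TImode);
  assert (inttype_for_array 4 T_int8x16 = B_XImode)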
+
+(* These functions return pairs of (internal, external) types, where "internal"
+ types are those seen by GCC, and "external" are those seen by the assembler.
+ These types aren't necessarily the same, since the intrinsics can munge more
+ than one C type into each assembler opcode. *)
+
+let make_sign_invariant func shape elt =
+ let arity, elt' = func shape elt in
+ arity, non_signed_variant elt'
+
+(* Don't restrict any types. *)
+
+let elts_same make_arity shape elt =
+ let vtype = type_for_elt shape elt in
+ make_arity vtype, elt
+
+(* As sign_invar_*, but when sign matters. *)
+let elts_same_io_lane =
+ elts_same (fun vtype -> Arity4 (vtype 0, vtype 0, vtype 1, vtype 2, vtype 3))
+
+let elts_same_io =
+ elts_same (fun vtype -> Arity3 (vtype 0, vtype 0, vtype 1, vtype 2))
+
+let elts_same_2_lane =
+ elts_same (fun vtype -> Arity3 (vtype 0, vtype 1, vtype 2, vtype 3))
+
+let elts_same_3 = elts_same_2_lane
+
+let elts_same_2 =
+ elts_same (fun vtype -> Arity2 (vtype 0, vtype 1, vtype 2))
+
+let elts_same_1 =
+ elts_same (fun vtype -> Arity1 (vtype 0, vtype 1))
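As a sketch of what these produce (illustration only, assuming the definitions
above are in scope):

let _ =
  (* A plain three-operand D-register op keeps one type throughout:
     int8x8_t for S8 elements, and the element type is returned unchanged. *)
  assert (elts_same_2 (All (3, Dreg)) S8
            = (Arity2 (T_int8x8, T_int8x8, T_int8x8), S8))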
+
+(* Use for signed/unsigned invariant operations (i.e. where the operation
+ doesn't depend on the sign of the data). *)
+
+let sign_invar_io_lane = make_sign_invariant elts_same_io_lane
+let sign_invar_io = make_sign_invariant elts_same_io
+let sign_invar_2_lane = make_sign_invariant elts_same_2_lane
+let sign_invar_2 = make_sign_invariant elts_same_2
+let sign_invar_1 = make_sign_invariant elts_same_1
+
+(* Sign-sensitive comparison. *)
+
+let cmp_sign_matters shape elt =
+ let vtype = type_for_elt shape elt
+ and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in
+ Arity2 (rtype, vtype 1, vtype 2), elt
+
+(* Signed/unsigned invariant comparison. *)
+
+let cmp_sign_invar shape elt =
+ let shape', elt' = cmp_sign_matters shape elt in
+ let elt'' =
+ match non_signed_variant elt' with
+ P8 -> I8
+ | x -> x
+ in
+ shape', elt''
+
+(* Comparison (VTST) where only the element width matters. *)
+
+let cmp_bits shape elt =
+ let vtype = type_for_elt shape elt
+ and rtype = type_for_elt shape (unsigned_of_elt elt) 0
+ and bits_only = bits_of_elt elt in
+ Arity2 (rtype, vtype 1, vtype 2), bits_only
+
+let reg_shift shape elt =
+ let vtype = type_for_elt shape elt
+ and op2type = type_for_elt shape (signed_of_elt elt) 2 in
+ Arity2 (vtype 0, vtype 1, op2type), elt
+
+(* Genericised constant-shift type-generating function. *)
+
+let const_shift mkimm ?arity ?result shape elt =
+ let op2type = (shapemap shape 2) elt in
+ let op2width = elt_width op2type in
+ let op2 = mkimm op2width
+ and op1 = type_for_elt shape elt 1
+ and r_elt =
+ match result with
+ None -> elt
+ | Some restriction -> restriction elt in
+ let rtype = type_for_elt shape r_elt 0 in
+ match arity with
+ None -> Arity2 (rtype, op1, op2), elt
+ | Some mkarity -> mkarity rtype op1 op2, elt
+
+(* Use for immediate right-shifts. *)
+
+let shift_right shape elt =
+ const_shift (fun imm -> T_immediate (1, imm)) shape elt
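A hedged sketch of the result, reading T_immediate (lo, hi) as the inclusive range
the two shift constructors build, and assuming elt_width S8 is 8 (illustration
only, not part of the patch):

let _ =
  (* For 8-bit elements the right-shift count is constrained to 1..8, and the
     source and destination share the operand type. *)
  assert (shift_right (Binary_imm Dreg) S8
            = (Arity2 (T_int8x8, T_int8x8, T_immediate (1, 8)), S8))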
+
+let shift_right_acc shape elt =
+ const_shift (fun imm -> T_immediate (1, imm))
+ ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt
+
+(* Use for immediate right-shifts when the operation doesn't care about
+ signedness. *)
+
+let shift_right_sign_invar =
+ make_sign_invariant shift_right
+
+(* Immediate right-shift; result is unsigned even when operand is signed. *)
+
+let shift_right_to_uns shape elt =
+ const_shift (fun imm -> T_immediate (1, imm)) ~result:unsigned_of_elt
+ shape elt
+
+(* Immediate left-shift. *)
+
+let shift_left shape elt =
+ const_shift (fun imm -> T_immediate (0, imm - 1)) shape elt
+
+(* Immediate left-shift, unsigned result. *)
+
+let shift_left_to_uns shape elt =
+ const_shift (fun imm -> T_immediate (0, imm - 1)) ~result:unsigned_of_elt
+ shape elt
+
+(* Immediate left-shift, don't care about signs. *)
+
+let shift_left_sign_invar =
+ make_sign_invariant shift_left
+
+(* Shift left/right and insert: only element size matters. *)
+
+let shift_insert shape elt =
+ let arity, elt =
+ const_shift (fun imm -> T_immediate (1, imm))
+ ~arity:(fun dst op1 op2 -> Arity3 (dst, dst, op1, op2)) shape elt in
+ arity, bits_of_elt elt
+
+(* Get/set lane. *)
+
+let get_lane shape elt =
+ let vtype = type_for_elt shape elt in
+ Arity2 (vtype 0, vtype 1, vtype 2),
+ (match elt with P8 -> U8 | P16 -> U16 | x -> x)
+
+let set_lane shape elt =
+ let vtype = type_for_elt shape elt in
+ Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt
+
+let set_lane_notype shape elt =
+ let vtype = type_for_elt shape elt in
+ Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), NoElts
+
+let create_vector shape elt =
+ let vtype = type_for_elt shape U64 1
+ and rtype = type_for_elt shape elt 0 in
+ Arity1 (rtype, vtype), elt
+
+let conv make_arity shape elt =
+ let edest, esrc = match elt with
+ Conv (edest, esrc) | Cast (edest, esrc) -> edest, esrc
+ | _ -> failwith "Non-conversion element in conversion" in
+ let vtype = type_for_elt shape esrc
+ and rtype = type_for_elt shape edest 0 in
+ make_arity rtype vtype, elt
+
+let conv_1 = conv (fun rtype vtype -> Arity1 (rtype, vtype 1))
+let conv_2 = conv (fun rtype vtype -> Arity2 (rtype, vtype 1, vtype 2))
+
+(* Operation has an unsigned result even if operands are signed. *)
+
+let dst_unsign make_arity shape elt =
+ let vtype = type_for_elt shape elt
+ and rtype = type_for_elt shape (unsigned_of_elt elt) 0 in
+ make_arity rtype vtype, elt
+
+let dst_unsign_1 = dst_unsign (fun rtype vtype -> Arity1 (rtype, vtype 1))
+
+let make_bits_only func shape elt =
+ let arity, elt' = func shape elt in
+ arity, bits_of_elt elt'
+
+(* Extend operation. *)
+
+let extend shape elt =
+ let vtype = type_for_elt shape elt in
+ Arity3 (vtype 0, vtype 1, vtype 2, vtype 3), bits_of_elt elt
+
+(* Table look-up operations. Operand 2 is signed/unsigned for signed/unsigned
+ integer ops respectively, or unsigned for polynomial ops. *)
+
+let table mkarity shape elt =
+ let vtype = type_for_elt shape elt in
+ let op2 = type_for_elt shape (poly_unsigned_variant elt) 2 in
+ mkarity vtype op2, bits_of_elt elt
+
+let table_2 = table (fun vtype op2 -> Arity2 (vtype 0, vtype 1, op2))
+let table_io = table (fun vtype op2 -> Arity3 (vtype 0, vtype 0, vtype 1, op2))
+
+(* Operations where only bits matter. *)
+
+let bits_1 = make_bits_only elts_same_1
+let bits_2 = make_bits_only elts_same_2
+let bits_3 = make_bits_only elts_same_3
+
+(* Store insns. *)
+let store_1 shape elt =
+ let vtype = type_for_elt shape elt in
+ Arity2 (T_void, vtype 0, vtype 1), bits_of_elt elt
+
+let store_3 shape elt =
+ let vtype = type_for_elt shape elt in
+ Arity3 (T_void, vtype 0, vtype 1, vtype 2), bits_of_elt elt
+
+let make_notype func shape elt =
+ let arity, _ = func shape elt in
+ arity, NoElts
+
+let notype_1 = make_notype elts_same_1
+let notype_2 = make_notype elts_same_2
+let notype_3 = make_notype elts_same_3
+
+(* Bit-select operations (first operand is unsigned int). *)
+
+let bit_select shape elt =
+ let vtype = type_for_elt shape elt
+ and itype = type_for_elt shape (unsigned_of_elt elt) in
+ Arity3 (vtype 0, itype 1, vtype 2, vtype 3), NoElts
+
+(* Common lists of supported element types. *)
+
+let su_8_32 = [S8; S16; S32; U8; U16; U32]
+let su_8_64 = S64 :: U64 :: su_8_32
+let su_16_64 = [S16; S32; S64; U16; U32; U64]
+let pf_su_8_32 = P8 :: P16 :: F32 :: su_8_32
+let pf_su_8_64 = P8 :: P16 :: F32 :: su_8_64
+
+let ops =
+ [
+ (* Addition. *)
+ Vadd, [], All (3, Dreg), "vadd", sign_invar_2, F32 :: su_8_64;
+ Vadd, [], All (3, Qreg), "vaddQ", sign_invar_2, F32 :: su_8_64;
+ Vadd, [], Long, "vaddl", elts_same_2, su_8_32;
+ Vadd, [], Wide, "vaddw", elts_same_2, su_8_32;
+ Vadd, [Halving], All (3, Dreg), "vhadd", elts_same_2, su_8_32;
+ Vadd, [Halving], All (3, Qreg), "vhaddQ", elts_same_2, su_8_32;
+ Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving],
+ All (3, Dreg), "vRhadd", elts_same_2, su_8_32;
+ Vadd, [Instruction_name ["vrhadd"]; Rounding; Halving],
+ All (3, Qreg), "vRhaddQ", elts_same_2, su_8_32;
+ Vadd, [Saturating], All (3, Dreg), "vqadd", elts_same_2, su_8_64;
+ Vadd, [Saturating], All (3, Qreg), "vqaddQ", elts_same_2, su_8_64;
+ Vadd, [High_half], Narrow, "vaddhn", sign_invar_2, su_16_64;
+ Vadd, [Instruction_name ["vraddhn"]; Rounding; High_half],
+ Narrow, "vRaddhn", sign_invar_2, su_16_64;
+
+ (* Multiplication. *)
+ Vmul, [], All (3, Dreg), "vmul", sign_invar_2, P8 :: F32 :: su_8_32;
+ Vmul, [], All (3, Qreg), "vmulQ", sign_invar_2, P8 :: F32 :: su_8_32;
+ Vmul, [Saturating; Doubling; High_half], All (3, Dreg), "vqdmulh",
+ elts_same_2, [S16; S32];
+ Vmul, [Saturating; Doubling; High_half], All (3, Qreg), "vqdmulhQ",
+ elts_same_2, [S16; S32];
+ Vmul,
+ [Saturating; Rounding; Doubling; High_half;
+ Instruction_name ["vqrdmulh"]],
+ All (3, Dreg), "vqRdmulh",
+ elts_same_2, [S16; S32];
+ Vmul,
+ [Saturating; Rounding; Doubling; High_half;
+ Instruction_name ["vqrdmulh"]],
+ All (3, Qreg), "vqRdmulhQ",
+ elts_same_2, [S16; S32];
+ Vmul, [], Long, "vmull", elts_same_2, P8 :: su_8_32;
+ Vmul, [Saturating; Doubling], Long, "vqdmull", elts_same_2, [S16; S32];
+
+ (* Multiply-accumulate. *)
+ Vmla, [], All (3, Dreg), "vmla", sign_invar_io, F32 :: su_8_32;
+ Vmla, [], All (3, Qreg), "vmlaQ", sign_invar_io, F32 :: su_8_32;
+ Vmla, [], Long, "vmlal", elts_same_io, su_8_32;
+ Vmla, [Saturating; Doubling], Long, "vqdmlal", elts_same_io, [S16; S32];
+
+ (* Multiply-subtract. *)
+ Vmls, [], All (3, Dreg), "vmls", sign_invar_io, F32 :: su_8_32;
+ Vmls, [], All (3, Qreg), "vmlsQ", sign_invar_io, F32 :: su_8_32;
+ Vmls, [], Long, "vmlsl", elts_same_io, su_8_32;
+ Vmls, [Saturating; Doubling], Long, "vqdmlsl", elts_same_io, [S16; S32];
+
+ (* Subtraction. *)
+ Vsub, [], All (3, Dreg), "vsub", sign_invar_2, F32 :: su_8_64;
+ Vsub, [], All (3, Qreg), "vsubQ", sign_invar_2, F32 :: su_8_64;
+ Vsub, [], Long, "vsubl", elts_same_2, su_8_32;
+ Vsub, [], Wide, "vsubw", elts_same_2, su_8_32;
+ Vsub, [Halving], All (3, Dreg), "vhsub", elts_same_2, su_8_32;
+ Vsub, [Halving], All (3, Qreg), "vhsubQ", elts_same_2, su_8_32;
+ Vsub, [Saturating], All (3, Dreg), "vqsub", elts_same_2, su_8_64;
+ Vsub, [Saturating], All (3, Qreg), "vqsubQ", elts_same_2, su_8_64;
+ Vsub, [High_half], Narrow, "vsubhn", sign_invar_2, su_16_64;
+ Vsub, [Instruction_name ["vrsubhn"]; Rounding; High_half],
+ Narrow, "vRsubhn", sign_invar_2, su_16_64;
+
+ (* Comparison, equal. *)
+ Vceq, [], All (3, Dreg), "vceq", cmp_sign_invar, P8 :: F32 :: su_8_32;
+ Vceq, [], All (3, Qreg), "vceqQ", cmp_sign_invar, P8 :: F32 :: su_8_32;
+
+ (* Comparison, greater-than or equal. *)
+ Vcge, [], All (3, Dreg), "vcge", cmp_sign_matters, F32 :: su_8_32;
+ Vcge, [], All (3, Qreg), "vcgeQ", cmp_sign_matters, F32 :: su_8_32;
+
+ (* Comparison, less-than or equal. *)
+ Vcle, [Flipped "vcge"], All (3, Dreg), "vcle", cmp_sign_matters,
+ F32 :: su_8_32;
+ Vcle, [Instruction_name ["vcge"]; Flipped "vcgeQ"],
+ All (3, Qreg), "vcleQ", cmp_sign_matters,
+ F32 :: su_8_32;
+
+ (* Comparison, greater-than. *)
+ Vcgt, [], All (3, Dreg), "vcgt", cmp_sign_matters, F32 :: su_8_32;
+ Vcgt, [], All (3, Qreg), "vcgtQ", cmp_sign_matters, F32 :: su_8_32;
+
+ (* Comparison, less-than. *)
+ Vclt, [Flipped "vcgt"], All (3, Dreg), "vclt", cmp_sign_matters,
+ F32 :: su_8_32;
+ Vclt, [Instruction_name ["vcgt"]; Flipped "vcgtQ"],
+ All (3, Qreg), "vcltQ", cmp_sign_matters,
+ F32 :: su_8_32;
+
+ (* Compare absolute greater-than or equal. *)
+ Vcage, [Instruction_name ["vacge"]],
+ All (3, Dreg), "vcage", cmp_sign_matters, [F32];
+ Vcage, [Instruction_name ["vacge"]],
+ All (3, Qreg), "vcageQ", cmp_sign_matters, [F32];
+
+ (* Compare absolute less-than or equal. *)
+ Vcale, [Instruction_name ["vacge"]; Flipped "vcage"],
+ All (3, Dreg), "vcale", cmp_sign_matters, [F32];
+ Vcale, [Instruction_name ["vacge"]; Flipped "vcageQ"],
+ All (3, Qreg), "vcaleQ", cmp_sign_matters, [F32];
+
+ (* Compare absolute greater-than. *)
+ Vcagt, [Instruction_name ["vacgt"]],
+ All (3, Dreg), "vcagt", cmp_sign_matters, [F32];
+ Vcagt, [Instruction_name ["vacgt"]],
+ All (3, Qreg), "vcagtQ", cmp_sign_matters, [F32];
+
+ (* Compare absolute less-than. *)
+ Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagt"],
+ All (3, Dreg), "vcalt", cmp_sign_matters, [F32];
+ Vcalt, [Instruction_name ["vacgt"]; Flipped "vcagtQ"],
+ All (3, Qreg), "vcaltQ", cmp_sign_matters, [F32];
+
+ (* Test bits. *)
+ Vtst, [], All (3, Dreg), "vtst", cmp_bits, P8 :: su_8_32;
+ Vtst, [], All (3, Qreg), "vtstQ", cmp_bits, P8 :: su_8_32;
+
+ (* Absolute difference. *)
+ Vabd, [], All (3, Dreg), "vabd", elts_same_2, F32 :: su_8_32;
+ Vabd, [], All (3, Qreg), "vabdQ", elts_same_2, F32 :: su_8_32;
+ Vabd, [], Long, "vabdl", elts_same_2, su_8_32;
+
+ (* Absolute difference and accumulate. *)
+ Vaba, [], All (3, Dreg), "vaba", elts_same_io, su_8_32;
+ Vaba, [], All (3, Qreg), "vabaQ", elts_same_io, su_8_32;
+ Vaba, [], Long, "vabal", elts_same_io, su_8_32;
+
+ (* Max. *)
+ Vmax, [], All (3, Dreg), "vmax", elts_same_2, F32 :: su_8_32;
+ Vmax, [], All (3, Qreg), "vmaxQ", elts_same_2, F32 :: su_8_32;
+
+ (* Min. *)
+ Vmin, [], All (3, Dreg), "vmin", elts_same_2, F32 :: su_8_32;
+ Vmin, [], All (3, Qreg), "vminQ", elts_same_2, F32 :: su_8_32;
+
+ (* Pairwise add. *)
+ Vpadd, [], All (3, Dreg), "vpadd", sign_invar_2, F32 :: su_8_32;
+ Vpadd, [], Long_noreg Dreg, "vpaddl", elts_same_1, su_8_32;
+ Vpadd, [], Long_noreg Qreg, "vpaddlQ", elts_same_1, su_8_32;
+
+ (* Pairwise add, widen and accumulate. *)
+ Vpada, [], Wide_noreg Dreg, "vpadal", elts_same_2, su_8_32;
+ Vpada, [], Wide_noreg Qreg, "vpadalQ", elts_same_2, su_8_32;
+
+ (* Folding maximum, minimum. *)
+ Vpmax, [], All (3, Dreg), "vpmax", elts_same_2, F32 :: su_8_32;
+ Vpmin, [], All (3, Dreg), "vpmin", elts_same_2, F32 :: su_8_32;
+
+ (* Reciprocal step. *)
+ Vrecps, [], All (3, Dreg), "vrecps", elts_same_2, [F32];
+ Vrecps, [], All (3, Qreg), "vrecpsQ", elts_same_2, [F32];
+ Vrsqrts, [], All (3, Dreg), "vrsqrts", elts_same_2, [F32];
+ Vrsqrts, [], All (3, Qreg), "vrsqrtsQ", elts_same_2, [F32];
+
+ (* Vector shift left. *)
+ Vshl, [], All (3, Dreg), "vshl", reg_shift, su_8_64;
+ Vshl, [], All (3, Qreg), "vshlQ", reg_shift, su_8_64;
+ Vshl, [Instruction_name ["vrshl"]; Rounding],
+ All (3, Dreg), "vRshl", reg_shift, su_8_64;
+ Vshl, [Instruction_name ["vrshl"]; Rounding],
+ All (3, Qreg), "vRshlQ", reg_shift, su_8_64;
+ Vshl, [Saturating], All (3, Dreg), "vqshl", reg_shift, su_8_64;
+ Vshl, [Saturating], All (3, Qreg), "vqshlQ", reg_shift, su_8_64;
+ Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding],
+ All (3, Dreg), "vqRshl", reg_shift, su_8_64;
+ Vshl, [Instruction_name ["vqrshl"]; Saturating; Rounding],
+ All (3, Qreg), "vqRshlQ", reg_shift, su_8_64;
+
+ (* Vector shift right by constant. *)
+ Vshr_n, [], Binary_imm Dreg, "vshr_n", shift_right, su_8_64;
+ Vshr_n, [], Binary_imm Qreg, "vshrQ_n", shift_right, su_8_64;
+ Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Dreg,
+ "vRshr_n", shift_right, su_8_64;
+ Vshr_n, [Instruction_name ["vrshr"]; Rounding], Binary_imm Qreg,
+ "vRshrQ_n", shift_right, su_8_64;
+ Vshr_n, [], Narrow_imm, "vshrn_n", shift_right_sign_invar, su_16_64;
+ Vshr_n, [Instruction_name ["vrshrn"]; Rounding], Narrow_imm, "vRshrn_n",
+ shift_right_sign_invar, su_16_64;
+ Vshr_n, [Saturating], Narrow_imm, "vqshrn_n", shift_right, su_16_64;
+ Vshr_n, [Instruction_name ["vqrshrn"]; Saturating; Rounding], Narrow_imm,
+ "vqRshrn_n", shift_right, su_16_64;
+ Vshr_n, [Saturating; Dst_unsign], Narrow_imm, "vqshrun_n",
+ shift_right_to_uns, [S16; S32; S64];
+ Vshr_n, [Instruction_name ["vqrshrun"]; Saturating; Dst_unsign; Rounding],
+ Narrow_imm, "vqRshrun_n", shift_right_to_uns, [S16; S32; S64];
+
+ (* Vector shift left by constant. *)
+ Vshl_n, [], Binary_imm Dreg, "vshl_n", shift_left_sign_invar, su_8_64;
+ Vshl_n, [], Binary_imm Qreg, "vshlQ_n", shift_left_sign_invar, su_8_64;
+ Vshl_n, [Saturating], Binary_imm Dreg, "vqshl_n", shift_left, su_8_64;
+ Vshl_n, [Saturating], Binary_imm Qreg, "vqshlQ_n", shift_left, su_8_64;
+ Vshl_n, [Saturating; Dst_unsign], Binary_imm Dreg, "vqshlu_n",
+ shift_left_to_uns, [S8; S16; S32; S64];
+ Vshl_n, [Saturating; Dst_unsign], Binary_imm Qreg, "vqshluQ_n",
+ shift_left_to_uns, [S8; S16; S32; S64];
+ Vshl_n, [], Long_imm, "vshll_n", shift_left, su_8_32;
+
+ (* Vector shift right by constant and accumulate. *)
+ Vsra_n, [], Binary_imm Dreg, "vsra_n", shift_right_acc, su_8_64;
+ Vsra_n, [], Binary_imm Qreg, "vsraQ_n", shift_right_acc, su_8_64;
+ Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Dreg,
+ "vRsra_n", shift_right_acc, su_8_64;
+ Vsra_n, [Instruction_name ["vrsra"]; Rounding], Binary_imm Qreg,
+ "vRsraQ_n", shift_right_acc, su_8_64;
+
+ (* Vector shift right and insert. *)
+ Vsri, [], Use_operands [| Dreg; Dreg; Immed |], "vsri_n", shift_insert,
+ P8 :: P16 :: su_8_64;
+ Vsri, [], Use_operands [| Qreg; Qreg; Immed |], "vsriQ_n", shift_insert,
+ P8 :: P16 :: su_8_64;
+
+ (* Vector shift left and insert. *)
+ Vsli, [], Use_operands [| Dreg; Dreg; Immed |], "vsli_n", shift_insert,
+ P8 :: P16 :: su_8_64;
+ Vsli, [], Use_operands [| Qreg; Qreg; Immed |], "vsliQ_n", shift_insert,
+ P8 :: P16 :: su_8_64;
+
+ (* Absolute value. *)
+ Vabs, [], All (2, Dreg), "vabs", elts_same_1, [S8; S16; S32; F32];
+ Vabs, [], All (2, Qreg), "vabsQ", elts_same_1, [S8; S16; S32; F32];
+ Vabs, [Saturating], All (2, Dreg), "vqabs", elts_same_1, [S8; S16; S32];
+ Vabs, [Saturating], All (2, Qreg), "vqabsQ", elts_same_1, [S8; S16; S32];
+
+ (* Negate. *)
+ Vneg, [], All (2, Dreg), "vneg", elts_same_1, [S8; S16; S32; F32];
+ Vneg, [], All (2, Qreg), "vnegQ", elts_same_1, [S8; S16; S32; F32];
+ Vneg, [Saturating], All (2, Dreg), "vqneg", elts_same_1, [S8; S16; S32];
+ Vneg, [Saturating], All (2, Qreg), "vqnegQ", elts_same_1, [S8; S16; S32];
+
+ (* Bitwise not. *)
+ Vmvn, [], All (2, Dreg), "vmvn", notype_1, P8 :: su_8_32;
+ Vmvn, [], All (2, Qreg), "vmvnQ", notype_1, P8 :: su_8_32;
+
+ (* Count leading sign bits. *)
+ Vcls, [], All (2, Dreg), "vcls", elts_same_1, [S8; S16; S32];
+ Vcls, [], All (2, Qreg), "vclsQ", elts_same_1, [S8; S16; S32];
+
+ (* Count leading zeros. *)
+ Vclz, [], All (2, Dreg), "vclz", sign_invar_1, su_8_32;
+ Vclz, [], All (2, Qreg), "vclzQ", sign_invar_1, su_8_32;
+
+ (* Count number of set bits. *)
+ Vcnt, [], All (2, Dreg), "vcnt", bits_1, [P8; S8; U8];
+ Vcnt, [], All (2, Qreg), "vcntQ", bits_1, [P8; S8; U8];
+
+ (* Reciprocal estimate. *)
+ Vrecpe, [], All (2, Dreg), "vrecpe", elts_same_1, [U32; F32];
+ Vrecpe, [], All (2, Qreg), "vrecpeQ", elts_same_1, [U32; F32];
+
+ (* Reciprocal square-root estimate. *)
+ Vrsqrte, [], All (2, Dreg), "vrsqrte", elts_same_1, [U32; F32];
+ Vrsqrte, [], All (2, Qreg), "vrsqrteQ", elts_same_1, [U32; F32];
+
+ (* Get lanes from a vector. *)
+ Vget_lane,
+ [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]];
+ Instruction_name ["vmov"]],
+ Use_operands [| Corereg; Dreg; Immed |],
+ "vget_lane", get_lane, pf_su_8_32;
+ Vget_lane,
+ [InfoWord;
+ Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]];
+ Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
+ Use_operands [| Corereg; Dreg; Immed |],
+ "vget_lane", notype_2, [S64; U64];
+ Vget_lane,
+ [InfoWord; Disassembles_as [Use_operands [| Corereg; Element_of_dreg |]];
+ Instruction_name ["vmov"]],
+ Use_operands [| Corereg; Qreg; Immed |],
+ "vgetQ_lane", get_lane, pf_su_8_32;
+ Vget_lane,
+ [InfoWord;
+ Disassembles_as [Use_operands [| Corereg; Corereg; Dreg |]];
+ Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
+ Use_operands [| Corereg; Qreg; Immed |],
+ "vgetQ_lane", notype_2, [S64; U64];
+
+ (* Set lanes in a vector. *)
+ Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]];
+ Instruction_name ["vmov"]],
+ Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane",
+ set_lane, pf_su_8_32;
+ Vset_lane, [Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]];
+ Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
+ Use_operands [| Dreg; Corereg; Dreg; Immed |], "vset_lane",
+ set_lane_notype, [S64; U64];
+ Vset_lane, [Disassembles_as [Use_operands [| Element_of_dreg; Corereg |]];
+ Instruction_name ["vmov"]],
+ Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane",
+ set_lane, pf_su_8_32;
+ Vset_lane, [Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]];
+ Instruction_name ["vmov"]; Const_valuator (fun _ -> 0)],
+ Use_operands [| Qreg; Corereg; Qreg; Immed |], "vsetQ_lane",
+ set_lane_notype, [S64; U64];
+
+ (* Create vector from literal bit pattern. *)
+ Vcreate,
+ [No_op], (* Not really, but it can yield various things that are too
+ hard for the test generator at this time. *)
+ Use_operands [| Dreg; Corereg |], "vcreate", create_vector,
+ pf_su_8_64;
+
+ (* Set all lanes to the same value. *)
+ Vdup_n, [],
+ Use_operands [| Dreg; Corereg |], "vdup_n", bits_1,
+ pf_su_8_32;
+ Vdup_n,
+ [Instruction_name ["vmov"];
+ Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
+ Use_operands [| Dreg; Corereg |], "vdup_n", notype_1,
+ [S64; U64];
+ Vdup_n, [],
+ Use_operands [| Qreg; Corereg |], "vdupQ_n", bits_1,
+ pf_su_8_32;
+ Vdup_n,
+ [Instruction_name ["vmov"];
+ Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |];
+ Use_operands [| Dreg; Corereg; Corereg |]]],
+ Use_operands [| Qreg; Corereg |], "vdupQ_n", notype_1,
+ [S64; U64];
+
+ (* These are just aliases for the above. *)
+ Vmov_n,
+ [Builtin_name "vdup_n"],
+ Use_operands [| Dreg; Corereg |],
+ "vmov_n", bits_1, pf_su_8_32;
+ Vmov_n,
+ [Builtin_name "vdup_n";
+ Instruction_name ["vmov"];
+ Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |]]],
+ Use_operands [| Dreg; Corereg |],
+ "vmov_n", notype_1, [S64; U64];
+ Vmov_n,
+ [Builtin_name "vdupQ_n"],
+ Use_operands [| Qreg; Corereg |],
+ "vmovQ_n", bits_1, pf_su_8_32;
+ Vmov_n,
+ [Builtin_name "vdupQ_n";
+ Instruction_name ["vmov"];
+ Disassembles_as [Use_operands [| Dreg; Corereg; Corereg |];
+ Use_operands [| Dreg; Corereg; Corereg |]]],
+ Use_operands [| Qreg; Corereg |],
+ "vmovQ_n", notype_1, [S64; U64];
+
+ (* Duplicate, lane version. We can't use Use_operands here because the
+ rightmost register (always Dreg) would be picked up by find_key_operand,
+ whereas we want the leftmost register to be used in this case (otherwise
+ the modes would be indistinguishable in neon.md, etc.). *)
+ Vdup_lane,
+ [Disassembles_as [Use_operands [| Dreg; Element_of_dreg |]]],
+ Unary_scalar Dreg, "vdup_lane", bits_2, pf_su_8_32;
+ Vdup_lane,
+ [No_op; Const_valuator (fun _ -> 0)],
+ Unary_scalar Dreg, "vdup_lane", bits_2, [S64; U64];
+ Vdup_lane,
+ [Disassembles_as [Use_operands [| Qreg; Element_of_dreg |]]],
+ Unary_scalar Qreg, "vdupQ_lane", bits_2, pf_su_8_32;
+ Vdup_lane,
+ [No_op; Const_valuator (fun _ -> 0)],
+ Unary_scalar Qreg, "vdupQ_lane", bits_2, [S64; U64];
+
+ (* Combining vectors. *)
+ Vcombine, [No_op],
+ Use_operands [| Qreg; Dreg; Dreg |], "vcombine", notype_2,
+ pf_su_8_64;
+
+ (* Splitting vectors. *)
+ Vget_high, [No_op],
+ Use_operands [| Dreg; Qreg |], "vget_high",
+ notype_1, pf_su_8_64;
+ Vget_low, [Instruction_name ["vmov"];
+ Disassembles_as [Use_operands [| Dreg; Dreg |]]],
+ Use_operands [| Dreg; Qreg |], "vget_low",
+ notype_1, pf_su_8_64;
+
+ (* Conversions. *)
+ Vcvt, [InfoWord], All (2, Dreg), "vcvt", conv_1,
+ [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
+ Vcvt, [InfoWord], All (2, Qreg), "vcvtQ", conv_1,
+ [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
+ Vcvt_n, [InfoWord], Use_operands [| Dreg; Dreg; Immed |], "vcvt_n", conv_2,
+ [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
+ Vcvt_n, [InfoWord], Use_operands [| Qreg; Qreg; Immed |], "vcvtQ_n", conv_2,
+ [Conv (S32, F32); Conv (U32, F32); Conv (F32, S32); Conv (F32, U32)];
+
+ (* Move, narrowing. *)
+ Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]],
+ Narrow, "vmovn", sign_invar_1, su_16_64;
+ Vmovn, [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating],
+ Narrow, "vqmovn", elts_same_1, su_16_64;
+ Vmovn,
+ [Disassembles_as [Use_operands [| Dreg; Qreg |]]; Saturating; Dst_unsign],
+ Narrow, "vqmovun", dst_unsign_1,
+ [S16; S32; S64];
+
+ (* Move, long. *)
+ Vmovl, [Disassembles_as [Use_operands [| Qreg; Dreg |]]],
+ Long, "vmovl", elts_same_1, su_8_32;
+
+ (* Table lookup. *)
+ Vtbl 1,
+ [Instruction_name ["vtbl"];
+ Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]],
+ Use_operands [| Dreg; Dreg; Dreg |], "vtbl1", table_2, [U8; S8; P8];
+ Vtbl 2, [Instruction_name ["vtbl"]],
+ Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbl2", table_2,
+ [U8; S8; P8];
+ Vtbl 3, [Instruction_name ["vtbl"]],
+ Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbl3", table_2,
+ [U8; S8; P8];
+ Vtbl 4, [Instruction_name ["vtbl"]],
+ Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbl4", table_2,
+ [U8; S8; P8];
+
+ (* Extended table lookup. *)
+ Vtbx 1,
+ [Instruction_name ["vtbx"];
+ Disassembles_as [Use_operands [| Dreg; VecArray (1, Dreg); Dreg |]]],
+ Use_operands [| Dreg; Dreg; Dreg |], "vtbx1", table_io, [U8; S8; P8];
+ Vtbx 2, [Instruction_name ["vtbx"]],
+ Use_operands [| Dreg; VecArray (2, Dreg); Dreg |], "vtbx2", table_io,
+ [U8; S8; P8];
+ Vtbx 3, [Instruction_name ["vtbx"]],
+ Use_operands [| Dreg; VecArray (3, Dreg); Dreg |], "vtbx3", table_io,
+ [U8; S8; P8];
+ Vtbx 4, [Instruction_name ["vtbx"]],
+ Use_operands [| Dreg; VecArray (4, Dreg); Dreg |], "vtbx4", table_io,
+ [U8; S8; P8];
+
+ (* Multiply, lane. (note: these were undocumented at the time of
+ writing). *)
+ Vmul_lane, [], By_scalar Dreg, "vmul_lane", sign_invar_2_lane,
+ [S16; S32; U16; U32; F32];
+ Vmul_lane, [], By_scalar Qreg, "vmulQ_lane", sign_invar_2_lane,
+ [S16; S32; U16; U32; F32];
+
+ (* Multiply-accumulate, lane. *)
+ Vmla_lane, [], By_scalar Dreg, "vmla_lane", sign_invar_io_lane,
+ [S16; S32; U16; U32; F32];
+ Vmla_lane, [], By_scalar Qreg, "vmlaQ_lane", sign_invar_io_lane,
+ [S16; S32; U16; U32; F32];
+ Vmla_lane, [], Wide_lane, "vmlal_lane", elts_same_io_lane,
+ [S16; S32; U16; U32];
+ Vmla_lane, [Saturating; Doubling], Wide_lane, "vqdmlal_lane",
+ elts_same_io_lane, [S16; S32];
+
+ (* Multiply-subtract, lane. *)
+ Vmls_lane, [], By_scalar Dreg, "vmls_lane", sign_invar_io_lane,
+ [S16; S32; U16; U32; F32];
+ Vmls_lane, [], By_scalar Qreg, "vmlsQ_lane", sign_invar_io_lane,
+ [S16; S32; U16; U32; F32];
+ Vmls_lane, [], Wide_lane, "vmlsl_lane", elts_same_io_lane,
+ [S16; S32; U16; U32];
+ Vmls_lane, [Saturating; Doubling], Wide_lane, "vqdmlsl_lane",
+ elts_same_io_lane, [S16; S32];
+
+ (* Long multiply, lane. *)
+ Vmull_lane, [],
+ Wide_lane, "vmull_lane", elts_same_2_lane, [S16; S32; U16; U32];
+
+ (* Saturating doubling long multiply, lane. *)
+ Vqdmull_lane, [Saturating; Doubling],
+ Wide_lane, "vqdmull_lane", elts_same_2_lane, [S16; S32];
+
+ (* Saturating doubling multiply high, lane. *)
+ Vqdmulh_lane, [Saturating; Halving],
+ By_scalar Qreg, "vqdmulhQ_lane", elts_same_2_lane, [S16; S32];
+ Vqdmulh_lane, [Saturating; Halving],
+ By_scalar Dreg, "vqdmulh_lane", elts_same_2_lane, [S16; S32];
+ Vqdmulh_lane, [Saturating; Halving; Rounding;
+ Instruction_name ["vqrdmulh"]],
+ By_scalar Qreg, "vqRdmulhQ_lane", elts_same_2_lane, [S16; S32];
+ Vqdmulh_lane, [Saturating; Halving; Rounding;
+ Instruction_name ["vqrdmulh"]],
+ By_scalar Dreg, "vqRdmulh_lane", elts_same_2_lane, [S16; S32];
+
+ (* Vector multiply by scalar. *)
+ Vmul_n, [InfoWord;
+ Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
+ Use_operands [| Dreg; Dreg; Corereg |], "vmul_n",
+ sign_invar_2, [S16; S32; U16; U32; F32];
+ Vmul_n, [InfoWord;
+ Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
+ Use_operands [| Qreg; Qreg; Corereg |], "vmulQ_n",
+ sign_invar_2, [S16; S32; U16; U32; F32];
+
+ (* Vector long multiply by scalar. *)
+ Vmull_n, [Instruction_name ["vmull"];
+ Disassembles_as [Use_operands [| Qreg; Dreg; Element_of_dreg |]]],
+ Wide_scalar, "vmull_n",
+ elts_same_2, [S16; S32; U16; U32];
+
+ (* Vector saturating doubling long multiply by scalar. *)
+ Vqdmull_n, [Saturating; Doubling;
+ Disassembles_as [Use_operands [| Qreg; Dreg;
+ Element_of_dreg |]]],
+ Wide_scalar, "vqdmull_n",
+ elts_same_2, [S16; S32];
+
+ (* Vector saturating doubling multiply high by scalar. *)
+ Vqdmulh_n,
+ [Saturating; Halving; InfoWord;
+ Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
+ Use_operands [| Qreg; Qreg; Corereg |],
+ "vqdmulhQ_n", elts_same_2, [S16; S32];
+ Vqdmulh_n,
+ [Saturating; Halving; InfoWord;
+ Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
+ Use_operands [| Dreg; Dreg; Corereg |],
+ "vqdmulh_n", elts_same_2, [S16; S32];
+ Vqdmulh_n,
+ [Saturating; Halving; Rounding; InfoWord;
+ Instruction_name ["vqrdmulh"];
+ Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
+ Use_operands [| Qreg; Qreg; Corereg |],
+ "vqRdmulhQ_n", elts_same_2, [S16; S32];
+ Vqdmulh_n,
+ [Saturating; Halving; Rounding; InfoWord;
+ Instruction_name ["vqrdmulh"];
+ Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
+ Use_operands [| Dreg; Dreg; Corereg |],
+ "vqRdmulh_n", elts_same_2, [S16; S32];
+
+ (* Vector multiply-accumulate by scalar. *)
+ Vmla_n, [InfoWord;
+ Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
+ Use_operands [| Dreg; Dreg; Corereg |], "vmla_n",
+ sign_invar_io, [S16; S32; U16; U32; F32];
+ Vmla_n, [InfoWord;
+ Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
+ Use_operands [| Qreg; Qreg; Corereg |], "vmlaQ_n",
+ sign_invar_io, [S16; S32; U16; U32; F32];
+ Vmla_n, [], Wide_scalar, "vmlal_n", elts_same_io, [S16; S32; U16; U32];
+ Vmla_n, [Saturating; Doubling], Wide_scalar, "vqdmlal_n", elts_same_io,
+ [S16; S32];
+
+ (* Vector multiply subtract by scalar. *)
+ Vmls_n, [InfoWord;
+ Disassembles_as [Use_operands [| Dreg; Dreg; Element_of_dreg |]]],
+ Use_operands [| Dreg; Dreg; Corereg |], "vmls_n",
+ sign_invar_io, [S16; S32; U16; U32; F32];
+ Vmls_n, [InfoWord;
+ Disassembles_as [Use_operands [| Qreg; Qreg; Element_of_dreg |]]],
+ Use_operands [| Qreg; Qreg; Corereg |], "vmlsQ_n",
+ sign_invar_io, [S16; S32; U16; U32; F32];
+ Vmls_n, [], Wide_scalar, "vmlsl_n", elts_same_io, [S16; S32; U16; U32];
+ Vmls_n, [Saturating; Doubling], Wide_scalar, "vqdmlsl_n", elts_same_io,
+ [S16; S32];
+
+ (* Vector extract. *)
+ Vext, [Const_valuator (fun _ -> 0)],
+ Use_operands [| Dreg; Dreg; Dreg; Immed |], "vext", extend,
+ pf_su_8_64;
+ Vext, [Const_valuator (fun _ -> 0)],
+ Use_operands [| Qreg; Qreg; Qreg; Immed |], "vextQ", extend,
+ pf_su_8_64;
+
+ (* Reverse elements. *)
+ Vrev64, [], All (2, Dreg), "vrev64", bits_1, P8 :: P16 :: F32 :: su_8_32;
+ Vrev64, [], All (2, Qreg), "vrev64Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
+ Vrev32, [], All (2, Dreg), "vrev32", bits_1, [P8; P16; S8; U8; S16; U16];
+ Vrev32, [], All (2, Qreg), "vrev32Q", bits_1, [P8; P16; S8; U8; S16; U16];
+ Vrev16, [], All (2, Dreg), "vrev16", bits_1, [P8; S8; U8];
+ Vrev16, [], All (2, Qreg), "vrev16Q", bits_1, [P8; S8; U8];
+
+ (* Bit selection. *)
+ Vbsl,
+ [Instruction_name ["vbsl"; "vbit"; "vbif"];
+ Disassembles_as [Use_operands [| Dreg; Dreg; Dreg |]]],
+ Use_operands [| Dreg; Dreg; Dreg; Dreg |], "vbsl", bit_select,
+ pf_su_8_64;
+ Vbsl,
+ [Instruction_name ["vbsl"; "vbit"; "vbif"];
+ Disassembles_as [Use_operands [| Qreg; Qreg; Qreg |]]],
+ Use_operands [| Qreg; Qreg; Qreg; Qreg |], "vbslQ", bit_select,
+ pf_su_8_64;
+
+ (* Transpose elements. **NOTE** ReturnPtr goes some of the way towards
+ generating good code for intrinsics which return structure types --
+ builtins work well by themselves (and understand that the values being
+ stored on e.g. the stack also reside in registers, so can optimise the
+ stores away entirely if the results are used immediately), but the
+ intrinsics are much less efficient. Perhaps something can be improved by
+ inlining, or by tweaking the ABI used for intrinsics (a special call
+ attribute?).
+ *)
+ Vtrn, [ReturnPtr], Pair_result Dreg, "vtrn", bits_2, pf_su_8_32;
+ Vtrn, [ReturnPtr], Pair_result Qreg, "vtrnQ", bits_2, pf_su_8_32;
+
+ (* Zip elements. *)
+ Vzip, [ReturnPtr], Pair_result Dreg, "vzip", bits_2, pf_su_8_32;
+ Vzip, [ReturnPtr], Pair_result Qreg, "vzipQ", bits_2, pf_su_8_32;
+
+ (* Unzip elements. *)
+ Vuzp, [ReturnPtr], Pair_result Dreg, "vuzp", bits_2, pf_su_8_32;
+ Vuzp, [ReturnPtr], Pair_result Qreg, "vuzpQ", bits_2, pf_su_8_32;
+
+ (* Element/structure loads. VLD1 variants. *)
+ Vldx 1,
+ [Disassembles_as [Use_operands [| VecArray (1, Dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| Dreg; CstPtrTo Corereg |], "vld1", bits_1,
+ pf_su_8_64;
+ Vldx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q", bits_1,
+ pf_su_8_64;
+
+ Vldx_lane 1,
+ [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |],
+ "vld1_lane", bits_3, pf_su_8_32;
+ Vldx_lane 1,
+ [Disassembles_as [Use_operands [| VecArray (1, Dreg);
+ CstPtrTo Corereg |]];
+ Const_valuator (fun _ -> 0)],
+ Use_operands [| Dreg; CstPtrTo Corereg; Dreg; Immed |],
+ "vld1_lane", bits_3, [S64; U64];
+ Vldx_lane 1,
+ [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |],
+ "vld1Q_lane", bits_3, pf_su_8_32;
+ Vldx_lane 1,
+ [Disassembles_as [Use_operands [| VecArray (1, Dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| Qreg; CstPtrTo Corereg; Qreg; Immed |],
+ "vld1Q_lane", bits_3, [S64; U64];
+
+ Vldx_dup 1,
+ [Disassembles_as [Use_operands [| VecArray (1, All_elements_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup",
+ bits_1, pf_su_8_32;
+ Vldx_dup 1,
+ [Disassembles_as [Use_operands [| VecArray (1, Dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| Dreg; CstPtrTo Corereg |], "vld1_dup",
+ bits_1, [S64; U64];
+ Vldx_dup 1,
+ [Disassembles_as [Use_operands [| VecArray (2, All_elements_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup",
+ bits_1, pf_su_8_32;
+ Vldx_dup 1,
+ [Disassembles_as [Use_operands [| VecArray (2, Dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| Qreg; CstPtrTo Corereg |], "vld1Q_dup",
+ bits_1, [S64; U64];
+
+ (* VST1 variants. *)
+ Vstx 1, [Disassembles_as [Use_operands [| VecArray (1, Dreg);
+ PtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; Dreg |], "vst1",
+ store_1, pf_su_8_64;
+ Vstx 1, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
+ PtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; Qreg |], "vst1Q",
+ store_1, pf_su_8_64;
+
+ Vstx_lane 1,
+ [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; Dreg; Immed |],
+ "vst1_lane", store_3, pf_su_8_32;
+ Vstx_lane 1,
+ [Disassembles_as [Use_operands [| VecArray (1, Dreg);
+ CstPtrTo Corereg |]];
+ Const_valuator (fun _ -> 0)],
+ Use_operands [| PtrTo Corereg; Dreg; Immed |],
+ "vst1_lane", store_3, [U64; S64];
+ Vstx_lane 1,
+ [Disassembles_as [Use_operands [| VecArray (1, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; Qreg; Immed |],
+ "vst1Q_lane", store_3, pf_su_8_32;
+ Vstx_lane 1,
+ [Disassembles_as [Use_operands [| VecArray (1, Dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; Qreg; Immed |],
+ "vst1Q_lane", store_3, [U64; S64];
+
+ (* VLD2 variants. *)
+ Vldx 2, [], Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
+ "vld2", bits_1, pf_su_8_32;
+ Vldx 2, [Instruction_name ["vld1"]],
+ Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
+ "vld2", bits_1, [S64; U64];
+ Vldx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
+ CstPtrTo Corereg |];
+ Use_operands [| VecArray (2, Dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg |],
+ "vld2Q", bits_1, pf_su_8_32;
+
+ Vldx_lane 2,
+ [Disassembles_as [Use_operands
+ [| VecArray (2, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg;
+ VecArray (2, Dreg); Immed |],
+ "vld2_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
+ Vldx_lane 2,
+ [Disassembles_as [Use_operands
+ [| VecArray (2, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| VecArray (2, Qreg); CstPtrTo Corereg;
+ VecArray (2, Qreg); Immed |],
+ "vld2Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];
+
+ Vldx_dup 2,
+ [Disassembles_as [Use_operands
+ [| VecArray (2, All_elements_of_dreg); CstPtrTo Corereg |]]],
+ Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
+ "vld2_dup", bits_1, pf_su_8_32;
+ Vldx_dup 2,
+ [Instruction_name ["vld1"]; Disassembles_as [Use_operands
+ [| VecArray (2, Dreg); CstPtrTo Corereg |]]],
+ Use_operands [| VecArray (2, Dreg); CstPtrTo Corereg |],
+ "vld2_dup", bits_1, [S64; U64];
+
+ (* VST2 variants. *)
+ Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
+ PtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2",
+ store_1, pf_su_8_32;
+ Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
+ PtrTo Corereg |]];
+ Instruction_name ["vst1"]],
+ Use_operands [| PtrTo Corereg; VecArray (2, Dreg) |], "vst2",
+ store_1, [S64; U64];
+ Vstx 2, [Disassembles_as [Use_operands [| VecArray (2, Dreg);
+ PtrTo Corereg |];
+ Use_operands [| VecArray (2, Dreg);
+ PtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; VecArray (2, Qreg) |], "vst2Q",
+ store_1, pf_su_8_32;
+
+ Vstx_lane 2,
+ [Disassembles_as [Use_operands
+ [| VecArray (2, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; VecArray (2, Dreg); Immed |], "vst2_lane",
+ store_3, P8 :: P16 :: F32 :: su_8_32;
+ Vstx_lane 2,
+ [Disassembles_as [Use_operands
+ [| VecArray (2, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; VecArray (2, Qreg); Immed |], "vst2Q_lane",
+ store_3, [P16; F32; U16; U32; S16; S32];
+
+ (* VLD3 variants. *)
+ Vldx 3, [], Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
+ "vld3", bits_1, pf_su_8_32;
+ Vldx 3, [Instruction_name ["vld1"]],
+ Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
+ "vld3", bits_1, [S64; U64];
+ Vldx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg);
+ CstPtrTo Corereg |];
+ Use_operands [| VecArray (3, Dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg |],
+ "vld3Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
+
+ Vldx_lane 3,
+ [Disassembles_as [Use_operands
+ [| VecArray (3, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg;
+ VecArray (3, Dreg); Immed |],
+ "vld3_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
+ Vldx_lane 3,
+ [Disassembles_as [Use_operands
+ [| VecArray (3, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| VecArray (3, Qreg); CstPtrTo Corereg;
+ VecArray (3, Qreg); Immed |],
+ "vld3Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];
+
+ Vldx_dup 3,
+ [Disassembles_as [Use_operands
+ [| VecArray (3, All_elements_of_dreg); CstPtrTo Corereg |]]],
+ Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
+ "vld3_dup", bits_1, pf_su_8_32;
+ Vldx_dup 3,
+ [Instruction_name ["vld1"]; Disassembles_as [Use_operands
+ [| VecArray (3, Dreg); CstPtrTo Corereg |]]],
+ Use_operands [| VecArray (3, Dreg); CstPtrTo Corereg |],
+ "vld3_dup", bits_1, [S64; U64];
+
+ (* VST3 variants. *)
+ Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
+ PtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3",
+ store_1, pf_su_8_32;
+ Vstx 3, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
+ PtrTo Corereg |]];
+ Instruction_name ["vst1"]],
+ Use_operands [| PtrTo Corereg; VecArray (3, Dreg) |], "vst3",
+ store_1, [S64; U64];
+ Vstx 3, [Disassembles_as [Use_operands [| VecArray (3, Dreg);
+ PtrTo Corereg |];
+ Use_operands [| VecArray (3, Dreg);
+ PtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; VecArray (3, Qreg) |], "vst3Q",
+ store_1, pf_su_8_32;
+
+ Vstx_lane 3,
+ [Disassembles_as [Use_operands
+ [| VecArray (3, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; VecArray (3, Dreg); Immed |], "vst3_lane",
+ store_3, P8 :: P16 :: F32 :: su_8_32;
+ Vstx_lane 3,
+ [Disassembles_as [Use_operands
+ [| VecArray (3, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; VecArray (3, Qreg); Immed |], "vst3Q_lane",
+ store_3, [P16; F32; U16; U32; S16; S32];
+
+ (* VLD4/VST4 variants. *)
+ Vldx 4, [], Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
+ "vld4", bits_1, pf_su_8_32;
+ Vldx 4, [Instruction_name ["vld1"]],
+ Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
+ "vld4", bits_1, [S64; U64];
+ Vldx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
+ CstPtrTo Corereg |];
+ Use_operands [| VecArray (4, Dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg |],
+ "vld4Q", bits_1, P8 :: P16 :: F32 :: su_8_32;
+
+ Vldx_lane 4,
+ [Disassembles_as [Use_operands
+ [| VecArray (4, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg;
+ VecArray (4, Dreg); Immed |],
+ "vld4_lane", bits_3, P8 :: P16 :: F32 :: su_8_32;
+ Vldx_lane 4,
+ [Disassembles_as [Use_operands
+ [| VecArray (4, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| VecArray (4, Qreg); CstPtrTo Corereg;
+ VecArray (4, Qreg); Immed |],
+ "vld4Q_lane", bits_3, [P16; F32; U16; U32; S16; S32];
+
+ Vldx_dup 4,
+ [Disassembles_as [Use_operands
+ [| VecArray (4, All_elements_of_dreg); CstPtrTo Corereg |]]],
+ Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
+ "vld4_dup", bits_1, pf_su_8_32;
+ Vldx_dup 4,
+ [Instruction_name ["vld1"]; Disassembles_as [Use_operands
+ [| VecArray (4, Dreg); CstPtrTo Corereg |]]],
+ Use_operands [| VecArray (4, Dreg); CstPtrTo Corereg |],
+ "vld4_dup", bits_1, [S64; U64];
+
+ Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
+ PtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4",
+ store_1, pf_su_8_32;
+ Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
+ PtrTo Corereg |]];
+ Instruction_name ["vst1"]],
+ Use_operands [| PtrTo Corereg; VecArray (4, Dreg) |], "vst4",
+ store_1, [S64; U64];
+ Vstx 4, [Disassembles_as [Use_operands [| VecArray (4, Dreg);
+ PtrTo Corereg |];
+ Use_operands [| VecArray (4, Dreg);
+ PtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; VecArray (4, Qreg) |], "vst4Q",
+ store_1, pf_su_8_32;
+
+ Vstx_lane 4,
+ [Disassembles_as [Use_operands
+ [| VecArray (4, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; VecArray (4, Dreg); Immed |], "vst4_lane",
+ store_3, P8 :: P16 :: F32 :: su_8_32;
+ Vstx_lane 4,
+ [Disassembles_as [Use_operands
+ [| VecArray (4, Element_of_dreg);
+ CstPtrTo Corereg |]]],
+ Use_operands [| PtrTo Corereg; VecArray (4, Qreg); Immed |], "vst4Q_lane",
+ store_3, [P16; F32; U16; U32; S16; S32];
+
+ (* Logical operations. And. *)
+ Vand, [], All (3, Dreg), "vand", notype_2, su_8_64;
+ Vand, [], All (3, Qreg), "vandQ", notype_2, su_8_64;
+
+ (* Or. *)
+ Vorr, [], All (3, Dreg), "vorr", notype_2, su_8_64;
+ Vorr, [], All (3, Qreg), "vorrQ", notype_2, su_8_64;
+
+ (* Eor. *)
+ Veor, [], All (3, Dreg), "veor", notype_2, su_8_64;
+ Veor, [], All (3, Qreg), "veorQ", notype_2, su_8_64;
+
+ (* Bic (And-not). *)
+ Vbic, [], All (3, Dreg), "vbic", notype_2, su_8_64;
+ Vbic, [], All (3, Qreg), "vbicQ", notype_2, su_8_64;
+
+ (* Or-not. *)
+ Vorn, [], All (3, Dreg), "vorn", notype_2, su_8_64;
+ Vorn, [], All (3, Qreg), "vornQ", notype_2, su_8_64;
+ ]
+
+let reinterp =
+ let elems = P8 :: P16 :: F32 :: su_8_64 in
+ List.fold_right
+ (fun convto acc ->
+ let types = List.fold_right
+ (fun convfrom acc ->
+ if convfrom <> convto then
+ Cast (convto, convfrom) :: acc
+ else
+ acc)
+ elems
+ []
+ in
+ let dconv = Vreinterp, [No_op], Use_operands [| Dreg; Dreg |],
+ "vreinterpret", conv_1, types
+ and qconv = Vreinterp, [No_op], Use_operands [| Qreg; Qreg |],
+ "vreinterpretQ", conv_1, types in
+ dconv :: qconv :: acc)
+ elems
+ []
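For illustration (not part of the patch): with the eleven element types listed
above, the fold yields one D-register and one Q-register entry per destination
type:

let _ =
  assert (List.length reinterp = 2 * 11)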
+
+(* Output routines. *)
+
+let rec string_of_elt = function
+ S8 -> "s8" | S16 -> "s16" | S32 -> "s32" | S64 -> "s64"
+ | U8 -> "u8" | U16 -> "u16" | U32 -> "u32" | U64 -> "u64"
+ | I8 -> "i8" | I16 -> "i16" | I32 -> "i32" | I64 -> "i64"
+ | B8 -> "8" | B16 -> "16" | B32 -> "32" | B64 -> "64"
+ | F32 -> "f32" | P8 -> "p8" | P16 -> "p16"
+ | Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "_" ^ string_of_elt b
+ | NoElts -> failwith "No elts"
+
+let string_of_elt_dots elt =
+ match elt with
+ Conv (a, b) | Cast (a, b) -> string_of_elt a ^ "." ^ string_of_elt b
+ | _ -> string_of_elt elt
+
+let string_of_vectype vt =
+ let rec name affix = function
+ T_int8x8 -> affix "int8x8"
+ | T_int8x16 -> affix "int8x16"
+ | T_int16x4 -> affix "int16x4"
+ | T_int16x8 -> affix "int16x8"
+ | T_int32x2 -> affix "int32x2"
+ | T_int32x4 -> affix "int32x4"
+ | T_int64x1 -> affix "int64x1"
+ | T_int64x2 -> affix "int64x2"
+ | T_uint8x8 -> affix "uint8x8"
+ | T_uint8x16 -> affix "uint8x16"
+ | T_uint16x4 -> affix "uint16x4"
+ | T_uint16x8 -> affix "uint16x8"
+ | T_uint32x2 -> affix "uint32x2"
+ | T_uint32x4 -> affix "uint32x4"
+ | T_uint64x1 -> affix "uint64x1"
+ | T_uint64x2 -> affix "uint64x2"
+ | T_float32x2 -> affix "float32x2"
+ | T_float32x4 -> affix "float32x4"
+ | T_poly8x8 -> affix "poly8x8"
+ | T_poly8x16 -> affix "poly8x16"
+ | T_poly16x4 -> affix "poly16x4"
+ | T_poly16x8 -> affix "poly16x8"
+ | T_int8 -> affix "int8"
+ | T_int16 -> affix "int16"
+ | T_int32 -> affix "int32"
+ | T_int64 -> affix "int64"
+ | T_uint8 -> affix "uint8"
+ | T_uint16 -> affix "uint16"
+ | T_uint32 -> affix "uint32"
+ | T_uint64 -> affix "uint64"
+ | T_poly8 -> affix "poly8"
+ | T_poly16 -> affix "poly16"
+ | T_float32 -> affix "float32"
+ | T_immediate _ -> "const int"
+ | T_void -> "void"
+ | T_intQI -> "__builtin_neon_qi"
+ | T_intHI -> "__builtin_neon_hi"
+ | T_intSI -> "__builtin_neon_si"
+ | T_intDI -> "__builtin_neon_di"
+ | T_arrayof (num, base) ->
+ let basename = name (fun x -> x) base in
+ affix (Printf.sprintf "%sx%d" basename num)
+ | T_ptrto x ->
+ let basename = name affix x in
+ Printf.sprintf "%s *" basename
+ | T_const x ->
+ let basename = name affix x in
+ Printf.sprintf "const %s" basename
+ in
+ name (fun x -> x ^ "_t") vt
+
+let string_of_inttype = function
+ B_TImode -> "__builtin_neon_ti"
+ | B_EImode -> "__builtin_neon_ei"
+ | B_OImode -> "__builtin_neon_oi"
+ | B_CImode -> "__builtin_neon_ci"
+ | B_XImode -> "__builtin_neon_xi"
+
+let string_of_mode = function
+ V8QI -> "v8qi" | V4HI -> "v4hi" | V2SI -> "v2si" | V2SF -> "v2sf"
+ | DI -> "di" | V16QI -> "v16qi" | V8HI -> "v8hi" | V4SI -> "v4si"
+ | V4SF -> "v4sf" | V2DI -> "v2di" | QI -> "qi" | HI -> "hi" | SI -> "si"
+ | SF -> "sf"
+
+(* Use uppercase chars for letters which form part of the intrinsic name, but
+ should be omitted from the builtin name (the info is passed in an extra
+ argument, instead). *)
+let intrinsic_name name = String.lowercase name
+
+(* Allow the name of the builtin to be overridden by things (e.g. Flipped)
+ found in the features list. *)
+let builtin_name features name =
+ let name = List.fold_right
+ (fun el name ->
+ match el with
+ Flipped x | Builtin_name x -> x
+ | _ -> name)
+ features name in
+ let islower x = let str = String.make 1 x in (String.lowercase str) = str
+ and buf = Buffer.create (String.length name) in
+ String.iter (fun c -> if islower c then Buffer.add_char buf c) name;
+ Buffer.contents buf
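A quick sketch of the naming rule, evaluated as if appended to this file
(illustration only):

let _ =
  (* Uppercase letters mark parts of the intrinsic name dropped from the
     builtin name; a Builtin_name feature overrides the name wholesale. *)
  assert (builtin_name [] "vRhadd" = "vhadd");
  assert (builtin_name [Builtin_name "vdup_n"] "vmov_n" = "vdup_n")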
+
+(* Transform an arity into a list of strings. *)
+let strings_of_arity a =
+ match a with
+ | Arity0 vt -> [string_of_vectype vt]
+ | Arity1 (vt1, vt2) -> [string_of_vectype vt1; string_of_vectype vt2]
+ | Arity2 (vt1, vt2, vt3) -> [string_of_vectype vt1;
+ string_of_vectype vt2;
+ string_of_vectype vt3]
+ | Arity3 (vt1, vt2, vt3, vt4) -> [string_of_vectype vt1;
+ string_of_vectype vt2;
+ string_of_vectype vt3;
+ string_of_vectype vt4]
+ | Arity4 (vt1, vt2, vt3, vt4, vt5) -> [string_of_vectype vt1;
+ string_of_vectype vt2;
+ string_of_vectype vt3;
+ string_of_vectype vt4;
+ string_of_vectype vt5]
+
+(* Suffixes on the end of builtin names that are to be stripped in order
+ to obtain the name used as an instruction. They are only stripped if
+ preceded immediately by an underscore. *)
+let suffixes_to_strip = [ "n"; "lane"; "dup" ]
+
+(* Get the possible names of an instruction corresponding to a "name" from the
+ ops table. This is done by taking the equivalent builtin name and stripping
+ any suffix from the list above, unless the features list contains an
+ Instruction_name entry, in which case those names are used, or a Flipped
+ entry, in which case that name is used. If both such entries are present,
+ whichever comes first in the features list wins. *)
+let get_insn_names features name =
+ let names = try
+ begin
+ match List.find (fun feature -> match feature with
+ Instruction_name _ -> true
+ | Flipped _ -> true
+ | _ -> false) features
+ with
+ Instruction_name names -> names
+ | Flipped name -> [name]
+ | _ -> assert false
+ end
+ with Not_found -> [builtin_name features name]
+ in
+ begin
+ List.map (fun name' ->
+ try
+ let underscore = String.rindex name' '_' in
+ let our_suffix = String.sub name' (underscore + 1)
+ ((String.length name') - underscore - 1)
+ in
+ let rec strip remaining_suffixes =
+ match remaining_suffixes with
+ [] -> name'
+ | s::ss when our_suffix = s -> String.sub name' 0 underscore
+ | _::ss -> strip ss
+ in
+ strip suffixes_to_strip
+ with (Not_found | Invalid_argument _) -> name') names
+ end
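For illustration (not part of the patch), two cases of the rule just described,
assuming the feature constructors used in the ops table above:

let _ =
  (* "_n" is one of the suffixes stripped to recover the mnemonic; an explicit
     Instruction_name wins over the derived builtin name. *)
  assert (get_insn_names [] "vshr_n" = ["vshr"]);
  assert (get_insn_names [Instruction_name ["vrshr"]; Rounding] "vRshr_n"
            = ["vrshr"])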
+
+(* Apply a function to each element of a list and then comma-separate
+ the resulting strings. *)
+let rec commas f elts acc =
+ match elts with
+ [] -> acc
+ | [elt] -> acc ^ (f elt)
+ | elt::elts ->
+ commas f elts (acc ^ (f elt) ^ ", ")
+
+(* Given a list of features and the shape specified in the "ops" table, apply
+ a function to each possible shape that the instruction may have.
+ By default, this is the "shape" entry in "ops". If the features list
+ contains a Disassembles_as entry, the function is applied to each shape in
+ that entry instead and the results are returned in a list. If there is more
+ than one Disassembles_as entry, only the first is used. *)
+let analyze_all_shapes features shape f =
+ try
+ match List.find (fun feature ->
+ match feature with Disassembles_as _ -> true
+ | _ -> false)
+ features with
+ Disassembles_as shapes -> List.map f shapes
+ | _ -> assert false
+ with Not_found -> [f shape]
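A minimal sketch of the dispatch described above (illustration only, not part of
the patch):

let _ =
  let visited features =
    analyze_all_shapes features (All (3, Dreg)) (fun s -> s) in
  (* Without Disassembles_as only the op's own shape is visited; otherwise each
     shape in the first Disassembles_as entry is. *)
  assert (List.length (visited []) = 1);
  assert (List.length
            (visited [Disassembles_as [Use_operands [| Dreg; Qreg |];
                                       Use_operands [| Qreg; Dreg |]]]) = 2)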
+
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/pr-support.c
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/pr-support.c?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/pr-support.c (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/pr-support.c Wed Jul 22 15:36:27 2009
@@ -282,13 +282,25 @@
}
if (op == 0xc8)
{
- /* Pop FPA registers. */
- op = next_unwind_byte (uws);
+/* APPLE LOCAL begin v7 support. Merge from mainline */
+#ifndef __VFP_FP__
+ /* Pop FPA registers. */
+ op = next_unwind_byte (uws);
op = ((op & 0xf0) << 12) | ((op & 0xf) + 1);
- if (_Unwind_VRS_Pop (context, _UVRSC_FPA, op, _UVRSD_FPAX)
- != _UVRSR_OK)
- return _URC_FAILURE;
- continue;
+ if (_Unwind_VRS_Pop (context, _UVRSC_FPA, op, _UVRSD_FPAX)
+ != _UVRSR_OK)
+ return _URC_FAILURE;
+ continue;
+#else
+ /* Pop VFPv3 registers D[16+ssss]-D[16+ssss+cccc] with vldm. */
+ op = next_unwind_byte (uws);
+ op = (((op & 0xf0) + 16) << 12) | ((op & 0xf) + 1);
+ if (_Unwind_VRS_Pop (context, _UVRSC_VFP, op, _UVRSD_DOUBLE)
+ != _UVRSR_OK)
+ return _URC_FAILURE;
+ continue;
+#endif
+/* APPLE LOCAL end v7 support. Merge from mainline */
}
if (op == 0xc9)
{
@@ -379,3 +391,19 @@
return ptr;
}
+/* APPLE LOCAL begin v7 support. Merge from Codesourcery */
+
+/* These two should never be used. */
+
+_Unwind_Ptr
+_Unwind_GetDataRelBase (_Unwind_Context *context __attribute__ ((unused)))
+{
+ abort ();
+}
+
+_Unwind_Ptr
+_Unwind_GetTextRelBase (_Unwind_Context *context __attribute__ ((unused)))
+{
+ abort ();
+}
+/* APPLE LOCAL end v7 support. Merge from Codesourcery */
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/predicates.md
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/predicates.md?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/predicates.md (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/predicates.md Wed Jul 22 15:36:27 2009
@@ -39,6 +39,18 @@
return REGNO (op) < FIRST_PSEUDO_REGISTER;
})
+;; APPLE LOCAL begin v7 support. Merge from mainline
+;; A low register.
+(define_predicate "low_register_operand"
+ (and (match_code "reg")
+ (match_test "REGNO (op) <= LAST_LO_REGNUM")))
+
+;; A low register or const_int.
+(define_predicate "low_reg_or_int_operand"
+ (ior (match_code "const_int")
+ (match_operand 0 "low_register_operand")))
+;; APPLE LOCAL end v7 support. Merge from mainline
+
;; Any core register, or any pseudo. */
(define_predicate "arm_general_register_operand"
(match_code "reg,subreg")
@@ -158,6 +170,16 @@
|| (GET_CODE (op) == REG
&& REGNO (op) >= FIRST_PSEUDO_REGISTER)))")))
+;; APPLE LOCAL begin 6160917
+;; Allow any mem reference through here. By doing this, instead of just
+;; ignoring unhandled cases in SECONDARY_*_RELOAD_CLASS macros we will
+;; get an assertion failure in neon_reload_{in,out}.
+;; We don't use memory_operand because it fails for out-of-range
+;; indexed addressing.
+(define_predicate "neon_reload_mem_operand"
+ (match_code "mem"))
+;; APPLE LOCAL end 6160917
+
;; True for valid operands for the rhs of an floating point insns.
;; Allows regs or certain consts on FPA, just regs for everything else.
(define_predicate "arm_float_rhs_operand"
@@ -215,6 +237,12 @@
(match_code "ashift,ashiftrt,lshiftrt,rotatert"))
(match_test "mode == GET_MODE (op)")))
+;; APPLE LOCAL begin v7 support. Merge from mainline
+;; True for operators that have 16-bit thumb variants. */
+(define_special_predicate "thumb_16bit_operator"
+ (match_code "plus,minus,and,ior,xor"))
+;; APPLE LOCAL end v7 support. Merge from mainline
+
;; True for EQ & NE
(define_special_predicate "equality_operator"
(match_code "eq,ne"))
@@ -440,13 +468,15 @@
;; Thumb predicates
;;
-(define_predicate "thumb_cmp_operand"
+;; APPLE LOCAL v7 support. Merge from mainline
+(define_predicate "thumb1_cmp_operand"
(ior (and (match_code "reg,subreg")
(match_operand 0 "s_register_operand"))
(and (match_code "const_int")
(match_test "((unsigned HOST_WIDE_INT) INTVAL (op)) < 256"))))
-(define_predicate "thumb_cmpneg_operand"
+;; APPLE LOCAL v7 support. Merge from mainline
+(define_predicate "thumb1_cmpneg_operand"
(and (match_code "const_int")
(match_test "INTVAL (op) < 0 && INTVAL (op) > -256")))
@@ -496,6 +526,51 @@
(and (match_code "const_int")
(match_test "((unsigned HOST_WIDE_INT) INTVAL (op)) < 64")))
+;; APPLE LOCAL begin v7 support. Merge from Codesourcery
+
+;; Neon predicates
+
+(define_predicate "const_multiple_of_8_operand"
+ (match_code "const_int")
+{
+ unsigned HOST_WIDE_INT val = INTVAL (op);
+ return (val & 7) == 0;
+})
+
+(define_predicate "imm_for_neon_mov_operand"
+ (match_code "const_vector")
+{
+ return neon_immediate_valid_for_move (op, mode, NULL, NULL);
+})
+
+(define_predicate "imm_for_neon_logic_operand"
+ (match_code "const_vector")
+{
+ return neon_immediate_valid_for_logic (op, mode, 0, NULL, NULL);
+})
+
+(define_predicate "imm_for_neon_inv_logic_operand"
+ (match_code "const_vector")
+{
+ return neon_immediate_valid_for_logic (op, mode, 1, NULL, NULL);
+})
+
+(define_predicate "neon_logic_op2"
+ (ior (match_operand 0 "imm_for_neon_logic_operand")
+ (match_operand 0 "s_register_operand")))
+
+(define_predicate "neon_inv_logic_op2"
+ (ior (match_operand 0 "imm_for_neon_inv_logic_operand")
+ (match_operand 0 "s_register_operand")))
+
+;; TODO: We could check lane numbers more precisely based on the mode.
+(define_predicate "neon_lane_number"
+ (and (match_code "const_int")
+ (match_test "INTVAL (op) >= 0 && INTVAL (op) <= 7")))
+
+
+
+;; APPLE LOCAL end v7 support. Merge from Codesourcery
;; APPLE LOCAL begin ARM pic support
;; Allow local symbols and stub references
(define_predicate "arm_branch_target"
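
For reference, the predicates added in this hunk only gate the integer value of a
CONST_INT operand (or a register number); the stand-alone C sketch below restates
those value tests outside the compiler, purely as illustration.  The function names
are invented here and do not exist in GCC; the real checks run on RTL inside the
compiler via INTVAL (op) / REGNO (op).

/* Illustrative mirrors of the value ranges accepted by the new predicates.
   Not part of the patch; hypothetical helper names.  */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* thumb1_cmp_operand, const_int alternative: unsigned value below 256.  */
static bool thumb1_cmp_const_ok (int64_t val)
{
  return (uint64_t) val < 256;
}

/* thumb1_cmpneg_operand: strictly negative and greater than -256.  */
static bool thumb1_cmpneg_const_ok (int64_t val)
{
  return val < 0 && val > -256;
}

/* const_multiple_of_8_operand: low three bits clear.  */
static bool neon_multiple_of_8_ok (uint64_t val)
{
  return (val & 7) == 0;
}

/* neon_lane_number: lane 0..7; the TODO above notes this coarse bound
   could be tightened per vector mode.  */
static bool neon_lane_ok (int64_t lane)
{
  return lane >= 0 && lane <= 7;
}

int
main (void)
{
  printf ("%d %d %d %d\n",
          thumb1_cmp_const_ok (255),      /* 1: fits the 8-bit compare range */
          thumb1_cmpneg_const_ok (-255),  /* 1: valid negated compare value  */
          neon_multiple_of_8_ok (24),     /* 1: multiple of 8                */
          neon_lane_ok (8));              /* 0: lane out of range            */
  return 0;
}

Built with a C99 compiler, the sample values above print 1 1 1 0.
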
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/t-arm
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/t-arm?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/t-arm (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/t-arm Wed Jul 22 15:36:27 2009
@@ -1,5 +1,6 @@
# Rules common to all arm targets
+# APPLE LOCAL begin v7 support. Merge from Codesourcery
MD_INCLUDES= $(srcdir)/config/arm/arm-tune.md \
$(srcdir)/config/arm/predicates.md \
$(srcdir)/config/arm/arm-generic.md \
@@ -9,8 +10,13 @@
$(srcdir)/config/arm/arm926ejs.md \
$(srcdir)/config/arm/cirrus.md \
$(srcdir)/config/arm/fpa.md \
+ $(srcdir)/config/arm/vec-common.md \
$(srcdir)/config/arm/iwmmxt.md \
- $(srcdir)/config/arm/vfp.md
+ $(srcdir)/config/arm/vfp.md \
+ $(srcdir)/config/arm/neon.md \
+ $(srcdir)/config/arm/thumb2.md \
+ $(srcdir)/config/arm/hwdiv.md
+# APPLE LOCAL end v7 support. Merge from Codesourcery
s-config s-conditions s-flags s-codes s-constants s-emit s-recog s-preds \
s-opinit s-extract s-peep s-attr s-attrtab s-output: $(MD_INCLUDES)
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/t-arm-elf
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/t-arm-elf?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/t-arm-elf (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/t-arm-elf Wed Jul 22 15:36:27 2009
@@ -11,6 +11,18 @@
MULTILIB_EXCEPTIONS =
MULTILIB_MATCHES =
+# APPLE LOCAL begin v7 support. Merge from mainline
+#MULTILIB_OPTIONS += march=armv7
+#MULTILIB_DIRNAMES += thumb2
+#MULTILIB_EXCEPTIONS += march=armv7* marm/*march=armv7*
+#MULTILIB_MATCHES += march?armv7=march?armv7-a
+#MULTILIB_MATCHES += march?armv7=march?armv7-r
+#MULTILIB_MATCHES += march?armv7=march?armv7-m
+#MULTILIB_MATCHES += march?armv7=mcpu?cortex-a8
+#MULTILIB_MATCHES += march?armv7=mcpu?cortex-r4
+#MULTILIB_MATCHES += march?armv7=mcpu?cortex-m3
+# APPLE LOCAL end v7 support. Merge from mainline
+
# MULTILIB_OPTIONS += mcpu=ep9312
# MULTILIB_DIRNAMES += ep9312
# MULTILIB_EXCEPTIONS += *mthumb/*mcpu=ep9312*
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/t-darwin
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/t-darwin?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/t-darwin (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/t-darwin Wed Jul 22 15:36:27 2009
@@ -24,9 +24,19 @@
$(srcdir)/config/arm/_fixunssfdi.c
# APPLE LOCAL end 5316398 improved float/double -> int64 functions
-MULTILIB_OPTIONS = march=armv6k
-MULTILIB_DIRNAMES = v6
-MULTILIB_EXCEPTIONS =
+# APPLE LOCAL begin 6611402 configurable multilib architectures
+ifndef ARM_MULTILIB_ARCHS
+ARM_MULTILIB_ARCHS:=armv5 armv6 armv7
+endif
+
+MULTILIB_OPTIONS:=$(shell echo $(ARM_MULTILIB_ARCHS) | \
+ sed -e s/armv5/march=armv5tej/ \
+ -e s/armv6/march=armv6k/ \
+ -e s/armv7/march=armv7a/)
+MULTILIB_DIRNAMES:=$(shell echo $(ARM_MULTILIB_ARCHS) | sed -e s/arm//g)
+MULTILIB_EXCEPTIONS:=$(shell $(srcdir)/config/arm/gen-darwin-multilib-exceptions.sh $(ARM_MULTILIB_ARCHS))
+# APPLE LOCAL end 6611402 configurable multilib architectures
+
MULTILIB_MATCHES =
TARGET_LIBGCC2_CFLAGS = -fno-inline
Modified: llvm-gcc-4.2/trunk/gcc/config/arm/t-pe
URL: http://llvm.org/viewvc/llvm-project/llvm-gcc-4.2/trunk/gcc/config/arm/t-pe?rev=76781&r1=76780&r2=76781&view=diff
==============================================================================
--- llvm-gcc-4.2/trunk/gcc/config/arm/t-pe (original)
+++ llvm-gcc-4.2/trunk/gcc/config/arm/t-pe Wed Jul 22 15:36:27 2009
@@ -29,4 +29,5 @@
LIBGCC = stmp-multilib
INSTALL_LIBGCC = install-multilib
-TARGET_LIBGCC2_CFLAGS =
\ No newline at end of file
+# APPLE LOCAL v7 support
+TARGET_LIBGCC2_CFLAGS =